1 // $Id: AnselToUnicode.java,v 1.5 2008/10/17 06:47:06 haschart Exp $
\r
3 * Copyright (C) 2002 Bas Peters (mail@bpeters.com)
\r
5 * This file is part of MARC4J
\r
7 * MARC4J is free software; you can redistribute it and/or
\r
8 * modify it under the terms of the GNU Lesser General Public
\r
9 * License as published by the Free Software Foundation; either
\r
10 * version 2.1 of the License, or (at your option) any later version.
\r
12 * MARC4J is distributed in the hope that it will be useful,
\r
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
\r
15 * Lesser General Public License for more details.
\r
17 * You should have received a copy of the GNU Lesser General Public
\r
18 * License along with MARC4J; if not, write to the Free Software
\r
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
\r
21 package org.marc4j.converter.impl;
\r
23 import java.io.InputStream;
\r
24 import java.lang.reflect.Constructor;
\r
25 import java.util.Vector;
\r
27 import org.marc4j.ErrorHandler;
\r
28 import org.marc4j.MarcException;
\r
29 import org.marc4j.converter.CharConverter;
\r
33 * A utility to convert MARC-8 data to non-precomposed UCS/Unicode.
\r
37 * The MARC-8 to Unicode mapping used is the version with the March 2005
\r
41 * @author Bas Peters
\r
42 * @author Corey Keith
\r
43 * @version $Revision: 1.5 $
\r
45 public class AnselToUnicode extends CharConverter {
\r
47 @SuppressWarnings("rawtypes")
\r
48 class Queue extends Vector {
\r
53 private static final long serialVersionUID = 7414878465947143461L;
\r
56 * Puts an item into the queue.
\r
59 * the item to be put into the queue.
\r
61 @SuppressWarnings("unchecked")
\r
62 public Object put(Object item) {
\r
69 * Gets an item from the front of the queue.
\r
71 public Object get() {
\r
73 @SuppressWarnings("unused")
\r
83 * Peeks at the front of the queue.
\r
85 public Object peek() {
\r
86 @SuppressWarnings("unused")
\r
89 return elementAt(0);
\r
93 * Returns true if the queue is empty.
\r
95 public boolean empty() {
\r
100 class CodeTracker {
\r
109 public String toString() {
\r
110 return "Offset: " + offset + " G0: " + Integer.toHexString(g0)
\r
111 + " G1: " + Integer.toHexString(g1) + " Multibyte: "
\r
116 protected CodeTableInterface ct;
\r
118 protected boolean loadedMultibyte = false;
\r
120 protected ErrorHandler errorList = null;
\r
/**
 * Creates a new instance and loads the MARC4J supplied conversion tables
 * based on the official LC tables.
 *
 * <p>The table comes from {@code loadGeneratedTable(false)}, and no
 * {@link ErrorHandler} is installed: {@code errorList} keeps its
 * {@code null} default.
 */
public AnselToUnicode() {
    ct = loadGeneratedTable(false);
}
\r
/**
 * Creates a new instance and loads the MARC4J supplied conversion tables
 * based on the official LC tables.
 *
 * <p>No {@link ErrorHandler} is installed: {@code errorList} keeps its
 * {@code null} default.
 *
 * @param loadMultibyte forwarded unchanged to {@code loadGeneratedTable};
 *        presumably {@code true} requests the mapping that includes the
 *        multibyte characters up front (confirm against that method)
 */
public AnselToUnicode(boolean loadMultibyte) {
    ct = loadGeneratedTable(loadMultibyte);
}
\r
/**
 * Creates a new instance and loads the MARC4J supplied conversion tables
 * based on the official LC tables.
 *
 * <p>The table comes from {@code loadGeneratedTable(false)}.
 *
 * @param errorList handler stored in {@code this.errorList}; when it is
 *        non-null the converter reports problems it finds in the data
 *        through {@code errorList.addError(...)}
 */
public AnselToUnicode(ErrorHandler errorList) {
    ct = loadGeneratedTable(false);
    this.errorList = errorList;
}
\r
/**
 * Creates a new instance and loads the MARC4J supplied conversion tables
 * based on the official LC tables.
 *
 * @param errorList handler stored in {@code this.errorList}; when it is
 *        non-null the converter reports problems it finds in the data
 *        through {@code errorList.addError(...)}
 * @param loadMultibyte forwarded unchanged to {@code loadGeneratedTable};
 *        presumably {@code true} requests the mapping that includes the
 *        multibyte characters up front (confirm against that method)
 */
public AnselToUnicode(ErrorHandler errorList, boolean loadMultibyte) {
    ct = loadGeneratedTable(loadMultibyte);
    this.errorList = errorList;
}
\r
163 @SuppressWarnings({ "rawtypes", "unchecked" })
\r
164 private CodeTableInterface loadGeneratedTable(boolean loadMultibyte)
\r
168 Class generated = Class.forName("org.marc4j.converter.impl.CodeTableGenerated");
\r
169 Constructor cons = generated.getConstructor();
\r
170 Object ct = cons.newInstance();
\r
171 loadedMultibyte = true;
\r
172 return((CodeTableInterface)ct);
\r
174 catch (Exception e)
\r
176 CodeTableInterface ct;
\r
179 ct = new CodeTable(AnselToUnicode.class.getResourceAsStream("resources/codetables.xml"));
\r
183 ct = new CodeTable(AnselToUnicode.class.getResourceAsStream("resources/codetablesnocjk.xml"));
\r
185 loadedMultibyte = loadMultibyte;
\r
/**
 * Constructs an instance with the specified pathname.
 *
 * <p>Use this constructor to create an instance with a customized code table
 * mapping. The mapping file should follow the structure of LC's XML MARC-8
 * to Unicode mapping (see:
 * http://www.loc.gov/marc/specifications/codetables.xml).
 *
 * @param pathname location of the mapping file, passed to the
 *        {@code CodeTable} constructor
 */
public AnselToUnicode(String pathname) {
    ct = new CodeTable(pathname);
    // Flag the table as already containing the multibyte mapping so the
    // "!loadedMultibyte" branch in checkMode is never taken for a
    // caller-supplied table.
    loadedMultibyte = true;
}
\r
/**
 * Constructs an instance with the specified input stream.
 *
 * <p>Use this constructor to create an instance with a customized code table
 * mapping. The mapping file should follow the structure of LC's XML MARC-8
 * to Unicode mapping (see:
 * http://www.loc.gov/marc/specifications/codetables.xml).
 *
 * @param in stream supplying the mapping, passed to the {@code CodeTable}
 *        constructor
 */
public AnselToUnicode(InputStream in) {
    ct = new CodeTable(in);
    // Flag the table as already containing the multibyte mapping so the
    // "!loadedMultibyte" branch in checkMode is never taken for a
    // caller-supplied table.
    loadedMultibyte = true;
}
\r
220 * Loads the entire mapping (including multibyte characters) from the Library
\r
223 private void loadMultibyte() {
\r
224 ct = new CodeTable(getClass().getResourceAsStream(
\r
225 "resources/codetables.xml"));
\r
228 private void checkMode(char[] data, CodeTracker cdt) {
\r
231 @SuppressWarnings("unused")
\r
233 while (cdt.offset + extra + extra2< data.length && isEscape(data[cdt.offset])) {
\r
234 switch (data[cdt.offset + 1 + extra]) {
\r
237 set_cdt(cdt, 0, data, 2 + extra, false);
\r
241 set_cdt(cdt, 1, data, 2 + extra, false);
\r
244 if (!loadedMultibyte) {
\r
246 loadedMultibyte = true;
\r
248 switch (data[cdt.offset + 2 + extra + extra2]) {
\r
251 set_cdt(cdt, 1, data, 3 + extra + extra2, true);
\r
254 set_cdt(cdt, 0, data, 3 + extra + extra2, true);
\r
257 cdt.g0 = data[cdt.offset + 2 + extra + extra2];
\r
258 cdt.offset += 3 + extra + extra2;
\r
259 cdt.multibyte = true;
\r
262 // space found in escape code: look ahead and try to proceed
\r
266 // unknown code character found: discard escape sequence and return
\r
268 if (errorList != null)
\r
270 errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character.");
\r
274 throw new MarcException("Unknown character set code found following escape character.");
\r
282 cdt.g0 = data[cdt.offset + 1 + extra];
\r
283 cdt.offset += 2 + extra;
\r
284 cdt.multibyte = false;
\r
288 cdt.offset += 2 + extra;
\r
289 cdt.multibyte = false;
\r
292 // space found in escape code: look ahead and try to proceed
\r
293 if (errorList == null)
\r
295 throw new MarcException("Extraneous space character found within MARC8 character set escape sequence");
\r
300 // unknown code character found: discard escape sequence and return
\r
302 if (errorList != null)
\r
304 errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character.");
\r
308 throw new MarcException("Unknown character set code found following escape character.");
\r
313 if (errorList != null && ( extra != 0 || extra2 != 0))
\r
315 errorList.addError(ErrorHandler.ERROR_TYPO, "" + (extra+extra2) + " extraneous space characters found within MARC8 character set escape sequence");
\r
319 private void set_cdt(CodeTracker cdt, int g0_or_g1, char[] data, int addnlOffset, boolean multibyte)
\r
321 if (data[cdt.offset + addnlOffset] == '!' && data[cdt.offset + addnlOffset + 1] == 'E')
\r
325 else if (data[cdt.offset + addnlOffset] == ' ')
\r
327 if (errorList != null)
\r
329 errorList.addError(ErrorHandler.ERROR_TYPO, "Extraneous space character found within MARC8 character set escape sequence. Skipping over space.");
\r
333 throw new MarcException("Extraneous space character found within MARC8 character set escape sequence");
\r
337 else if ("(,)-$!".indexOf(data[cdt.offset + addnlOffset]) != -1)
\r
339 if (errorList != null)
\r
341 errorList.addError(ErrorHandler.MINOR_ERROR, "Extraneaous intermediate character found following escape character. Discarding intermediate character.");
\r
345 throw new MarcException("Extraneaous intermediate character found following escape character.");
\r
349 if ("34BE1NQS2".indexOf(data[cdt.offset + addnlOffset]) == -1)
\r
352 cdt.multibyte = false;
\r
353 if (errorList != null)
\r
355 errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character.");
\r
359 throw new MarcException("Unknown character set code found following escape character.");
\r
362 else // All is well, proceed normally
\r
364 if (g0_or_g1 == 0) cdt.g0 = data[cdt.offset + addnlOffset];
\r
365 else cdt.g1 = data[cdt.offset + addnlOffset];
\r
366 cdt.offset += 1 + addnlOffset;
\r
367 cdt.multibyte = multibyte;
\r
372 * Converts MARC-8 data to UCS/Unicode.
\r
375 * @param data - the MARC-8 data in an array of char
\r
376 * @return String - the UCS/Unicode data
\r
378 public String convert(char data[])
\r
380 StringBuffer sb = new StringBuffer();
\r
381 int len = data.length;
\r
383 CodeTracker cdt = new CodeTracker();
\r
387 cdt.multibyte = false;
\r
391 checkMode(data, cdt);
\r
393 Queue diacritics = new Queue();
\r
395 while (cdt.offset < data.length)
\r
397 if (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1)
\r
398 && hasNext(cdt.offset, len))
\r
401 while (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1)
\r
402 && hasNext(cdt.offset, len))
\r
404 char c = getChar(data[cdt.offset], cdt.g0, cdt.g1);
\r
405 if (c != 0) diacritics.put(new Character(c));
\r
407 checkMode(data, cdt);
\r
410 char c2 = getChar(data[cdt.offset], cdt.g0, cdt.g1);
\r
412 checkMode(data, cdt);
\r
413 if (c2 != 0) sb.append(c2);
\r
415 while (!diacritics.isEmpty())
\r
417 char c1 = ((Character) diacritics.get()).charValue();
\r
422 else if (cdt.multibyte)
\r
424 if (data[cdt.offset]== 0x20)
\r
426 // if a 0x20 byte occurs amidst a sequence of multibyte characters
\r
427 // skip over it and output a space.
\r
428 // Hmmm. If the following line is present it seems to output two spaces
\r
429 // when a space occurs in multibytes chars, without it one seems to be output.
\r
430 // sb.append(getChar(data[cdt.offset], cdt.g0, cdt.g1));
\r
433 else if (cdt.offset + 3 <= data.length && (errorList == null || data[cdt.offset+1]!= 0x20 && data[cdt.offset+2]!= 0x20))
\r
435 char c = getMBChar(makeMultibyte(data[cdt.offset], data[cdt.offset+1], data[cdt.offset+2]));
\r
436 if (errorList == null || c != 0)
\r
441 else if (cdt.offset + 6 <= data.length && data[cdt.offset+4]!= 0x20 && data[cdt.offset+5]!= 0x20 &&
\r
442 getMBChar(makeMultibyte(data[cdt.offset+3], data[cdt.offset+4], data[cdt.offset+5])) != 0)
\r
444 if (errorList != null)
\r
446 errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, Discarding bad character and continuing reading Multibyte characters");
\r
451 else if (cdt.offset + 4 <= data.length && data[cdt.offset] > 0x7f &&
\r
452 getMBChar(makeMultibyte(data[cdt.offset+1], data[cdt.offset+2], data[cdt.offset+3])) != 0)
\r
454 if (errorList != null)
\r
456 errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous character in MARC8 multibyte character, Copying bad character and continuing reading Multibyte characters");
\r
457 sb.append(getChar(data[cdt.offset], 0x42, 0x45));
\r
463 if (errorList != null)
\r
465 errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set");
\r
467 cdt.multibyte = false;
\r
472 else if (errorList != null && cdt.offset + 4 <= data.length && ( data[cdt.offset+1] == 0x20 || data[cdt.offset+2]== 0x20))
\r
474 int multiByte = makeMultibyte( data[cdt.offset], ((data[cdt.offset+1] != 0x20)? data[cdt.offset+1] : data[cdt.offset+2]), data[cdt.offset+3]);
\r
475 char c = getMBChar(multiByte);
\r
478 if (errorList != null)
\r
480 errorList.addError(ErrorHandler.ERROR_TYPO, "Extraneous space found within MARC8 multibyte character");
\r
488 if (errorList != null)
\r
490 errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set");
\r
492 cdt.multibyte = false;
\r
497 else if (cdt.offset + 3 > data.length)
\r
499 if (errorList != null)
\r
501 errorList.addError(ErrorHandler.MINOR_ERROR, "Partial MARC8 multibyte character, inserting change to default character set");
\r
502 cdt.multibyte = false;
\r
506 // if a field ends with an incomplete encoding of a multibyte character
\r
507 // simply discard that final partial character.
\r
516 char c = getChar(data[cdt.offset], cdt.g0, cdt.g1);
\r
517 if (c != 0) sb.append(c);
\r
520 String val = "0000"+Integer.toHexString((int)(data[cdt.offset]));
\r
521 sb.append("<U+"+ (val.substring(val.length()-4, val.length()))+ ">" );
\r
525 if (hasNext(cdt.offset, len))
\r
527 checkMode(data, cdt);
\r
530 return sb.toString();
\r
533 @SuppressWarnings("unused")
\r
534 private int makeMultibyte(char[] data) {
\r
535 int[] chars = new int[3];
\r
536 chars[0] = data[0] << 16;
\r
537 chars[1] = data[1] << 8;
\r
538 chars[2] = data[2];
\r
539 return chars[0] | chars[1] | chars[2];
\r
542 public int makeMultibyte(char c1, char c2, char c3)
\r
544 int[] chars = new int[3];
\r
545 chars[0] = c1 << 16;
\r
546 chars[1] = c2 << 8;
\r
548 return chars[0] | chars[1] | chars[2];
\r
551 private char getChar(int ch, int g0, int g1) {
\r
553 return ct.getChar(ch, g0);
\r
555 return ct.getChar(ch, g1);
\r
558 public char getMBChar(int ch) {
\r
559 return ct.getChar(ch, 0x31);
\r
562 private static boolean hasNext(int pos, int len) {
\r
563 if (pos < (len - 1))
\r
568 private static boolean isEscape(int i) {
\r