2 * Copyright (C) 2004 Bas Peters
\r
4 * This file is part of MARC4J
\r
6 * MARC4J is free software; you can redistribute it and/or
\r
7 * modify it under the terms of the GNU Lesser General Public
\r
8 * License as published by the Free Software Foundation; either
\r
9 * version 2.1 of the License, or (at your option) any later version.
\r
11 * MARC4J is distributed in the hope that it will be useful,
\r
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
\r
14 * Lesser General Public License for more details.
\r
16 * You should have received a copy of the GNU Lesser General Public
\r
17 * License along with MARC4J; if not, write to the Free Software
\r
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
\r
22 import java.io.BufferedInputStream;
\r
23 import java.io.ByteArrayInputStream;
\r
24 import java.io.DataInputStream;
\r
25 import java.io.EOFException;
\r
26 import java.io.IOException;
\r
27 import java.io.InputStream;
\r
28 import java.io.InputStreamReader;
\r
29 import java.io.UnsupportedEncodingException;
\r
30 import java.util.HashMap;
\r
31 import java.util.Iterator;
\r
32 import java.util.List;
\r
33 import java.util.regex.Matcher;
\r
34 import java.util.regex.Pattern;
\r
36 import org.marc4j.Constants;
\r
37 import org.marc4j.MarcException;
\r
38 import org.marc4j.MarcReader;
\r
39 import org.marc4j.converter.CharConverter;
\r
40 import org.marc4j.converter.impl.AnselToUnicode;
\r
41 import org.marc4j.converter.impl.Iso5426ToUnicode;
\r
42 import org.marc4j.marc.ControlField;
\r
43 import org.marc4j.marc.DataField;
\r
44 import org.marc4j.marc.Leader;
\r
45 import org.marc4j.marc.MarcFactory;
\r
46 import org.marc4j.marc.Record;
\r
47 import org.marc4j.marc.Subfield;
\r
48 import org.marc4j.marc.VariableField;
\r
49 import org.marc4j.marc.impl.Verifier;
\r
51 import com.ibm.icu.text.Normalizer;
\r
54 * An iterator over a collection of MARC records in ISO 2709 format, that is designed
\r
55 * to be able to handle MARC records that have errors in their structure or their encoding.
\r
56 * If the permissive flag is set in the call to the constructor, or if an ErrorHandler object
\r
57 * is passed in as a parameter to the constructor, this reader will do its best to detect
\r
58 * and recover from a number of structural or encoding errors that can occur in a MARC record.
\r
59 * Note that if this reader is not set to read permissively, it will operate pretty much
\r
60 * identically to the MarcStreamReader class.
\r
62 * Note that no attempt is made to validate the contents of the record at a semantic level.
\r
63 * This reader does not know and does not care whether the record has a 245 field, or if the
\r
64 * 008 field is the right length, but if the record claims to be UTF-8 or MARC8 encoded and
\r
65 * you are seeing gibberish in the output, or if the reader is throwing an exception in trying
\r
66 * to read a record, then this reader may be able to produce a usable record from the bad
\r
69 * The ability to directly translate the record to UTF-8 as it is being read in is useful in
\r
70 * cases where the UTF-8 version of the record will be used directly by the program that is
\r
71 * reading the MARC data, for instance if the marc records are to be indexed into a SOLR search
\r
72 * engine. Previously the MARC record could only be translated to UTF-8 as it was being written
\r
73 * out via a MarcStreamWriter or a MarcXmlWriter.
\r
79 * InputStream input = new FileInputStream("file.mrc");
\r
80 * MarcReader reader = new MarcPermissiveStreamReader(input, true, true);
\r
81 * while (reader.hasNext()) {
\r
82 * Record record = reader.next();
\r
88 * Check the {@link org.marc4j.marc} package for examples about the use of
\r
89 * the {@link org.marc4j.marc.Record} object model.
\r
90 * Check the file org.marc4j.samples.PermissiveReaderExample.java for an
\r
91 * example about using the MarcPermissiveStreamReader in conjunction with the
\r
92 * ErrorHandler class to report errors encountered while processing records.
\r
96 * When no encoding is given as a constructor argument the parser tries to
\r
97 * resolve the encoding by looking at the character coding scheme (leader
\r
98 * position 9) in MARC21 records. For UNIMARC records this position is not
\r
99 * defined. If the reader is operating in permissive mode and no encoding
\r
100 * is given as a constructor argument the reader will look at the leader,
\r
101 * and also at the data of the record to determine to the best of its ability
\r
102 * what character encoding scheme has been used to encode the data in a
\r
103 * particular MARC record.
\r
107 * @author Robert Haschart
\r
108 * @version $Revision: 1.3 $
\r
111 public class MarcPermissiveStreamReader implements MarcReader {
\r
// Stream the MARC records are read from; every constructor shown wraps the caller's
// InputStream in a BufferedInputStream and then a DataInputStream.
113 private DataInputStream input = null;
\r
// Record currently being assembled; next() replaces it with a fresh instance on each call.
115 private Record record;
\r
// Factory used to create the record, leader, field and subfield objects.
117 private MarcFactory factory;
\r
// Character encoding selected for the record being parsed; parseRecord() reassigns it per record.
119 private String encoding = "ISO8859_1";
\r
121 // This represents the expected encoding of the data when a
\r
122 // MARC record does not have a 'a' in character 9 of the leader.
\r
123 private String defaultEncoding = "ISO8859_1";
\r
// When true, next() sets the character coding scheme in the returned record's leader to 'a'.
125 private boolean convertToUTF8 = false;
\r
// Enables the error-detection and recovery branches guarded by "permissive" in the parsing code.
127 private boolean permissive = false;
\r
// Not referenced in the part of the class visible here; presumably the lazily created
// MARC-8 (ANSEL) converter -- TODO confirm against the rest of the file.
129 private CharConverter converterAnsel = null;
\r
// Not referenced in the part of the class visible here; presumably the lazily created
// UNIMARC (ISO 5426) converter -- TODO confirm against the rest of the file.
131 private CharConverter converterUnimarc = null;
\r
133 // These are used to algorithmically determine what encoding scheme was
\r
134 // used to encode the data in the Marc record
\r
// parseRecord() resets all three to the empty string before each record is processed.
135 private String conversionCheck1 = null;
\r
136 private String conversionCheck2 = null;
\r
137 private String conversionCheck3 = null;
\r
// Receives the problems reported while decoding; may be null (one constructor stores the
// caller's handler as-is, and the boolean constructors only create one when permissive).
139 private ErrorHandler errors;
\r
142 * Constructs an instance with the specified input stream with possible additional functionality
\r
143 * being enabled by setting permissive and/or convertToUTF8 to true.
\r
145 * If permissive and convertToUTF8 are both set to false, it functions almost identically to the
\r
146 * MarcStreamReader class.
\r
public MarcPermissiveStreamReader(InputStream input, boolean permissive, boolean convertToUTF8) {
    // Buffer the caller's stream and expose it through DataInputStream for readFully().
    this.input = new DataInputStream(new BufferedInputStream(input));
    this.factory = MarcFactory.newInstance();

    this.permissive = permissive;
    this.convertToUTF8 = convertToUTF8;

    // Only a permissive reader tracks errors and guesses the encoding of each record;
    // a strict reader keeps the field default encoding and has no error handler.
    this.errors = permissive ? new ErrorHandler() : null;
    if (permissive) {
        this.defaultEncoding = "BESTGUESS";
    }
}
\r
162 * Constructs an instance with the specified input stream with possible additional functionality
\r
163 * being enabled by passing in an ErrorHandler object and/or setting convertToUTF8 to true.
\r
165 * If errors and convertToUTF8 are both set to false, it functions almost identically to the
\r
166 * MarcStreamReader class.
\r
168 * If an ErrorHandler object is passed in, that object will be used to log and track any errors
\r
169 * in the records as the records are decoded. After the next() function returns, you can query
\r
170 * to determine whether any errors were detected in the decoding process.
\r
172 * See the file org.marc4j.samples.PermissiveReaderExample.java to see how this can be done.
\r
174 public MarcPermissiveStreamReader(InputStream input, ErrorHandler errors, boolean convertToUTF8 )
\r
176 if (errors != null)
\r
179 defaultEncoding = "BESTGUESS";
\r
181 this.input = new DataInputStream(new BufferedInputStream(input));
\r
182 factory = MarcFactory.newInstance();
\r
183 this.convertToUTF8 = convertToUTF8;
\r
184 this.errors = errors;
\r
188 * Constructs an instance with the specified input stream with possible additional functionality
\r
189 * being enabled by setting permissive and/or convertToUTF8 to true.
\r
191 * If permissive and convertToUTF8 are both set to false, it functions almost identically to the
\r
192 * MarcStreamReader class.
\r
194 * The parameter defaultEncoding is used to specify the character encoding that is used in the records
\r
195 * that will be read from the input stream. If permissive is set to true, you can specify "BESTGUESS"
\r
196 * as the default encoding, and the reader will attempt to determine the character encoding used in the
\r
197 * records being read from the input stream. This is especially useful if you are working with records
\r
198 * downloaded from an external source and the encoding is either unknown or the encoding is different from
\r
199 * what the records claim to be.
\r
public MarcPermissiveStreamReader(InputStream input, boolean permissive, boolean convertToUTF8, String defaultEncoding)
{
    // Buffer the caller's stream and expose it through DataInputStream for readFully().
    this.input = new DataInputStream(new BufferedInputStream(input));
    this.factory = MarcFactory.newInstance();

    this.permissive = permissive;
    this.convertToUTF8 = convertToUTF8;
    // Unlike the three-argument form, the caller's encoding is used verbatim
    // (it may be "BESTGUESS" to request per-record detection).
    this.defaultEncoding = defaultEncoding;

    // An error handler exists only when the reader is permissive.
    this.errors = permissive ? new ErrorHandler() : null;
}
\r
213 * Constructs an instance with the specified input stream with possible additional functionality
\r
214 * being enabled by setting permissive and/or convertToUTF8 to true.
\r
216 * If errors and convertToUTF8 are both set to false, it functions almost identically to the
\r
217 * MarcStreamReader class.
\r
219 * The parameter defaultEncoding is used to specify the character encoding that is used in the records
\r
220 * that will be read from the input stream. If permissive is set to true, you can specify "BESTGUESS"
\r
221 * as the default encoding, and the reader will attempt to determine the character encoding used in the
\r
222 * records being read from the input stream. This is especially useful if you are working with records
\r
223 * downloaded from an external source and the encoding is either unknown or the encoding is different from
\r
224 * what the records claim to be.
\r
226 * If an ErrorHandler object is passed in, that object will be used to log and track any errors
\r
227 * in the records as the records are decoded. After the next() function returns, you can query
\r
228 * to determine whether any errors were detected in the decoding process.
\r
230 * See the file org.marc4j.samples.PermissiveReaderExample.java to see how this can be done.
\r
public MarcPermissiveStreamReader(InputStream input, ErrorHandler errors, boolean convertToUTF8, String defaultEncoding)
{
    // This constructor always produces a permissive reader.
    this.permissive = true;
    this.input = new DataInputStream(new BufferedInputStream(input));
    factory = MarcFactory.newInstance();
    this.convertToUTF8 = convertToUTF8;
    this.defaultEncoding = defaultEncoding;

    // The permissive code paths report through the handler without checking it for null,
    // so a null argument used to surface as a NullPointerException on the first problem
    // found in a record. Fall back to a private handler instead; a caller that passed null
    // cannot query it, but reading proceeds exactly as it would with a real handler.
    this.errors = (errors != null) ? errors : new ErrorHandler();
}
\r
243 * Returns true if the iteration has more records, false otherwise.
\r
245 public boolean hasNext()
\r
248 if (input.available() == 0)
\r
250 } catch (IOException e) {
\r
251 throw new MarcException(e.getMessage(), e);
\r
257 * Returns the next record in the iteration.
\r
259 * @return Record - the record object
\r
261 public Record next()
\r
263 record = factory.newRecord();
\r
264 if (errors != null) errors.reset();
\r
267 byte[] byteArray = new byte[24];
\r
268 input.readFully(byteArray);
\r
270 int recordLength = parseRecordLength(byteArray);
\r
271 byte[] recordBuf = new byte[recordLength - 24];
\r
274 input.mark(recordLength * 2);
\r
275 input.readFully(recordBuf);
\r
276 if (recordBuf[recordBuf.length-1] != Constants.RT)
\r
278 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
\r
279 "Record terminator character not found at end of record length");
\r
280 recordBuf = rereadPermissively(input, recordBuf, recordLength);
\r
281 recordLength = recordBuf.length + 24;
\r
286 input.readFully(recordBuf);
\r
288 String tmp = new String(recordBuf);
\r
289 parseRecord(record, byteArray, recordBuf, recordLength);
\r
291 if (this.convertToUTF8)
\r
293 Leader l = record.getLeader();
\r
294 l.setCharCodingScheme('a');
\r
295 record.setLeader(l);
\r
299 catch (EOFException e) {
\r
300 throw new MarcException("Premature end of file encountered", e);
\r
302 catch (IOException e) {
\r
303 throw new MarcException("an error occured reading input", e);
\r
// Called by next() when the last byte of recordBuf is not the record terminator.
// Looks for the terminator inside the bytes already read (stated length too long) or reads
// ahead in the stream for it (stated length too short), logs what it found through errors,
// and re-reads the record body into a buffer of the corrected size.
// NOTE(review): several short lines of this method are not visible in this copy of the
// file (the read-ahead loop body, any input.reset() calls and the final return), so the
// comments below describe only the statements that are shown.
307 private byte[] rereadPermissively(DataInputStream input, byte[] recordBuf, int recordLength) throws IOException
\r
// Zero-based index of the first record terminator in the body, or -1 if there is none.
309 int loc = arrayContainsAt(recordBuf, Constants.RT);
\r
310 if (loc != -1) // stated record length is too long
\r
312 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
\r
313 "Record terminator appears before stated record length, using shorter record");
\r
// NOTE(review): loc is an index, so a buffer of (recordLength - 24) == loc bytes ends just
// before the terminator itself -- confirm whether the terminator is meant to be included.
314 recordLength = loc + 24;
\r
316 recordBuf = new byte[recordLength - 24];
\r
317 input.readFully(recordBuf);
\r
319 else // stated record length is too short read ahead
\r
321 loc = recordLength - 24;
\r
// The search stops at a terminator, at end of stream (-1), or once loc reaches recordLength + 100.
327 } while (loc < recordLength + 100 && c != Constants.RT && c != -1);
\r
329 if (c == Constants.RT)
\r
331 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
\r
332 "Record terminator appears after stated record length, reading extra bytes");
\r
333 recordLength = loc + 24;
\r
335 recordBuf = new byte[recordLength - 24];
\r
336 input.readFully(recordBuf);
\r
340 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
\r
341 "No Record terminator found, end of file reached, Terminator appended");
\r
342 recordLength = loc + 24;
\r
// One extra byte is allocated so that a terminator can be written into the last slot.
344 recordBuf = new byte[recordLength - 24 + 1];
\r
345 input.readFully(recordBuf);
\r
346 recordBuf[recordBuf.length-1] = Constants.RT;
\r
350 errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
\r
351 "No Record terminator found within 100 byts of stated location, giving up.");
\r
// Parses one record into the supplied Record object.
//   record       - the object that receives the leader and the variable fields
//   byteArray    - the 24 leader bytes (the variable is later reused as a per-field buffer)
//   recordBuf    - everything after the leader: directory, fields and record terminator
//   recordLength - total record length, stored in the new leader
// The visible steps are: parse the leader (with a recovery attempt when that fails), choose
// the character encoding for this record, read the 12-byte directory entries, then read each
// field and add it to the record. Problems are reported through errors and, where the code
// cannot work around them, thrown as MarcException.
// NOTE(review): many short lines of this method (braces, try, case labels, small assignments
// and declarations) are not visible in this copy of the file; the comments below describe
// only the statements that are shown.
357 private void parseRecord(Record record, byte[] byteArray, byte[] recordBuf, int recordLength)
\r
360 ldr = factory.newLeader();
\r
361 ldr.setRecordLength(recordLength);
\r
362 int directoryLength=0;
\r
363 // These variables are used when the permissive reader is trying to make its best guess
\r
364 // as to what character encoding is actually used in the record being processed.
\r
365 conversionCheck1 = "";
\r
366 conversionCheck2 = "";
\r
367 conversionCheck3 = "";
\r
370 parseLeader(ldr, byteArray);
\r
// The directory sits between the leader (24 bytes) and the base address of data, and is
// followed by one field terminator byte, hence the subtraction of 24 + 1.
371 directoryLength = ldr.getBaseAddressOfData() - (24 + 1);
\r
373 catch (IOException e) {
\r
374 throw new MarcException("error parsing leader with data: "
\r
375 + new String(byteArray), e);
\r
377 catch (MarcException e) {
\r
// Recovery is only attempted when the body at least ends with a field terminator
// followed by a record terminator.
380 if (recordBuf[recordBuf.length-1] == Constants.RT && recordBuf[recordBuf.length-2] == Constants.FT)
\r
382 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
\r
383 "Error parsing leader, trying to re-read leader either shorter or longer");
\r
384 // make an attempt to recover record.
\r
386 while (offset < recordBuf.length)
\r
388 if (recordBuf[offset] == Constants.FT)
\r
// offset is tested modulo 12 because a directory consists of 12-byte entries; a remainder
// of 1 or 11 is treated as the leader being one byte short or one byte long.
394 if (offset % 12 == 1)
\r
396 // move one byte from body to leader, make new leader, and try again
\r
397 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
\r
398 "Leader appears to be too short, moving one byte from record body to leader, and trying again");
\r
399 byte oldBody[] = recordBuf;
\r
400 recordBuf = new byte[oldBody.length-1];
\r
401 System.arraycopy(oldBody, 1, recordBuf, 0, oldBody.length-1);
\r
402 directoryLength = offset-1;
\r
// The leader values are filled in by hand here instead of being parsed.
403 ldr.setIndicatorCount(2);
\r
404 ldr.setSubfieldCodeLength(2);
\r
405 ldr.setImplDefined1((""+(char)byteArray[7]+" ").toCharArray());
\r
406 ldr.setImplDefined2((""+(char)byteArray[18]+(char)byteArray[19]+(char)byteArray[20]).toCharArray());
\r
407 ldr.setEntryMap("4500".toCharArray());
\r
408 if (byteArray[10] == (byte)' ' || byteArray[10] == (byte)'a') // if its ' ' or 'a'
\r
410 ldr.setCharCodingScheme((char)byteArray[10]);
\r
413 else if (offset % 12 == 11)
\r
415 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
\r
416 "Leader appears to be too long, moving one byte from leader to record body, and trying again");
\r
// Grow the body by one byte at the front and fill that byte with the character '0'.
417 byte oldBody[] = recordBuf;
\r
418 recordBuf = new byte[oldBody.length+1];
\r
419 System.arraycopy(oldBody, 0, recordBuf, 1, oldBody.length);
\r
420 recordBuf[0] = (byte)'0';
\r
421 directoryLength = offset+1;
\r
422 ldr.setIndicatorCount(2);
\r
423 ldr.setSubfieldCodeLength(2);
\r
424 ldr.setImplDefined1((""+(char)byteArray[7]+" ").toCharArray());
\r
425 ldr.setImplDefined2((""+(char)byteArray[16]+(char)byteArray[17]+(char)byteArray[18]).toCharArray());
\r
426 ldr.setEntryMap("4500".toCharArray());
\r
// NOTE(review): the test below inspects byteArray[8] but the assignment that follows
// copies byteArray[10] -- confirm which index is intended.
427 if (byteArray[8] == (byte)' ' || byteArray[8] == (byte)'a') // if its ' ' or 'a'
\r
429 ldr.setCharCodingScheme((char)byteArray[10]);
\r
431 if (byteArray[10] == (byte)' ' || byteArray[10] == (byte)'a') // if its ' ' or 'a'
\r
433 ldr.setCharCodingScheme((char)byteArray[10]);
\r
438 errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
\r
439 "error parsing leader with data: " + new String(byteArray));
\r
440 throw new MarcException("error parsing leader with data: "
\r
441 + new String(byteArray), e);
\r
447 throw new MarcException("error parsing leader with data: "
\r
448 + new String(byteArray), e);
\r
// In permissive mode an entry map other than "4500" is reported; when it is not even four
// digits it is also replaced by "4500".
451 char tmp[] = ldr.getEntryMap();
\r
452 if (permissive && !(""+ tmp[0]+tmp[1]+tmp[2]+tmp[3]).equals("4500"))
\r
454 if (tmp[0] >= '0' && tmp[0] <= '9' &&
\r
455 tmp[1] >= '0' && tmp[1] <= '9' &&
\r
456 tmp[2] >= '0' && tmp[2] <= '9' &&
\r
457 tmp[3] >= '0' && tmp[3] <= '9')
\r
459 errors.addError("unknown", "n/a", "n/a", ErrorHandler.ERROR_TYPO,
\r
460 "Unusual character found at end of leader [ "+tmp[0]+tmp[1]+tmp[2]+tmp[3]+" ]");
\r
464 errors.addError("unknown", "n/a", "n/a", ErrorHandler.ERROR_TYPO,
\r
465 "Erroneous character found at end of leader [ "+tmp[0]+tmp[1]+tmp[2]+tmp[3]+" ]; changing them to the standard \"4500\"");
\r
466 ldr.setEntryMap("4500".toCharArray());
\r
470 // if MARC 21 then check encoding
\r
// The case labels of this switch are not visible here; the visible arms set encoding either
// to defaultEncoding or to "ISO8859_1".
471 switch (ldr.getCharCodingScheme()) {
\r
477 encoding = defaultEncoding;
\r
479 encoding = "ISO8859_1";
\r
483 encoding = defaultEncoding;
\r
485 encoding = "ISO8859_1";
\r
// "BESTGUESS": pick the encoding by inspecting the raw bytes of the record.
490 if (encoding.equalsIgnoreCase("BESTGUESS"))
\r
494 String marc8EscSeqCheck = new String(recordBuf, "ISO-8859-1");
\r
495 // If record has MARC8 character set selection strings, it must be MARC8 encoded
\r
496 if (marc8EscSeqCheck.split("\\e[-(,)$bsp]", 2).length > 1)
\r
498 encoding = "MARC8";
\r
502 boolean hasHighBitChars = false;
\r
503 for (int i = 0; i < recordBuf.length; i++)
\r
505 if (recordBuf[i] < 0) // the high bit is set
\r
507 hasHighBitChars = true;
\r
511 if (!hasHighBitChars)
\r
513 encoding = "ISO8859_1"; // You can choose any encoding you want here, the results will be the same.
\r
// Round-trip test: decode the bytes as UTF-8, re-encode, and compare with the original
// bytes; a length or byte difference switches the guess to "MARC8-Maybe".
517 utfCheck = new String(recordBuf, "UTF-8");
\r
518 byte byteCheck[] = utfCheck.getBytes("UTF-8");
\r
519 encoding = "UTF8";
\r
520 if (recordBuf.length == byteCheck.length)
\r
522 for (int i = 0; i < recordBuf.length; i++)
\r
524 if (byteCheck[i] != recordBuf[i])
\r
526 encoding = "MARC8-Maybe";
\r
533 encoding = "MARC8-Maybe";
\r
538 catch (UnsupportedEncodingException e)
\r
540 // TODO Auto-generated catch block
\r
541 e.printStackTrace();
\r
// Record claims UTF-8: verify the claim with the same UTF-8 round trip.
544 else if (permissive && encoding.equals("UTF8"))
\r
548 utfCheck = new String(recordBuf, "UTF-8");
\r
549 byte byteCheck[] = utfCheck.getBytes("UTF-8");
\r
550 if (recordBuf.length != byteCheck.length)
\r
552 boolean foundESC = false;
\r
553 for (int i = 0; i < recordBuf.length; i++)
\r
// 0x1B is the escape byte; finding one in the data is taken as a sign of MARC8.
555 if (recordBuf[i] == 0x1B)
\r
557 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
\r
558 "Record claims to be UTF-8, but its not. Its probably MARC8.");
\r
559 encoding = "MARC8-Maybe";
\r
563 if (byteCheck[i] != recordBuf[i])
\r
565 encoding = "MARC8-Maybe";
\r
571 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
\r
572 "Record claims to be UTF-8, but its not. It may be MARC8, or maybe UNIMARC, or maybe raw ISO-8859-1 ");
\r
575 if (utfCheck.contains("a$1!"))
\r
577 encoding = "MARC8-Broken";
\r
578 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
\r
579 "Record claims to be UTF-8, but its not. It seems to be MARC8-encoded but with missing escape codes.");
\r
582 catch (UnsupportedEncodingException e)
\r
584 // TODO Auto-generated catch block
\r
585 e.printStackTrace();
\r
// Record does not claim UTF-8: check whether it nevertheless round-trips as UTF-8.
588 else if (permissive && !encoding.equals("UTF8"))
\r
592 utfCheck = new String(recordBuf, "UTF-8");
\r
593 byte byteCheck[] = utfCheck.getBytes("UTF-8");
\r
594 if (recordBuf.length == byteCheck.length)
\r
596 for (int i = 0; i < recordBuf.length; i++)
\r
598 // need to check for byte < 0 to see if the high bit is set, because Java doesn't have unsigned types.
\r
599 if (recordBuf[i] < 0x00 || byteCheck[i] != recordBuf[i])
\r
601 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
\r
602 "Record claims not to be UTF-8, but it seems to be.");
\r
603 encoding = "UTF8-Maybe";
\r
609 catch (UnsupportedEncodingException e)
\r
611 // TODO Auto-generated catch block
\r
612 e.printStackTrace();
\r
615 record.setLeader(ldr);
\r
// Flags for the two work-arounds applied when the directory is one byte too long.
617 boolean discardOneAtStartOfDirectory = false;
\r
618 boolean discardOneSomewhereInDirectory = false;
\r
// A well-formed directory is a whole number of 12-byte entries.
620 if ((directoryLength % 12) != 0)
\r
622 if (permissive && directoryLength % 12 == 11 && recordBuf[1] != (byte)'0')
\r
624 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
\r
625 "Directory length is not a multiple of 12 bytes long. Prepending a zero and trying to continue.");
\r
626 byte oldBody[] = recordBuf;
\r
627 recordBuf = new byte[oldBody.length+1];
\r
628 System.arraycopy(oldBody, 0, recordBuf, 1, oldBody.length);
\r
629 recordBuf[0] = (byte)'0';
\r
630 directoryLength = directoryLength+1;
\r
634 if (permissive && directoryLength % 12 == 1 && recordBuf[1] == (byte)'0' && recordBuf[2] == (byte)'0')
\r
636 discardOneAtStartOfDirectory = true;
\r
637 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
\r
638 "Directory length is not a multiple of 12 bytes long. Discarding byte from start of directory and trying to continue.");
\r
640 else if (permissive && directoryLength % 12 == 1 && recordLength > 10000 && recordBuf[0] == (byte)'0' &&
\r
641 recordBuf[1] == (byte)'0' && recordBuf[2] > (byte)'0' && recordBuf[2] <= (byte)'9')
\r
643 discardOneSomewhereInDirectory = true;
\r
644 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
\r
645 "Directory length is not a multiple of 12 bytes long. Will look for oversized field and try to work around it.");
\r
649 if (errors != null)
\r
651 errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
\r
652 "Directory length is not a multiple of 12 bytes long. Unable to continue.");
\r
654 throw new MarcException("Directory length is not a multiple of 12 bytes long. Unable to continue.");
\r
// From here on the record body is consumed sequentially through inputrec.
658 DataInputStream inputrec = new DataInputStream(new ByteArrayInputStream(recordBuf));
\r
659 int size = directoryLength / 12;
\r
661 String[] tags = new String[size];
\r
662 int[] lengths = new int[size];
\r
// Scratch buffers for one directory entry: 3-byte tag, 4-byte length, 5-byte start position.
664 byte[] tag = new byte[3];
\r
665 byte[] length = new byte[4];
\r
666 byte[] start = new byte[5];
\r
670 if (discardOneAtStartOfDirectory) inputrec.read();
\r
// Running sum of the field lengths read so far, compared against directory digits below.
671 int totalOffset = 0;
\r
672 for (int i = 0; i < size; i++)
\r
674 inputrec.readFully(tag);
\r
675 tmpStr = new String(tag);
\r
678 boolean proceedNormally = true;
\r
679 if (discardOneSomewhereInDirectory)
\r
// Peek at the next 10 bytes and test whether totalOffset appears as five digits at
// position 4 (normal 4+5 layout) or at position 5 (length field is five digits wide).
// NOTE(review): how the stream is repositioned after this peek is not visible here.
681 byte lenCheck[] = new byte[10];
\r
683 inputrec.readFully(lenCheck);
\r
684 if (byteCompare(lenCheck, 4, 5, totalOffset)) // proceed normally
\r
686 proceedNormally = true;
\r
688 else if (byteCompare(lenCheck, 5, 5, totalOffset)) // field length is 5 bytes! Bad Marc record, proceed normally
\r
690 discardOneSomewhereInDirectory = false;
\r
691 errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
\r
692 "Field is longer than 9999 bytes. Writing this record out will result in a bad record.");
\r
693 proceedNormally = false;
\r
697 errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
\r
698 "Unable to reconcile problems in directory. Unable to continue.");
\r
699 throw new MarcException("Directory length is not a multiple of 12 bytes long. Unable to continue.");
\r
703 if (proceedNormally)
\r
705 inputrec.readFully(length);
\r
706 tmpStr = new String(length);
\r
707 lengths[i] = Integer.parseInt(tmpStr);
\r
// The start position is read to advance the stream; its value is not used in the lines shown.
709 inputrec.readFully(start);
\r
711 else // length is 5 bytes long
\r
// The 5-byte start buffer doubles as the buffer for the oversized 5-digit length.
713 inputrec.readFully(start);
\r
714 tmpStr = new String(start);
\r
715 lengths[i] = Integer.parseInt(tmpStr);
\r
717 inputrec.readFully(start);
\r
719 totalOffset += lengths[i];
\r
722 // If we still haven't found the extra byte, throw out the last byte and try to continue;
\r
723 if (discardOneSomewhereInDirectory) inputrec.read();
\r
725 if (inputrec.read() != Constants.FT)
\r
// NOTE(review): errors can be null when the reader is not permissive, and no null guard is
// visible around this call or the similar FATAL reports further down -- confirm.
727 errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
\r
728 "Expected field terminator at end of directory. Unable to continue.");
\r
729 throw new MarcException("expected field terminator at end of directory");
\r
732 int numBadLengths = 0;
\r
734 int totalLength = 0;
\r
// Second pass: read each field's data in directory order.
735 for (int i = 0; i < size; i++)
\r
737 int fieldLength = getFieldLength(inputrec);
\r
// In permissive mode a length measured in the data that disagrees with the directory
// replaces the directory value, subject to the limits tested on the next line.
738 if (fieldLength+1 != lengths[i] && permissive)
\r
740 if (numBadLengths < 3 && (totalLength + fieldLength < recordLength + 26))
\r
743 lengths[i] = fieldLength+1;
\r
744 errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
\r
745 "Field length found in record different from length stated in the directory.");
\r
746 if (fieldLength+1 > 9999)
\r
748 errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
\r
749 "Field length is greater than 9999, record cannot be represented as a binary Marc record.");
\r
753 totalLength += lengths[i];
\r
754 if (isControlField(tags[i]))
\r
// Control field: the data is everything except the trailing field terminator, which is
// read and checked separately.
756 byteArray = new byte[lengths[i] - 1];
\r
757 inputrec.readFully(byteArray);
\r
759 if (inputrec.read() != Constants.FT)
\r
761 errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
\r
762 "Expected field terminator at end of field. Unable to continue.");
\r
763 throw new MarcException("expected field terminator at end of field");
\r
766 ControlField field = factory.newControlField();
\r
767 field.setTag(tags[i]);
\r
768 field.setData(getDataAsString(byteArray));
\r
769 record.addVariableField(field);
\r
// Data field: the whole field, terminator included, is handed to parseDataField().
774 byteArray = new byte[lengths[i]];
\r
775 inputrec.readFully(byteArray);
\r
777 record.addVariableField(parseDataField(tags[i], byteArray));
\r
778 } catch (IOException e) {
\r
779 throw new MarcException(
\r
780 "error parsing data field for tag: " + tags[i]
\r
782 + new String(byteArray), e);
\r
787 // We've determined that although the record says it is UTF-8, it is not.
\r
788 // Here we make an attempt to determine the actual encoding of the data in the record.
\r
789 if (permissive && conversionCheck1.length() > 1 &&
\r
790 conversionCheck2.length() > 1 && conversionCheck3.length() > 1)
\r
792 guessAndSelectCorrectNonUTF8Encoding();
\r
// After the last field the next byte must be the record terminator.
794 if (inputrec.read() != Constants.RT)
\r
796 errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
\r
797 "Expected record terminator at end of record. Unable to continue.");
\r
798 throw new MarcException("expected record terminator");
\r
801 catch (IOException e)
\r
803 errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
\r
804 "Error reading from data file. Unable to continue.");
\r
805 throw new MarcException("an error occured reading input", e);
\r
809 private boolean byteCompare(byte[] lenCheck, int offset, int length, int totalOffset)
\r
812 for (int i = offset + length - 1; i >= offset; i-- , divisor *= 10)
\r
814 if (((totalOffset / divisor) % 10) + '0' != lenCheck[i])
\r
// Asks Verifier whether the tag denotes a control field. A tag that makes the Verifier throw
// NumberFormatException is reported through errors as a typo-level problem.
// NOTE(review): the lines surrounding the catch body and the return statement are not
// visible in this copy of the file, so whether the report is limited to permissive mode
// cannot be confirmed from here.
822 private boolean isControlField(String tag)
\r
// Stays false unless the Verifier call below succeeds and says otherwise.
824 boolean isControl = false;
\r
826 isControl = Verifier.isControlField(tag);
\r
828 catch (NumberFormatException nfe)
\r
832 errors.addError(record.getControlNumber(), tag, "n/a", ErrorHandler.ERROR_TYPO,
\r
833 "Field tag contains non-numeric characters (" + tag + ").");
\r
// Decides which of three candidate translations of the record's data to keep, using the
// accumulated strings conversionCheck1, conversionCheck2 and conversionCheck3, then rewrites
// every subfield that still carries all candidates (separated by the marker "%%@%%") so that
// it holds only the chosen one.
// NOTE(review): judging by the log messages the three candidates are MARC8, UNIMARC and
// ISO-8859-1 in that order -- confirm where the conversionCheck strings are built. The
// assignments to partToUse in most branches are not visible in this copy of the file.
840 private void guessAndSelectCorrectNonUTF8Encoding()
\r
// Fallback choice: index 0 when the record has a 245 field, index 1 when it does not.
842 int defaultPart = 0;
\r
843 if (record.getVariableField("245") == null) defaultPart = 1;
\r
845 int l1 = conversionCheck1.length();
\r
846 int l2 = conversionCheck2.length();
\r
847 int l3 = conversionCheck3.length();
\r
// The branches below are tried in order; the first one whose condition holds wins.
850 if (l1 < l3 && l2 == l3 && defaultPart == 0)
\r
852 errors.addError(ErrorHandler.INFO, "MARC8 translation shorter than ISO-8859-1, choosing MARC8.");
\r
855 else if (l2 < l1-2 && l2 < l3-2 )
\r
857 errors.addError(ErrorHandler.INFO, "Unimarc translation shortest, choosing it.");
\r
// onlyOneStartsWithUpperCase() returns -1 when it cannot single out one candidate.
860 else if ((tst = onlyOneStartsWithUpperCase(conversionCheck1, conversionCheck2, conversionCheck3)) != -1)
\r
864 else if (l2 < l1 && l2 < l3 )
\r
866 errors.addError(ErrorHandler.INFO, "Unimarc translation shortest, choosing it.");
\r
869 else if (conversionCheck2.equals(conversionCheck3) && !conversionCheck1.trim().contains(" "))
\r
871 errors.addError(ErrorHandler.INFO, "Unimarc and ISO-8859-1 translations identical, choosing ISO-8859-1.");
\r
874 else if (!specialCharIsBetweenLetters(conversionCheck1))
\r
876 errors.addError(ErrorHandler.INFO, "To few letters in translations, choosing "+(defaultPart == 0 ? "MARC8" : "Unimarc"));
\r
877 partToUse = defaultPart;
\r
879 else if (l2 == l1 && l2 == l3)
\r
881 errors.addError(ErrorHandler.INFO, "All three version equal length. Choosing ISO-8859-1 ");
\r
884 else if (l2 == l3 && defaultPart == 1)
\r
886 errors.addError(ErrorHandler.INFO, "Unimarc and ISO-8859-1 translations equal length, choosing ISO-8859-1.");
\r
891 errors.addError(ErrorHandler.INFO, "No Determination made, defaulting to "+ (defaultPart == 0 ? "MARC8" : "Unimarc") );
\r
892 partToUse = defaultPart;
\r
// Apply the decision: walk every data field and every subfield of the record.
894 List<VariableField> fields = record.getVariableFields();
\r
895 Iterator<VariableField> iter = fields.iterator();
\r
896 while (iter.hasNext())
\r
898 VariableField field = iter.next();
\r
899 if (field instanceof DataField)
\r
901 DataField df = (DataField)field;
\r
902 List<Subfield> subf = df.getSubfields();
\r
903 Iterator<Subfield> sfiter = subf.iterator();
\r
904 while (sfiter.hasNext())
\r
906 Subfield sf = sfiter.next();
\r
// Only subfields that still contain the marker hold multiple candidates.
907 if (sf.getData().contains("%%@%%"))
\r
// Split into at most three pieces and keep the piece selected above.
909 String parts[] = sf.getData().split("%%@%%", 3);
\r
910 sf.setData(parts[partToUse]);
\r
917 private int onlyOneStartsWithUpperCase(String conversionCheck12, String conversionCheck22, String conversionCheck32)
\r
919 if (conversionCheck1.length() == 0 || conversionCheck2.length() == 0 || conversionCheck3.length() == 0) return -1;
\r
920 String check1Parts[] = conversionCheck1.trim().split("[|]>");
\r
921 String check2Parts[] = conversionCheck2.trim().split("[|]>");
\r
922 String check3Parts[] = conversionCheck3.trim().split("[|]>");
\r
923 for (int i = 1; i < check1Parts.length && i < check2Parts.length && i < check3Parts.length; i++)
\r
925 boolean tst1 = Character.isUpperCase(check1Parts[i].charAt(0));
\r
926 boolean tst2 = Character.isUpperCase(check2Parts[i].charAt(0));
\r
927 boolean tst3 = Character.isUpperCase(check3Parts[i].charAt(0));
\r
928 if (tst1 && !tst2 && !tst3)
\r
930 if (!tst1 && tst2 && !tst3)
\r
932 if (!tst1 && !tst2 && tst3)
\r
// Scans conversionCheck for characters above 0x7f and reports whether such a character has a
// letter directly before or directly after it. Returns true when no such character is found.
// NOTE(review): the lines between the inner assignment and the return are not visible in this
// copy of the file, so it cannot be confirmed from here whether the scan stops at the first
// non-ASCII character or continues and lets the last one decide.
938 private boolean specialCharIsBetweenLetters(String conversionCheck)
\r
// Default result when the string contains no character above 0x7f.
940 boolean bewteenLetters = true;
\r
941 for (int i = 0; i < conversionCheck.length(); i++)
\r
943 int charCode = (int)(conversionCheck.charAt(i));
\r
944 if (charCode > 0x7f)
\r
// Assume the character is isolated until a neighbouring letter is found.
946 bewteenLetters = false;
\r
// && binds tighter than ||, so this reads: (has a previous char and it is a letter)
// OR (has a next char and it is a letter). One neighbouring letter is enough.
947 if (i > 0 && Character.isLetter((int)(conversionCheck.charAt(i-1))) ||
\r
948 (i < conversionCheck.length()-1 && Character.isLetter((int)(conversionCheck.charAt(i+1)))))
\r
950 bewteenLetters = true;
\r
955 return(bewteenLetters);
\r
958 private int arrayContainsAt(byte[] byteArray, int ft)
\r
960 for (int i = 0; i < byteArray.length; i++)
\r
962 if (byteArray[i] == (byte)ft) return(i);
\r
// Builds a DataField for the given tag from the raw bytes of one field: the first two bytes
// become the indicators, and the remaining bytes are read as a sequence of subfields that are
// added to the field one by one.
// Throws IOException ("unexpected end of data field") from the subfield-reading code below.
// NOTE(review): the loop and most of the switch skeleton (case labels, breaks, local
// declarations, the return) are not visible in this copy of the file; the comments below
// describe only the statements that are shown.
967 private DataField parseDataField(String tag, byte[] field) throws IOException
\r
// Tell the error handler which record and field later messages belong to.
// NOTE(review): no null guard on errors is visible here -- confirm one exists.
971 errors.setRecordID(record.getControlNumber());
\r
972 errors.setCurrentField(tag);
\r
973 errors.setCurrentSubfield("n/a");
\r
// Runs on the raw bytes before anything is read from them.
974 cleanupBadFieldSeperators(field);
\r
976 ByteArrayInputStream bais = new ByteArrayInputStream(field);
\r
// The first two bytes of a data field are its indicators.
977 char ind1 = (char) bais.read();
\r
978 char ind2 = (char) bais.read();
\r
980 DataField dataField = factory.newDataField();
\r
981 dataField.setTag(tag);
\r
982 dataField.setIndicator1(ind1);
\r
983 dataField.setIndicator2(ind2);
\r
991 readByte = bais.read();
\r
994 switch (readByte) {
\r
// The byte following the delimiter is the subfield code.
996 code = bais.read();
\r
998 throw new IOException("unexpected end of data field");
\r
999 if (code == Constants.FT)
\r
// Size the data buffer from the length reported for this subfield.
1001 size = getSubfieldLength(bais);
\r
1002 data = new byte[size];
\r
1004 subfield = factory.newSubfield();
\r
1005 if (permissive) errors.setCurrentSubfield("" + (char)code);
\r
1006 String dataAsString = getDataAsString(data);
\r
// Permissive repair: the "code" is itself a subfield separator, so the first character of
// the data is dropped from the data (the message says it is used as the code instead).
1007 if (permissive && code == Constants.US)
\r
1010 dataAsString = dataAsString.substring(1);
\r
1011 errors.addError(ErrorHandler.MAJOR_ERROR,
\r
1012 "Subfield tag is a subfield separator, using first character of field as subfield tag.");
\r
1014 subfield.setCode((char) code);
\r
1015 subfield.setData(dataAsString);
\r
1016 dataField.addSubfield(subfield);
\r
1018 case Constants.FT:
\r
// Converter used by cleanupBadFieldSeperators to test whether three bytes form a known
// multibyte character. It is created on first use (see the null check in the method);
// no locking is visible around that initialisation.
1025 static AnselToUnicode conv = null;
\r
// Scans the raw bytes of a field and repairs, in place, subfield separator bytes
// (Constants.US) that appear where they cannot be real separators, recording an error
// for each repair. While scanning it tracks ESC (0x1B) sequences to know whether the
// current position is inside multibyte data.
// NOTE(review): short lines (some assignments such as the ones that overwrite field[i],
// local declarations, braces) are absent from this listing; the comments below describe
// only what the visible statements show.
// NOTE(review): the loop guarantees only that field[i+1] exists, yet several branches
// read field[i+2] and field[i+3] without a visible bounds check - confirm these cannot
// run past the end of the array.
1027 private void cleanupBadFieldSeperators(byte[] field)
\r
1029 if (conv == null) conv = new AnselToUnicode(true);
\r
1030 boolean hasEsc = false;
\r
1031 boolean inMultiByte = false;
\r
1032 boolean justCleaned = false;
\r
1035 for (int i = 0 ; i < field.length-1; i++)
\r
// 0x1B is ESC: the bytes that follow decide whether multibyte mode is left or entered.
1037 if (field[i] == 0x1B)
\r
// ESC followed by one of ( , ) - ' switches multibyte mode off.
1040 if ("(,)-'".indexOf((char)field[i+1]) != -1)
\r
1042 inMultiByte = false;
\r
// ESC $ 1 switches multibyte mode on.
1044 else if (i + 2 < field.length && field[i+1] == '$' && field[i+2] == '1')
\r
1046 inMultiByte = true;
\r
// Looser match: '$' and '1' are each also accepted one position later.
1049 else if (i + 3 < field.length && (field[i+1] == '$' || field[i+2] == '$')&& ( field[i+2] == '1' || field[i+3] == '1'))
\r
1051 inMultiByte = true;
\r
// Inside multibyte data mbOffset cycles 0 -> 2 -> 1 -> 0 over non-space bytes;
// the value 0 marks the position at which a three-byte character is checked below.
1056 else if (inMultiByte && field[i] != 0x20) mbOffset = ( mbOffset == 0) ? 2 : mbOffset - 1;
\r
1057 if (inMultiByte && mbOffset == 0 && i + 2 < field.length)
\r
// Gather three bytes, stepping over a space (0x20) found at i+1 or i+2.
// NOTE(review): field[i+3] is read when a space was skipped, but the guard above only
// ensures that i+2 is inside the array.
1060 byte f1 = field[i];
\r
1061 byte f2 = field[i+1] == 0x20 ? field[i+2] : field[i+1];
\r
1062 byte f3 = (field[i+1] == 0x20 || field[i+2] == 0x20) ? field[i+3] : field[i+2];
\r
// Separator bytes are treated as vertical bars (0x7C) when asking the converter
// whether the three bytes form a multibyte character; 0 is handled as "no such character".
1063 c = conv.getMBChar(conv.makeMultibyte((char)((f1 == Constants.US) ? 0x7C : f1),
\r
1064 (char)((f2 == Constants.US) ? 0x7C : f2),
\r
1065 (char)((f3 == Constants.US) ? 0x7C : f3)));
\r
1066 if (c == 0 && !justCleaned)
\r
1068 errors.addError(ErrorHandler.MAJOR_ERROR,
\r
1069 "Bad Multibyte character found, reinterpreting data as non-multibyte data");
\r
1070 inMultiByte = false;
\r
// Directly after a repair, retry the lookup with '!' as the first byte.
1072 else if (c == 0 && justCleaned)
\r
1074 c = conv.getMBChar(conv.makeMultibyte('!',(char)((f2 == Constants.US) ? 0x7C : f2),
\r
1075 (char)((f3 == Constants.US) ? 0x7C : f3)));
\r
1078 errors.addError(ErrorHandler.MAJOR_ERROR,
\r
1079 "Bad Multibyte character found, reinterpreting data as non-multibyte data");
\r
1080 inMultiByte = false;
\r
1084 errors.addError(ErrorHandler.MAJOR_ERROR,
\r
1085 "Character after restored vertical bar character makes bad multibyte character, changing it to \"!\"");
\r
1090 justCleaned = false;
\r
// From here on: handling of a subfield separator byte at position i.
1091 if (field[i] == Constants.US )
\r
// Separator in the middle of a multibyte character.
1093 if (inMultiByte && mbOffset != 0)
\r
1096 errors.addError(ErrorHandler.MAJOR_ERROR,
\r
1097 "Subfield separator found in middle of a multibyte character, changing it to a vertical bar, and continuing");
\r
1098 if (field[i+1] == '0')
\r
// "0(B" after the separator: the '0' is rewritten to ESC (0x1B).
1100 if (field[i+2] == '(' && field[i+3] == 'B' )
\r
1102 field[i+1] = 0x1B;
\r
1103 errors.addError(ErrorHandler.MAJOR_ERROR,
\r
1104 "Character after restored vertical bar character makes bad multibyte character, changing it to ESC");
\r
// Otherwise the '0' is rewritten to '!' (0x21).
1108 field[i+1] = 0x21;
\r
1109 errors.addError(ErrorHandler.MAJOR_ERROR,
\r
1110 "Character after restored vertical bar character makes bad multibyte character, changing it to \"!\"");
\r
1113 justCleaned = true;
\r
// After an ESC has been seen, a separator must be followed by a-z or 0-9.
1115 else if (hasEsc && !((field[i+1] >= 'a' && field[i+1] <= 'z') || (field[i+1] >= '0' && field[i+1] <= '9')))
\r
1117 errors.addError(ErrorHandler.MAJOR_ERROR,
\r
1118 "Subfield separator followed by invalid subfield tag, changing separator to a vertical bar, and continuing");
\r
1120 justCleaned = true;
\r
// Separator followed by "0(B": the '0' is rewritten to ESC.
1122 else if (hasEsc && i < field.length-3 &&
\r
1123 (field[i+1] == '0' && field[i+2] == '(' && field[i+3] == 'B' ))
\r
1125 errors.addError(ErrorHandler.MAJOR_ERROR,
\r
1126 "Subfield separator followed by invalid subfield tag, changing separator to a vertical bar, and continuing");
\r
1128 field[i+1] = 0x1B;
\r
1129 justCleaned = true;
\r
// Separator followed by any other '0': the '0' is rewritten to '!'.
1131 else if (hasEsc && (field[i+1] == '0' ))
\r
1133 errors.addError(ErrorHandler.MAJOR_ERROR,
\r
1134 "Subfield separator followed by invalid subfield tag, changing separator to a vertical bar, and continuing");
\r
1136 field[i+1] = 0x21;
\r
1137 justCleaned = true;
\r
// Three separators in a row; the second one is rewritten to a vertical bar here.
1139 else if (field[i+1] == Constants.US && field[i+2] == Constants.US )
\r
1141 errors.addError(ErrorHandler.MAJOR_ERROR,
\r
1142 "Three consecutive subfield separators, changing first two to vertical bars.");
\r
1144 field[i+1] = 0x7C;
\r
1145 justCleaned = true;
\r
// Measures a field by reading bytes from the stream, dispatching on each byte read and
// keeping a running count in bytesRead.
// Visible outcomes for the unterminated case: a MINOR_ERROR is recorded and the count so
// far is returned, or an IOException("Field not terminated") is thrown.
// NOTE(review): short lines (loop header, further case labels, the counter increment and
// the condition choosing between the two outcomes above - presumably permissive mode)
// are absent from this listing; verify against the full source.
1151 private int getFieldLength(DataInputStream bais) throws IOException
\r
1154 int bytesRead = 0;
\r
1156 switch (bais.read()) {
\r
1157 case Constants.FT:
\r
1164 errors.addError(ErrorHandler.MINOR_ERROR,
\r
1165 "Field not terminated trying to continue");
\r
1166 return (bytesRead);
\r
1169 throw new IOException("Field not terminated");
\r
1170 case Constants.US:
\r
// Measures a subfield by reading bytes from the stream, dispatching on each byte read
// and keeping a running count in bytesRead. Both Constants.FT and Constants.US have
// their own case label.
// Visible outcomes for the unterminated case: a MINOR_ERROR is recorded and the count so
// far is returned, or an IOException("subfield not terminated") is thrown.
// NOTE(review): short lines (loop header, case bodies, the counter increment and the
// condition choosing between the two outcomes above - presumably permissive mode) are
// absent from this listing; verify against the full source.
1177 private int getSubfieldLength(ByteArrayInputStream bais) throws IOException {
\r
1179 int bytesRead = 0;
\r
1181 switch (bais.read()) {
\r
1182 case Constants.FT:
\r
1185 case Constants.US:
\r
1192 errors.addError(ErrorHandler.MINOR_ERROR, "Subfield not terminated trying to continue");
\r
1193 return (bytesRead);
\r
1196 throw new IOException("subfield not terminated");
\r
1203 private int parseRecordLength(byte[] leaderData) throws IOException {
\r
1204 InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(
\r
1207 char[] tmp = new char[5];
\r
1210 length = Integer.parseInt(new String(tmp));
\r
1211 } catch (NumberFormatException e) {
\r
1212 errors.addError(ErrorHandler.FATAL,
\r
1213 "Unable to parse record length, Unable to Continue");
\r
1214 throw new MarcException("unable to parse record length", e);
\r
1219 private void parseLeader(Leader ldr, byte[] leaderData) throws IOException {
\r
1220 InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(
\r
1222 char[] tmp = new char[5];
\r
1224 // Skip over bytes for record length, If we get here, its already been computed.
\r
1225 ldr.setRecordStatus((char) isr.read());
\r
1226 ldr.setTypeOfRecord((char) isr.read());
\r
1227 tmp = new char[2];
\r
1229 ldr.setImplDefined1(tmp);
\r
1230 ldr.setCharCodingScheme((char) isr.read());
\r
1231 char indicatorCount = (char) isr.read();
\r
1232 char subfieldCodeLength = (char) isr.read();
\r
1233 char baseAddr[] = new char[5];
\r
1234 isr.read(baseAddr);
\r
1235 tmp = new char[3];
\r
1237 ldr.setImplDefined2(tmp);
\r
1238 tmp = new char[4];
\r
1240 ldr.setEntryMap(tmp);
\r
1243 ldr.setIndicatorCount(Integer.parseInt(String.valueOf(indicatorCount)));
\r
1244 } catch (NumberFormatException e) {
\r
1245 throw new MarcException("unable to parse indicator count", e);
\r
1248 ldr.setSubfieldCodeLength(Integer.parseInt(String
\r
1249 .valueOf(subfieldCodeLength)));
\r
1250 } catch (NumberFormatException e) {
\r
1251 throw new MarcException("unable to parse subfield code length", e);
\r
1254 ldr.setBaseAddressOfData(Integer.parseInt(new String(baseAddr)));
\r
1255 } catch (NumberFormatException e) {
\r
1256 throw new MarcException("unable to parse base address of data", e);
\r
1261 private String getDataAsString(byte[] bytes)
\r
1263 String dataElement = null;
\r
1264 if (encoding.equals("UTF-8") || encoding.equals("UTF8"))
\r
1267 dataElement = new String(bytes, "UTF-8");
\r
1269 catch (UnsupportedEncodingException e) {
\r
1270 throw new MarcException("unsupported encoding", e);
\r
1273 else if (encoding.equals("UTF8-Maybe"))
\r
1276 dataElement = new String(bytes, "UTF-8");
\r
1278 catch (UnsupportedEncodingException e) {
\r
1279 throw new MarcException("unsupported encoding", e);
\r
1282 else if (encoding.equals("MARC-8") || encoding.equals("MARC8"))
\r
1284 dataElement = getMarc8Conversion(bytes);
\r
1286 else if (encoding.equalsIgnoreCase("Unimarc") || encoding.equals("IS05426"))
\r
1288 dataElement = getUnimarcConversion(bytes);
\r
1290 else if (encoding.equals("MARC8-Maybe"))
\r
1292 String dataElement1 = getMarc8Conversion(bytes);
\r
1293 String dataElement2 = getUnimarcConversion(bytes);
\r
1294 String dataElement3 = null;
\r
1297 dataElement3 = new String(bytes, "ISO-8859-1");
\r
1299 catch (UnsupportedEncodingException e)
\r
1301 // TODO Auto-generated catch block
\r
1302 e.printStackTrace();
\r
1304 if (dataElement1.equals(dataElement2) && dataElement1.equals(dataElement3))
\r
1306 dataElement = dataElement1;
\r
1310 conversionCheck1 = conversionCheck1 + "|>" + Normalizer.compose(dataElement1, false);
\r
1311 conversionCheck2 = conversionCheck2 + "|>" + dataElement2;
\r
1312 conversionCheck3 = conversionCheck3 + "|>" + dataElement3;
\r
1313 dataElement = dataElement1 + "%%@%%" + dataElement2 + "%%@%%" + dataElement3;
\r
1316 else if (encoding.equals("MARC8-Broken"))
\r
1320 dataElement = new String(bytes, "ISO-8859-1");
\r
1322 catch (UnsupportedEncodingException e)
\r
1324 // TODO Auto-generated catch block
\r
1325 e.printStackTrace();
\r
1327 String newdataElement = dataElement.replaceAll("<", "<");
\r
1328 newdataElement = newdataElement.replaceAll(">", ">");
\r
1329 newdataElement = newdataElement.replaceAll("&", "&");
\r
1330 newdataElement = newdataElement.replaceAll("'", "'");
\r
1331 newdataElement = newdataElement.replaceAll(""", "\"");
\r
1332 if (!newdataElement.equals(dataElement))
\r
1334 dataElement = newdataElement;
\r
1335 errors.addError(ErrorHandler.ERROR_TYPO, "Subfield contains escaped html character entities, un-escaping them. ");
\r
1337 String rep1 = ""+(char)0x1b+"\\$1$1";
\r
1338 String rep2 = ""+(char)0x1b+"\\(B";
\r
1339 newdataElement = dataElement.replaceAll("\\$1(.)", rep1);
\r
1340 newdataElement = newdataElement.replaceAll("\\(B", rep2);
\r
1341 if (!newdataElement.equals(dataElement))
\r
1343 dataElement = newdataElement;
\r
1344 errors.addError(ErrorHandler.MAJOR_ERROR, "Subfield seems to be missing MARC8 escape sequences, trying to restore them.");
\r
1348 dataElement = getMarc8Conversion(dataElement.getBytes("ISO-8859-1"));
\r
1350 catch (UnsupportedEncodingException e)
\r
1352 // TODO Auto-generated catch block
\r
1353 e.printStackTrace();
\r
1357 else if (encoding.equals("ISO-8859-1") || encoding.equals("ISO8859_1"))
\r
1360 dataElement = new String(bytes, "ISO-8859-1");
\r
1362 catch (UnsupportedEncodingException e) {
\r
1363 throw new MarcException("unsupported encoding", e);
\r
1368 throw new MarcException("Unknown or unsupported Marc character encoding:" + encoding);
\r
1370 if (errors != null && dataElement.matches("[^&]*&[a-z]*;.*"))
\r
1372 String newdataElement = dataElement.replaceAll("<", "<");
\r
1373 newdataElement = newdataElement.replaceAll(">", ">");
\r
1374 newdataElement = newdataElement.replaceAll("&", "&");
\r
1375 newdataElement = newdataElement.replaceAll("'", "'");
\r
1376 newdataElement = newdataElement.replaceAll(""", "\"");
\r
1377 if (!newdataElement.equals(dataElement))
\r
1379 dataElement = newdataElement;
\r
1380 errors.addError(ErrorHandler.ERROR_TYPO, "Subfield contains escaped html character entities, un-escaping them. ");
\r
1383 return dataElement;
\r
1386 private boolean byteArrayContains(byte[] bytes, byte[] seq)
\r
1388 for ( int i = 0; i < bytes.length - seq.length; i++)
\r
1390 if (bytes[i] == seq[0])
\r
1392 for (int j = 0; j < seq.length; j++)
\r
1394 if (bytes[i+j] != seq[j])
\r
1398 if (j == seq.length-1) return(true);
\r
1405 static byte badEsc[] = { (byte)('b'), (byte)('-'), 0x1b, (byte)('s') };
\r
1406 static byte overbar[] = { (byte)(char)(0xaf) };
\r
1408 private String getMarc8Conversion(byte[] bytes)
\r
1410 String dataElement = null;
\r
1411 if (converterAnsel == null) converterAnsel = new AnselToUnicode(errors);
\r
1412 if (permissive && (byteArrayContains(bytes, badEsc) || byteArrayContains(bytes, overbar)))
\r
1414 String newDataElement = null;
\r
1417 dataElement = new String(bytes, "ISO-8859-1");
\r
1418 newDataElement = dataElement.replaceAll("(\\e)b-\\es([psb])", "$1$2");
\r
1419 if (!newDataElement.equals(dataElement))
\r
1421 dataElement = newDataElement;
\r
1422 errors.addError(ErrorHandler.MINOR_ERROR, "Subfield contains odd pattern of subscript or superscript escapes. ");
\r
1424 newDataElement = dataElement.replace((char)0xaf, (char)0xe5);
\r
1425 if (!newDataElement.equals(dataElement))
\r
1427 dataElement = newDataElement;
\r
1428 errors.addError(ErrorHandler.ERROR_TYPO, "Subfield contains 0xaf overbar character, changing it to proper MARC8 representation ");
\r
1430 dataElement = converterAnsel.convert(dataElement);
\r
1432 catch (UnsupportedEncodingException e)
\r
1434 // TODO Auto-generated catch block
\r
1435 e.printStackTrace();
\r
1440 dataElement = converterAnsel.convert(bytes);
\r
1442 if (permissive && dataElement.matches("[^&]*&#x[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f];.*"))
\r
1444 Pattern pattern = Pattern.compile("&#x([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]);");
\r
1445 Matcher matcher = pattern.matcher(dataElement);
\r
1446 StringBuffer newElement = new StringBuffer();
\r
1448 while (matcher.find())
\r
1450 newElement.append(dataElement.substring(prevEnd, matcher.start()));
\r
1451 newElement.append(getChar(matcher.group(1)));
\r
1452 prevEnd = matcher.end();
\r
1454 newElement.append(dataElement.substring(prevEnd));
\r
1455 dataElement = newElement.toString();
\r
1457 return(dataElement);
\r
1460 private String getUnimarcConversion(byte[] bytes)
\r
1462 if (converterUnimarc == null) converterUnimarc = new Iso5426ToUnicode();
\r
1463 String dataElement = converterUnimarc.convert(bytes);
\r
1464 dataElement = dataElement.replaceAll("\u0088", "");
\r
1465 dataElement = dataElement.replaceAll("\u0089", "");
\r
1466 // for ( int i = 0 ; i < bytes.length; i++)
\r
1468 // if (bytes[i] == -120 || bytes[i] == -119)
\r
1470 // char tmp = (char)bytes[i];
\r
1471 // char temp2 = dataElement.charAt(0);
\r
1472 // char temp3 = dataElement.charAt(4);
\r
1473 // int tmpi = (int)tmp;
\r
1474 // int tmp2 = (int)temp2;
\r
1475 // int tmp3 = (int)temp3;
\r
1480 if (dataElement.matches("[^<]*<U[+][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]>.*"))
\r
1482 Pattern pattern = Pattern.compile("<U[+]([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])>");
\r
1483 Matcher matcher = pattern.matcher(dataElement);
\r
1484 StringBuffer newElement = new StringBuffer();
\r
1486 while (matcher.find())
\r
1488 newElement.append(dataElement.substring(prevEnd, matcher.start()));
\r
1489 newElement.append(getChar(matcher.group(1)));
\r
1490 prevEnd = matcher.end();
\r
1492 newElement.append(dataElement.substring(prevEnd));
\r
1493 dataElement = newElement.toString();
\r
1495 return(dataElement);
\r
1499 private String getChar(String charCodePoint)
\r
1501 int charNum = Integer.parseInt(charCodePoint, 16);
\r
1502 String result = ""+((char)charNum);
\r