1 // $Id: MarcStreamReader.java,v 1.11 2008/09/26 21:17:42 haschart Exp $
\r
3 * Copyright (C) 2004 Bas Peters
\r
5 * This file is part of MARC4J
\r
7 * MARC4J is free software; you can redistribute it and/or
\r
8 * modify it under the terms of the GNU Lesser General Public
\r
9 * License as published by the Free Software Foundation; either
\r
10 * version 2.1 of the License, or (at your option) any later version.
\r
12 * MARC4J is distributed in the hope that it will be useful,
\r
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
\r
15 * Lesser General Public License for more details.
\r
17 * You should have received a copy of the GNU Lesser General Public
\r
18 * License along with MARC4J; if not, write to the Free Software
\r
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
\r
23 import java.io.BufferedInputStream;
\r
24 import java.io.ByteArrayInputStream;
\r
25 import java.io.DataInputStream;
\r
26 import java.io.EOFException;
\r
27 import java.io.IOException;
\r
28 import java.io.InputStream;
\r
29 import java.io.InputStreamReader;
\r
30 import java.io.UnsupportedEncodingException;
\r
32 import org.marc4j.converter.CharConverter;
\r
33 import org.marc4j.converter.impl.AnselToUnicode;
\r
34 import org.marc4j.marc.ControlField;
\r
35 import org.marc4j.marc.DataField;
\r
36 import org.marc4j.marc.Leader;
\r
37 import org.marc4j.marc.MarcFactory;
\r
38 import org.marc4j.marc.Record;
\r
39 import org.marc4j.marc.Subfield;
\r
40 import org.marc4j.marc.impl.Verifier;
\r
43 * An iterator over a collection of MARC records in ISO 2709 format.
\r
48 * InputStream input = new FileInputStream("file.mrc");
\r
49 * MarcReader reader = new MarcStreamReader(input);
\r
50 * while (reader.hasNext()) {
\r
51 * Record record = reader.next();
\r
57 * Check the {@link org.marc4j.marc} package for examples about the use of
\r
58 * the {@link org.marc4j.marc.Record} object model.
\r
62 * When no encoding is given as a constructor argument the parser tries to
\r
63 * resolve the encoding by looking at the character coding scheme (leader
\r
64 * position 9) in MARC21 records. For UNIMARC records this position is not
\r
68 * @author Bas Peters
\r
69 * @version $Revision: 1.11 $
\r
72 public class MarcStreamReader implements MarcReader {
\r
74 private DataInputStream input = null;
\r
76 private Record record;
\r
78 private MarcFactory factory;
\r
80 private String encoding = "ISO8859_1";
\r
82 private boolean override = false;
\r
84 private CharConverter converterAnsel = null;
\r
86 private boolean setBadIndicators = true;
\r
88 byte[] leaderBuffer;
\r
/**
 * Constructs an instance that reads records from the specified input stream
 * without an explicit encoding; equivalent to calling the two-argument
 * constructor with a <code>null</code> encoding.
 *
 * @param input the stream to read MARC records from
 */
public MarcStreamReader(InputStream input) {
    this(input, null);
}
\r
/**
 * Constructs an instance that reads records from the specified input stream
 * using the specified character encoding.
 *
 * @param input    the stream to read MARC records from; it is wrapped in a
 *                 buffered data stream
 * @param encoding the encoding name to use for field data, or
 *                 <code>null</code> to keep the default
 */
public MarcStreamReader(InputStream input, String encoding) {
    this.input = new DataInputStream(new BufferedInputStream(input));
    this.factory = MarcFactory.newInstance();

    if (encoding == null) {
        // No explicit encoding: keep the default and leave the override flag off.
        return;
    }
    this.encoding = encoding;
    this.override = true;
}
\r
109 * Returns true if the iteration has more records, false otherwise.
\r
111 public boolean hasNext() {
\r
114 available = input.available();
\r
116 leaderBuffer = new byte[24];
\r
117 input.readFully(leaderBuffer);
\r
118 } catch (EOFException eof) {
\r
119 // If we are not capable of reading the leader before EOF, we cannot read a record
\r
120 // This happens when we read gzipped marc files, that it returns available bytes, but none is present
\r
123 if (available == 0)
\r
125 } catch (IOException e) {
\r
126 throw new MarcException(e.getMessage(), e);
\r
132 * Returns the next record in the iteration.
\r
134 * @return Record - the record object
\r
136 public Record next()
\r
138 record = factory.newRecord();
\r
141 int recordLength = parseRecordLength(leaderBuffer);
\r
142 byte[] recordBuf = new byte[recordLength - 24];
\r
143 input.readFully(recordBuf);
\r
144 parseRecord(record, leaderBuffer, recordBuf, recordLength);
\r
147 catch (EOFException e) {
\r
148 throw new MarcException("Premature end of file encountered", e);
\r
150 catch (IOException e) {
\r
151 throw new MarcException("an error occured reading input", e);
\r
155 private void parseRecord(Record record, byte[] byteArray, byte[] recordBuf, int recordLength)
\r
158 ldr = factory.newLeader();
\r
159 ldr.setRecordLength(recordLength);
\r
160 int directoryLength=0;
\r
163 parseLeader(ldr, byteArray);
\r
164 directoryLength = ldr.getBaseAddressOfData() - (24 + 1);
\r
166 catch (IOException e) {
\r
167 throw new MarcException("error parsing leader with data: "
\r
168 + new String(byteArray), e);
\r
170 catch (MarcException e) {
\r
171 throw new MarcException("error parsing leader with data: "
\r
172 + new String(byteArray), e);
\r
175 // if MARC 21 then check encoding
\r
176 switch (ldr.getCharCodingScheme()) {
\r
179 encoding = "ISO-8859-1";
\r
185 record.setLeader(ldr);
\r
187 if ((directoryLength % 12) != 0)
\r
189 throw new MarcException("invalid directory");
\r
191 DataInputStream inputrec = new DataInputStream(new ByteArrayInputStream(recordBuf));
\r
192 int size = directoryLength / 12;
\r
194 String[] tags = new String[size];
\r
195 int[] lengths = new int[size];
\r
197 byte[] tag = new byte[3];
\r
198 byte[] length = new byte[4];
\r
199 byte[] start = new byte[5];
\r
204 for (int i = 0; i < size; i++)
\r
206 inputrec.readFully(tag);
\r
207 tmp = new String(tag);
\r
210 inputrec.readFully(length);
\r
211 tmp = new String(length);
\r
212 lengths[i] = Integer.parseInt(tmp);
\r
214 inputrec.readFully(start);
\r
217 if (inputrec.read() != Constants.FT)
\r
219 throw new MarcException("expected field terminator at end of directory");
\r
222 for (int i = 0; i < size; i++)
\r
224 //int fieldLength = getFieldLength(inputrec);
\r
225 if (Verifier.isControlField(tags[i]))
\r
227 byteArray = new byte[lengths[i] - 1];
\r
228 inputrec.readFully(byteArray);
\r
230 if (inputrec.read() != Constants.FT)
\r
232 throw new MarcException("expected field terminator at end of field");
\r
235 ControlField field = factory.newControlField();
\r
236 field.setTag(tags[i]);
\r
237 field.setData(getDataAsString(byteArray));
\r
238 record.addVariableField(field);
\r
242 byteArray = new byte[lengths[i]];
\r
243 inputrec.readFully(byteArray);
\r
246 DataField dataField = parseDataField(tags[i], byteArray);
\r
247 // dataField could be null if bad indicators
\r
248 if (dataField != null)
\r
249 record.addVariableField(dataField);
\r
250 } catch (IOException e) {
\r
251 throw new MarcException(
\r
252 "error parsing data field for tag: " + tags[i]
\r
254 + new String(byteArray), e);
\r
259 if (inputrec.read() != Constants.RT)
\r
261 throw new MarcException("expected record terminator");
\r
264 catch (IOException e)
\r
266 throw new MarcException("an error occured reading input", e);
\r
270 private DataField parseDataField(String tag, byte[] field)
\r
271 throws IOException {
\r
272 ByteArrayInputStream bais = new ByteArrayInputStream(field);
\r
273 char ind1 = (char) bais.read();
\r
274 char ind2 = (char) bais.read();
\r
276 DataField dataField = factory.newDataField();
\r
277 dataField.setTag(tag);
\r
279 boolean badIndicatorFound = false;
\r
280 if (setBadIndicators || ind1 >= ' ' )
\r
281 dataField.setIndicator1(ind1);
\r
283 badIndicatorFound = true;
\r
284 if (setBadIndicators || ind2 >= ' ')
\r
285 dataField.setIndicator2(ind2);
\r
287 badIndicatorFound = true;
\r
294 readByte = bais.read();
\r
297 switch (readByte) {
\r
299 code = bais.read();
\r
301 throw new IOException("unexpected end of data field");
\r
302 if (code == Constants.FT)
\r
304 size = getSubfieldLength(bais);
\r
305 data = new byte[size];
\r
307 subfield = factory.newSubfield();
\r
308 subfield.setCode((char) code);
\r
309 subfield.setData(getDataAsString(data));
\r
310 dataField.addSubfield(subfield);
\r
316 /* Bad Indicators was found, so dropping field */
\r
317 if (badIndicatorFound)
\r
322 @SuppressWarnings("unused")
\r
323 private int getFieldLength(DataInputStream bais) throws IOException
\r
328 switch (bais.read()) {
\r
334 throw new IOException("Field not terminated");
\r
342 private int getSubfieldLength(ByteArrayInputStream bais) throws IOException {
\r
346 switch (bais.read()) {
\r
353 throw new IOException("subfield not terminated");
\r
360 private int parseRecordLength(byte[] leaderData) throws IOException {
\r
361 InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(
\r
364 char[] tmp = new char[5];
\r
367 length = Integer.parseInt(new String(tmp));
\r
368 } catch (NumberFormatException e) {
\r
369 throw new MarcException("unable to parse record length", e);
\r
374 private void parseLeader(Leader ldr, byte[] leaderData) throws IOException {
\r
375 InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(
\r
377 char[] tmp = new char[5];
\r
379 // Skip over bytes for record length, If we get here, its already been computed.
\r
380 ldr.setRecordStatus((char) isr.read());
\r
381 ldr.setTypeOfRecord((char) isr.read());
\r
384 ldr.setImplDefined1(tmp);
\r
385 ldr.setCharCodingScheme((char) isr.read());
\r
386 char indicatorCount = (char) isr.read();
\r
387 char subfieldCodeLength = (char) isr.read();
\r
388 char baseAddr[] = new char[5];
\r
389 isr.read(baseAddr);
\r
392 ldr.setImplDefined2(tmp);
\r
395 ldr.setEntryMap(tmp);
\r
398 ldr.setIndicatorCount(Integer.parseInt(String.valueOf(indicatorCount)));
\r
399 } catch (NumberFormatException e) {
\r
400 throw new MarcException("unable to parse indicator count", e);
\r
403 ldr.setSubfieldCodeLength(Integer.parseInt(String
\r
404 .valueOf(subfieldCodeLength)));
\r
405 } catch (NumberFormatException e) {
\r
406 throw new MarcException("unable to parse subfield code length", e);
\r
409 ldr.setBaseAddressOfData(Integer.parseInt(new String(baseAddr)));
\r
410 } catch (NumberFormatException e) {
\r
411 throw new MarcException("unable to parse base address of data", e);
\r
416 private String getDataAsString(byte[] bytes)
\r
418 String dataElement = null;
\r
419 if (encoding.equals("UTF-8") || encoding.equals("UTF8"))
\r
422 dataElement = new String(bytes, "UTF8");
\r
424 catch (UnsupportedEncodingException e) {
\r
425 throw new MarcException("unsupported encoding", e);
\r
428 else if (encoding.equals("MARC-8") || encoding.equals("MARC8"))
\r
430 if (converterAnsel == null) converterAnsel = new AnselToUnicode();
\r
432 for (int index = 0; index < bytes.length; index++)
\r
433 if (bytes[index] < 32)
\r
434 bytes[index] = ' ';
\r
435 dataElement = converterAnsel.convert(bytes);
\r
436 //dataElement = dataElement.replaceAll("\0", " ");
\r
438 else if (encoding.equals("ISO-8859-1") || encoding.equals("ISO8859_1"))
\r
441 dataElement = new String(bytes, "ISO-8859-1");
\r
443 catch (UnsupportedEncodingException e) {
\r
444 throw new MarcException("unsupported encoding", e);
\r
447 return dataElement;
\r
450 public boolean isBadIndicators() {
\r
451 return setBadIndicators;
\r
454 public void setBadIndicators(boolean trueFalse) {
\r
455 this.setBadIndicators = trueFalse;
\r