1 //$Id: MarcXmlWriter.java,v 1.9 2008/10/17 19:11:49 haschart Exp $
\r
3 * Copyright (C) 2004 Bas Peters
\r
5 * This file is part of MARC4J
\r
7 * MARC4J is free software; you can redistribute it and/or
\r
8 * modify it under the terms of the GNU Lesser General Public
\r
9 * License as published by the Free Software Foundation; either
\r
10 * version 2.1 of the License, or (at your option) any later version.
\r
12 * MARC4J is distributed in the hope that it will be useful,
\r
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
\r
15 * Lesser General Public License for more details.
\r
17 * You should have received a copy of the GNU Lesser General Public
\r
18 * License along with MARC4J; if not, write to the Free Software
\r
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
\r
23 import java.io.BufferedWriter;
\r
24 import java.io.IOException;
\r
25 import java.io.OutputStream;
\r
26 import java.io.OutputStreamWriter;
\r
27 import java.io.UnsupportedEncodingException;
\r
28 import java.io.Writer;
\r
29 import java.util.Iterator;
\r
31 import javax.xml.transform.OutputKeys;
\r
32 import javax.xml.transform.Result;
\r
33 import javax.xml.transform.Source;
\r
34 import javax.xml.transform.TransformerFactory;
\r
35 import javax.xml.transform.sax.SAXTransformerFactory;
\r
36 import javax.xml.transform.sax.TransformerHandler;
\r
37 import javax.xml.transform.stream.StreamResult;
\r
38 import javax.xml.transform.stream.StreamSource;
\r
40 import org.marc4j.converter.CharConverter;
\r
41 import org.marc4j.marc.ControlField;
\r
42 import org.marc4j.marc.DataField;
\r
43 import org.marc4j.marc.Leader;
\r
44 import org.marc4j.marc.Record;
\r
45 import org.marc4j.marc.Subfield;
\r
46 import org.xml.sax.SAXException;
\r
47 import org.xml.sax.helpers.AttributesImpl;
\r
49 import com.ibm.icu.text.Normalizer;
\r
52 * Class for writing MARC record objects in Turbo MARCXML format. This class outputs a
\r
53 * SAX event stream to the given {@link java.io.OutputStream} or
\r
54 * {@link javax.xml.transform.Result} object. It can be used in a SAX
\r
55 * pipeline to postprocess the result. By default this class uses a null
\r
56 * transform. It is strongly recommended to use a dedicated XML serializer.
\r
59 * This class requires a JAXP compliant XML parser and XSLT processor. The
\r
60 * underlying SAX2 parser should be namespace aware. In addition this class
\r
61 * requires <a href="http://icu.sourceforge.net/">ICU4J </a> to perform Unicode
\r
62 * normalization. A stripped down version of 2.6 originating from the <a
\r
63 * href="http://www.cafeconleche.org/XOM/">XOM </a> project is included in this
\r
67 * The following example reads a file with MARC records and writes MARCXML
\r
68 * records in UTF-8 encoding to the console:
\r
73 * InputStream input = new FileInputStream("input.mrc");
\r
74 * MarcReader reader = new MarcStreamReader(input);
\r
76 * MarcWriter writer = new MarcXmlWriter(System.out, true);
\r
77 * while (reader.hasNext()) {
\r
78 * Record record = reader.next();
\r
79 * writer.write(record);
\r
86 * To perform a character conversion like MARC-8 to UCS/Unicode register a
\r
87 * <code>CharConverter</code>:
\r
91 * writer.setConverter(new AnselToUnicode());
\r
95 * In addition you can perform Unicode normalization. This is for example not
\r
96 * done by the MARC-8 to UCS/Unicode converter. With Unicode normalization text
\r
97 * is transformed into the canonical composed form. For example "a´bc"
\r
98 * is normalized to "ábc". To perform normalization set Unicode
\r
99 * normalization to true:
\r
103 * writer.setUnicodeNormalization(true);
\r
107 * Please note that it's not guaranteed to work if you try to convert normalized
\r
108 * Unicode back to MARC-8 encoding using
\r
109 * {@link org.marc4j.converter.impl.UnicodeToAnsel}.
\r
112 * This class provides very basic formatting options. For more advanced options
\r
113 * create an instance of this class with a
\r
114 * {@link javax.xml.transform.sax.SAXResult} containing a
\r
115 * {@link org.xml.sax.ContentHandler} derived from a dedicated XML
\r
120 * The following example uses
\r
121 * <code>org.apache.xml.serialize.XMLSerializer</code> to write MARC records
\r
122 * to XML using MARC-8 to UCS/Unicode conversion and Unicode normalization:
\r
127 * InputStream input = new FileInputStream("input.mrc");
\r
128 * MarcReader reader = new MarcStreamReader(input);
\r
130 * OutputFormat format = new OutputFormat("xml","UTF-8", true);
\r
131 * OutputStream out = new FileOutputStream("output.xml");
\r
132 * XMLSerializer serializer = new XMLSerializer(out, format);
\r
133 * Result result = new SAXResult(serializer.asContentHandler());
\r
135 * MarcXmlWriter writer = new MarcXmlWriter(result);
\r
136 * writer.setConverter(new AnselToUnicode());
\r
137 * while (reader.hasNext()) {
\r
138 * Record record = reader.next();
\r
139 * writer.write(record);
\r
146 * You can post-process the result using a <code>Source</code> object pointing
\r
147 * to a stylesheet resource and a <code>Result</code> object to hold the
\r
148 * transformation result tree. The example below converts MARC to MARCXML and
\r
149 * transforms the result tree to MODS using the stylesheet provided by The
\r
150 * Library of Congress:
\r
155 * String stylesheetUrl = "http://www.loc.gov/standards/mods/v3/MARC21slim2MODS3.xsl";
\r
156 * Source stylesheet = new StreamSource(stylesheetUrl);
\r
158 * Result result = new StreamResult(System.out);
\r
160 * InputStream input = new FileInputStream("input.mrc");
\r
161 * MarcReader reader = new MarcStreamReader(input);
\r
162 * MarcXmlWriter writer = new MarcXmlWriter(result, stylesheet);
\r
163 * writer.setConverter(new AnselToUnicode());
\r
164 * while (reader.hasNext()) {
\r
165 * Record record = (Record) reader.next();
\r
166 * writer.write(record);
\r
173 * It is also possible to write the result into a DOM Node:
\r
178 * InputStream input = new FileInputStream("input.mrc");
\r
179 * MarcReader reader = new MarcStreamReader(input);
\r
180 * DOMResult result = new DOMResult();
\r
181 * MarcXmlWriter writer = new MarcXmlWriter(result);
\r
182 * writer.setConverter(new AnselToUnicode());
\r
183 * while (reader.hasNext()) {
\r
184 * Record record = (Record) reader.next();
\r
185 * writer.write(record);
\r
189 * Document doc = (Document) result.getNode();
\r
193 * @author Bas Peters
\r
194 * @version $Revision: 1.9 $
\r
197 public class TurboMarcXmlWriter implements MarcWriter {
\r
// Namespace prefix, including the trailing colon, that is prepended to local names to form qualified names.
199 protected static final String prefix = "tmarc:";
\r
// Name stem for control field elements; toXml appends the field tag to it.
201 protected static final String CONTROL_FIELD = "c";
\r
202 protected static final String Q_CONTROL_FIELD = prefix + CONTROL_FIELD;
\r
// Name stem for data field elements; toXml appends the field tag to it.
204 protected static final String DATA_FIELD = "d";
\r
205 protected static final String Q_DATA_FIELD = prefix + DATA_FIELD;
\r
// Name stem for subfield elements; toXml appends the subfield code when it is a letter or digit.
207 protected static final String SUBFIELD = "s";
\r
208 protected static final String Q_SUBFIELD = prefix + SUBFIELD;
\r
// Local and qualified name of the root element written by writeStartDocument/writeEndDocument.
210 protected static final String COLLECTION = "collection";
\r
211 protected static final String Q_COLLECTION = prefix + COLLECTION;
\r
// Local and qualified name of the element that wraps one record.
213 protected static final String RECORD = "r";
\r
214 protected static final String Q_RECORD = prefix + RECORD;
\r
// Local and qualified name of the leader element.
216 protected static final String LEADER = "l";
\r
217 protected static final String Q_LEADER = prefix + LEADER;
\r
// Indentation flag, read through hasIndent() and changed through setIndent(boolean).
219 private boolean indent = false;
\r
// SAX handler that receives every event; assigned in setHandler.
221 private TransformerHandler handler = null;
\r
// Set only by the OutputStream-based constructor; close() checks it for null.
223 private Writer writer = null;
\r
227 * Character encoding. Default is UTF-8.
\r
229 //private String encoding = "UTF8";
\r
// Optional character converter applied to field data in getDataElement; null means none registered.
231 private CharConverter converter = null;
\r
// Unicode normalization flag, changed through setUnicodeNormalization(boolean).
233 private boolean normalize = false;
\r
236 * Constructs an instance with the specified output stream.
\r
238 * The default character encoding for UTF-8 is used.
\r
240 * @throws MarcException
\r
// Constructor taking only the output stream.
// NOTE(review): the body is not visible in this listing; presumably it delegates to
// another constructor with indentation switched off - confirm against the full file.
242 public TurboMarcXmlWriter(OutputStream out) {
\r
/**
 * Creates a writer on the given output stream with indentation switched on or off.
 * <p>
 * Delegates to the three-argument constructor using the encoding name {@code "UTF8"}.
 *
 * @param out
 *            the stream that receives the XML output
 * @param indent
 *            the indentation setting passed on to the three-argument constructor
 * @throws MarcException
 *             if the three-argument constructor fails to set up the writer
 */
public TurboMarcXmlWriter(OutputStream out, boolean indent) {
    this(out, "UTF8", indent);
}
\r
/**
 * Creates a writer on the given output stream using the given character encoding.
 * <p>
 * Delegates to the three-argument constructor with the indentation argument set to
 * {@code false}.
 *
 * @param out
 *            the stream that receives the XML output
 * @param encoding
 *            the name of the character encoding used for the output
 * @throws MarcException
 *             if the three-argument constructor fails to set up the writer
 */
public TurboMarcXmlWriter(OutputStream out, String encoding) {
    this(out, encoding, false);
}
\r
268 * Constructs an instance with the specified output stream, character
\r
269 * encoding and indentation.
\r
271 * @throws MarcException
\r
// Stream constructor that the shorter OutputStream-based constructors delegate to.
// NOTE(review): no visible line of this listing reads the indent parameter; the statement
// that applies it is presumably among the lines missing here - confirm.
273 public TurboMarcXmlWriter(OutputStream out, String encoding, boolean indent) {
\r
// NOTE(review): the condition guarding this throw is not visible; presumably a null check on out.
275 throw new NullPointerException("null OutputStream");
\r
277 if (encoding == null) {
\r
278 throw new NullPointerException("null encoding");
\r
// Wrap the stream in a writer for the requested encoding, then buffer that writer.
282 writer = new OutputStreamWriter(out, encoding);
\r
283 writer = new BufferedWriter(writer);
\r
284 // this.encoding = encoding;
\r
// No stylesheet is passed: the handler is set up to write into the buffered writer.
285 setHandler(new StreamResult(writer), null);
\r
// An unsupported encoding name is rethrown as MarcException with the original exception as cause.
286 } catch (UnsupportedEncodingException e) {
\r
287 throw new MarcException(e.getMessage(), e);
\r
// The document start and the root start tag are written as part of construction.
289 writeStartDocument();
\r
/**
 * Creates a writer that sends its SAX events to the given result, without a stylesheet.
 * <p>
 * The handler is set up and the root start tag is written as part of construction.
 *
 * @param result
 *            the result that receives the output
 * @throws NullPointerException
 *             if {@code result} is {@code null}
 * @throws MarcException
 *             if the handler cannot be set up or the root start tag cannot be written
 */
public TurboMarcXmlWriter(Result result) {
    if (null == result) {
        throw new NullPointerException("null Result");
    }
    setHandler(result, null);
    writeStartDocument();
}
\r
/**
 * Creates a writer that post-processes its output with the stylesheet found at the given
 * location and sends the outcome to the given result.
 * <p>
 * The location is wrapped in a {@link StreamSource} and handed to the constructor that
 * takes a {@link Source}.
 *
 * @param result
 *            the result that receives the transformed output
 * @param stylesheetUrl
 *            the location of the stylesheet
 * @throws MarcException
 *             if the delegated constructor fails to set up the writer
 */
public TurboMarcXmlWriter(Result result, String stylesheetUrl) {
    this(result, new StreamSource(stylesheetUrl));
}
\r
/**
 * Creates a writer that post-processes its output with the given stylesheet and sends the
 * outcome to the given result.
 * <p>
 * The stylesheet is checked before the result, so when both are {@code null} the exception
 * message names the source. The handler is then set up and the root start tag written.
 *
 * @param result
 *            the result that receives the transformed output
 * @param stylesheet
 *            the stylesheet source
 * @throws NullPointerException
 *             if {@code stylesheet} or {@code result} is {@code null}
 * @throws MarcException
 *             if the handler cannot be set up or the root start tag cannot be written
 */
public TurboMarcXmlWriter(Result result, Source stylesheet) {
    if (null == stylesheet) {
        throw new NullPointerException("null Source");
    }
    if (null == result) {
        throw new NullPointerException("null Result");
    }
    setHandler(result, stylesheet);
    writeStartDocument();
}
\r
330 public void close() {
\r
331 writeEndDocument();
\r
333 if (writer != null)
\r
335 } catch (IOException e) {
\r
336 throw new MarcException(e.getMessage(), e);
\r
341 * Returns the character converter.
\r
343 * @return CharConverter the character converter
\r
// Accessor declared to return a CharConverter.
// NOTE(review): the body is not visible in this listing; presumably it returns the converter field - confirm.
345 public CharConverter getConverter() {
\r
350 * Sets the character converter.
\r
353 * the character converter
\r
355 public void setConverter(CharConverter converter) {
\r
356 this.converter = converter;
\r
360 * If set to true this writer will perform Unicode normalization on data
\r
361 * elements using normalization form C (NFC). The default is false.
\r
363 * The implementation used is ICU4J 2.6. This version is based on Unicode
\r
367 * true if this writer performs Unicode normalization, false
\r
370 public void setUnicodeNormalization(boolean normalize) {
\r
371 this.normalize = normalize;
\r
375 * Returns true if this writer will perform Unicode normalization, false
\r
378 * @return boolean - true if this writer performs Unicode normalization,
\r
// Accessor for the Unicode normalization setting.
// NOTE(review): the body is not visible in this listing; presumably it returns the normalize field - confirm.
381 public boolean getUnicodeNormalization() {
\r
385 protected void setHandler(Result result, Source stylesheet)
\r
386 throws MarcException {
\r
388 TransformerFactory factory = TransformerFactory.newInstance();
\r
389 if (!factory.getFeature(SAXTransformerFactory.FEATURE))
\r
390 throw new UnsupportedOperationException(
\r
391 "SAXTransformerFactory is not supported");
\r
393 SAXTransformerFactory saxFactory = (SAXTransformerFactory) factory;
\r
394 if (stylesheet == null)
\r
395 handler = saxFactory.newTransformerHandler();
\r
397 handler = saxFactory.newTransformerHandler(stylesheet);
\r
398 handler.getTransformer()
\r
399 .setOutputProperty(OutputKeys.METHOD, "xml");
\r
400 handler.setResult(result);
\r
402 } catch (Exception e) {
\r
403 throw new MarcException(e.getMessage(), e);
\r
408 * Writes the root start tag to the result.
\r
410 * @throws SAXException
\r
412 protected void writeStartDocument() {
\r
414 AttributesImpl atts = new AttributesImpl();
\r
415 handler.startDocument();
\r
416 // The next line duplicates the namespace declaration for Marc XML
\r
417 handler.startPrefixMapping("tmarc", Constants.TURBO_MARCXML_NS_URI);
\r
418 // add namespace declaration using attribute - need better solution
\r
419 atts.addAttribute(Constants.TURBO_MARCXML_NS_URI, "xmlns", "xmlns:tmarc",
\r
420 "CDATA", Constants.TURBO_MARCXML_NS_URI);
\r
421 handler.startElement(Constants.TURBO_MARCXML_NS_URI, COLLECTION, Q_COLLECTION, atts);
\r
422 } catch (SAXException e) {
\r
423 throw new MarcException(
\r
424 "SAX error occured while writing start document", e);
\r
429 * Writes the root end tag to the result.
\r
431 * @throws SAXException
\r
433 protected void writeEndDocument() {
\r
436 handler.ignorableWhitespace("\n".toCharArray(), 0, 1);
\r
439 .endElement(Constants.TURBO_MARCXML_NS_URI, COLLECTION,
\r
441 handler.endPrefixMapping("");
\r
442 handler.endDocument();
\r
443 } catch (SAXException e) {
\r
444 throw new MarcException(
\r
445 "SAX error occured while writing end document", e);
\r
450 * Writes a Record object to the result.
\r
453 * the <code>Record</code> object
\r
454 * @throws SAXException
\r
456 public void write(Record record) {
\r
459 } catch (SAXException e) {
\r
460 throw new MarcException("SAX error occured while writing record", e);
\r
465 * Returns true if indentation is active, false otherwise.
\r
// Accessor for the indentation setting.
// NOTE(review): the body is not visible in this listing; presumably it returns the indent field - confirm.
469 public boolean hasIndent() {
\r
474 * Activates or deactivates indentation. Default value is false.
\r
478 public void setIndent(boolean indent) {
\r
479 this.indent = indent;
\r
// Emits one record as SAX events on the handler: a record element that contains the
// leader, then one element per control field, then one element per data field with its
// subfields nested inside. Element names live in the Turbo MARCXML namespace.
// NOTE(review): several short lines are missing from this listing (the declaration of
// temp, closing braces, and presumably an indent check in front of each
// ignorableWhitespace call) - confirm against the full file before changing any logic.
// NOTE(review): the whitespace literals as they appear here are shorter than the length
// argument passed with them; presumably runs of spaces were collapsed in this listing - confirm.
482 protected void toXml(Record record) throws SAXException {
\r
484 AttributesImpl atts = new AttributesImpl();
\r
486 handler.ignorableWhitespace("\n ".toCharArray(), 0, 3);
\r
// Open the record element; atts is still empty at this point.
488 handler.startElement(Constants.TURBO_MARCXML_NS_URI, RECORD, Q_RECORD, atts);
\r
491 handler.ignorableWhitespace("\n ".toCharArray(), 0, 5);
\r
// Leader: written from leader.toString() directly, without going through getDataElement.
493 handler.startElement(Constants.TURBO_MARCXML_NS_URI, LEADER, Q_LEADER, atts);
\r
494 Leader leader = record.getLeader();
\r
495 temp = leader.toString().toCharArray();
\r
496 handler.characters(temp, 0, temp.length);
\r
497 handler.endElement(Constants.TURBO_MARCXML_NS_URI, LEADER, Q_LEADER);
\r
// Control fields: the tag becomes part of the element name instead of a tag attribute.
499 Iterator<ControlField> ci = record.getControlFields().iterator();
\r
500 while (ci.hasNext()) {
\r
501 ControlField field = (ControlField) ci.next();
\r
502 atts = new AttributesImpl();
\r
503 //atts.addAttribute("", "tag", "tag", "CDATA", field.getTag());
\r
506 handler.ignorableWhitespace("\n ".toCharArray(), 0, 5);
\r
507 String elementName = CONTROL_FIELD + field.getTag();
\r
508 String qElementName = prefix + elementName;
\r
509 handler.startElement(Constants.TURBO_MARCXML_NS_URI, elementName, qElementName, atts);
\r
// Field data passes through getDataElement before it is written.
510 temp = getDataElement(field.getData());
\r
511 handler.characters(temp, 0, temp.length);
\r
512 handler.endElement(Constants.TURBO_MARCXML_NS_URI, elementName, qElementName);
\r
// Data fields: the tag becomes part of the element name; both indicators are attributes.
515 Iterator<DataField> di = record.getDataFields().iterator();
\r
516 while (di.hasNext()) {
\r
517 DataField field = di.next();
\r
518 atts = new AttributesImpl();
\r
519 // atts.addAttribute("", "tag", "tag", "CDATA", field.getTag());
\r
520 atts.addAttribute("", "ind1", "ind1", "CDATA", String.valueOf(field
\r
521 .getIndicator1()));
\r
522 atts.addAttribute("", "ind2", "ind2", "CDATA", String.valueOf(field
\r
523 .getIndicator2()));
\r
526 handler.ignorableWhitespace("\n ".toCharArray(), 0, 5);
\r
// Build the local name (DATA_FIELD + tag) and the qualified name (prefix + local name).
527 StringBuffer elementName = new StringBuffer(DATA_FIELD);
\r
528 StringBuffer qElementName = new StringBuffer(prefix);
\r
529 elementName.append(field.getTag());
\r
530 qElementName.append(elementName);
\r
531 handler.startElement(Constants.TURBO_MARCXML_NS_URI, elementName.toString(),
\r
532 qElementName.toString(), atts);
\r
// Subfields of the current data field.
534 Iterator<Subfield> si = field.getSubfields().iterator();
\r
535 while (si.hasNext()) {
\r
536 Subfield subfield = (Subfield) si.next();
\r
537 StringBuffer subfieldName = new StringBuffer(SUBFIELD);
\r
538 StringBuffer qSubfieldName = new StringBuffer(prefix);
\r
539 qSubfieldName.append(subfieldName);
\r
541 char code = subfield.getCode();
\r
542 // if [a-zA-Z0-9] append to elementName, otherwise use a attribute
\r
543 if (code >= '0' && code <= '9' ||
\r
544 code >= 'a' && code <= 'z' ||
\r
545 code >= 'A' && code <= 'Z') {
\r
546 subfieldName.append(code);
\r
547 qSubfieldName.append(code);
\r
// NOTE(review): the lines between the append above and this reset are missing from the
// listing; presumably the reset and the code attribute belong to an else branch. If so,
// a subfield with a letter or digit code is started with whatever atts held before it
// (the indicators of the data field, or the code attribute of an earlier subfield) - confirm.
550 atts = new AttributesImpl();
\r
551 atts.addAttribute("", "code", "code", "CDATA", String
\r
552 .valueOf(subfield.getCode()));
\r
555 handler.ignorableWhitespace("\n ".toCharArray(), 0, 7);
\r
557 handler.startElement(Constants.TURBO_MARCXML_NS_URI, subfieldName.toString(),
\r
558 qSubfieldName.toString(), atts);
\r
// Subfield data passes through getDataElement before it is written.
559 temp = getDataElement(subfield.getData());
\r
560 handler.characters(temp, 0, temp.length);
\r
562 .endElement(Constants.TURBO_MARCXML_NS_URI, subfieldName.toString(),
\r
563 qSubfieldName.toString());
\r
567 handler.ignorableWhitespace("\n ".toCharArray(), 0, 5);
\r
570 .endElement(Constants.TURBO_MARCXML_NS_URI,
\r
571 elementName.toString(), qElementName.toString());
\r
575 handler.ignorableWhitespace("\n ".toCharArray(), 0, 3);
\r
// Close the record element opened at the top of the method.
577 handler.endElement(Constants.TURBO_MARCXML_NS_URI, RECORD, Q_RECORD);
\r
580 protected char[] getDataElement(String data) {
\r
581 String dataElement = null;
\r
582 if (converter == null)
\r
583 return data.toCharArray();
\r
584 dataElement = converter.convert(data);
\r
586 dataElement = Normalizer.normalize(dataElement, Normalizer.NFC);
\r
587 return dataElement.toCharArray();
\r