1 //$Id: MarcXmlWriter.java,v 1.9 2008/10/17 19:11:49 haschart Exp $
\r
3 * Copyright (C) 2004 Bas Peters
\r
5 * This file is part of MARC4J
\r
7 * MARC4J is free software; you can redistribute it and/or
\r
8 * modify it under the terms of the GNU Lesser General Public
\r
9 * License as published by the Free Software Foundation; either
\r
10 * version 2.1 of the License, or (at your option) any later version.
\r
12 * MARC4J is distributed in the hope that it will be useful,
\r
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
\r
15 * Lesser General Public License for more details.
\r
17 * You should have received a copy of the GNU Lesser General Public
\r
18 * License along with MARC4J; if not, write to the Free Software
\r
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
\r
23 import java.io.BufferedWriter;
\r
24 import java.io.IOException;
\r
25 import java.io.OutputStream;
\r
26 import java.io.OutputStreamWriter;
\r
27 import java.io.UnsupportedEncodingException;
\r
28 import java.io.Writer;
\r
29 import java.util.Iterator;
\r
31 import javax.xml.transform.OutputKeys;
\r
32 import javax.xml.transform.Result;
\r
33 import javax.xml.transform.Source;
\r
34 import javax.xml.transform.TransformerFactory;
\r
35 import javax.xml.transform.sax.SAXTransformerFactory;
\r
36 import javax.xml.transform.sax.TransformerHandler;
\r
37 import javax.xml.transform.stream.StreamResult;
\r
38 import javax.xml.transform.stream.StreamSource;
\r
40 import org.marc4j.converter.CharConverter;
\r
41 import org.marc4j.marc.ControlField;
\r
42 import org.marc4j.marc.DataField;
\r
43 import org.marc4j.marc.Leader;
\r
44 import org.marc4j.marc.Record;
\r
45 import org.marc4j.marc.Subfield;
\r
46 import org.xml.sax.SAXException;
\r
47 import org.xml.sax.helpers.AttributesImpl;
\r
49 import com.ibm.icu.text.Normalizer;
\r
52 * Class for writing MARC record objects in MARCXML format. This class outputs a
\r
53 * SAX event stream to the given {@link java.io.OutputStream} or
\r
54 * {@link javax.xml.transform.Result} object. It can be used in a SAX
\r
55 * pipeline to postprocess the result. By default this class uses a null
\r
56 * transform. It is strongly recommended to use a dedicated XML serializer.
\r
59 * This class requires a JAXP compliant XML parser and XSLT processor. The
\r
60 * underlying SAX2 parser should be namespace aware. In addition this class
\r
61 * requires <a href="http://icu.sourceforge.net/">ICU4J </a> to perform Unicode
\r
62 * normalization. A stripped down version of 2.6 originating from the <a
\r
63 * href="http://www.cafeconleche.org/XOM/">XOM </a> project is included in this
\r
67 * The following example reads a file with MARC records and writes MARCXML
\r
68 * records in UTF-8 encoding to the console:
\r
73 * InputStream input = new FileInputStream("input.mrc");
\r
74 * MarcReader reader = new MarcStreamReader(input);
\r
76 * MarcWriter writer = new MarcXmlWriter(System.out, true);
\r
77 * while (reader.hasNext()) {
\r
78 * Record record = reader.next();
\r
79 * writer.write(record);
\r
86 * To perform a character conversion like MARC-8 to UCS/Unicode register a
\r
87 * <code>CharConverter</code>:
\r
91 * writer.setConverter(new AnselToUnicode());
\r
95 * In addition you can perform Unicode normalization. This is for example not
\r
96 * done by the MARC-8 to UCS/Unicode converter. With Unicode normalization text
\r
97 * is transformed into the canonical composed form. For example "a´bc"
\r
98 * is normalized to "ábc". To perform normalization set Unicode
\r
99 * normalization to true:
\r
103 * writer.setUnicodeNormalization(true);
\r
107 * Please note that it's not guaranteed to work if you try to convert normalized
\r
108 * Unicode back to MARC-8 encoding using
\r
109 * {@link org.marc4j.converter.impl.UnicodeToAnsel}.
\r
112 * This class provides very basic formatting options. For more advanced options
\r
113 * create an instance of this class with a
\r
114 * {@link javax.xml.transform.sax.SAXResult} containing a
\r
115 * {@link org.xml.sax.ContentHandler} derived from a dedicated XML
\r
120 * The following example uses
\r
121 * <code>org.apache.xml.serialize.XMLSerializer</code> to write MARC records
\r
122 * to XML using MARC-8 to UCS/Unicode conversion and Unicode normalization:
\r
127 * InputStream input = new FileInputStream("input.mrc");
\r
128 * MarcReader reader = new MarcStreamReader(input);
\r
130 * OutputFormat format = new OutputFormat("xml","UTF-8", true);
\r
131 * OutputStream out = new FileOutputStream("output.xml");
\r
132 * XMLSerializer serializer = new XMLSerializer(out, format);
\r
133 * Result result = new SAXResult(serializer.asContentHandler());
\r
135 * MarcXmlWriter writer = new MarcXmlWriter(result);
\r
136 * writer.setConverter(new AnselToUnicode());
\r
137 * while (reader.hasNext()) {
\r
138 * Record record = reader.next();
\r
139 * writer.write(record);
\r
146 * You can post-process the result using a <code>Source</code> object pointing
\r
147 * to a stylesheet resource and a <code>Result</code> object to hold the
\r
148 * transformation result tree. The example below converts MARC to MARCXML and
\r
149 * transforms the result tree to MODS using the stylesheet provided by The
\r
150 * Library of Congress:
\r
155 * String stylesheetUrl = "http://www.loc.gov/standards/mods/v3/MARC21slim2MODS3.xsl";
\r
156 * Source stylesheet = new StreamSource(stylesheetUrl);
\r
158 * Result result = new StreamResult(System.out);
\r
160 * InputStream input = new FileInputStream("input.mrc");
\r
161 * MarcReader reader = new MarcStreamReader(input);
\r
162 * MarcXmlWriter writer = new MarcXmlWriter(result, stylesheet);
\r
163 * writer.setConverter(new AnselToUnicode());
\r
164 * while (reader.hasNext()) {
\r
165 * Record record = (Record) reader.next();
\r
166 * writer.write(record);
\r
173 * It is also possible to write the result into a DOM Node:
\r
178 * InputStream input = new FileInputStream("input.mrc");
\r
179 * MarcReader reader = new MarcStreamReader(input);
\r
180 * DOMResult result = new DOMResult();
\r
181 * MarcXmlWriter writer = new MarcXmlWriter(result);
\r
182 * writer.setConverter(new AnselToUnicode());
\r
183 * while (reader.hasNext()) {
\r
184 * Record record = (Record) reader.next();
\r
185 * writer.write(record);
\r
189 * Document doc = (Document) result.getNode();
\r
193 * @author Bas Peters
\r
194 * @version $Revision: 1.9 $
\r
197 public class MarcXmlWriter implements MarcWriter {
\r
199 protected static final String prefix = "marc:";
\r
200 protected static final String CONTROL_FIELD = "controlfield";
\r
201 protected static final String Q_CONTROL_FIELD = prefix + "controlfield";
\r
203 protected static final String DATA_FIELD = "datafield";
\r
204 protected static final String Q_DATA_FIELD = prefix + "datafield";
\r
207 protected static final String SUBFIELD = "subfield";
\r
208 protected static final String Q_SUBFIELD = prefix + "subfield";
\r
210 protected static final String COLLECTION = "collection";
\r
211 protected static final String Q_COLLECTION = prefix + "collection";
\r
213 protected static final String RECORD = "record";
\r
214 protected static final String Q_RECORD = prefix + "record";
\r
216 protected static final String LEADER = "leader";
\r
217 protected static final String Q_LEADER = prefix + "leader";
\r
219 private boolean indent = false;
\r
221 private TransformerHandler handler = null;
\r
223 private Writer writer = null;
\r
227 * Character encoding. Default is UTF-8.
\r
229 //private String encoding = "UTF8";
\r
231 private CharConverter converter = null;
\r
233 private boolean normalize = false;
\r
236 * Constructs an instance with the specified output stream.
\r
238 * The default character encoding for UTF-8 is used.
\r
240 * @throws MarcException
\r
242 public MarcXmlWriter(OutputStream out) {
\r
247 * Constructs an instance with the specified output stream and indentation.
\r
249 * The default character encoding for UTF-8 is used.
\r
251 * @throws MarcException
\r
253 public MarcXmlWriter(OutputStream out, boolean indent) {
\r
254 this(out, "UTF8", indent);
\r
258 * Constructs an instance with the specified output stream and character
\r
261 * @throws MarcException
\r
263 public MarcXmlWriter(OutputStream out, String encoding) {
\r
264 this(out, encoding, false);
\r
268 * Constructs an instance with the specified output stream, character
\r
269 * encoding and indentation.
\r
271 * @throws MarcException
\r
273 public MarcXmlWriter(OutputStream out, String encoding, boolean indent) {
\r
275 throw new NullPointerException("null OutputStream");
\r
277 if (encoding == null) {
\r
278 throw new NullPointerException("null encoding");
\r
282 writer = new OutputStreamWriter(out, encoding);
\r
283 writer = new BufferedWriter(writer);
\r
284 // this.encoding = encoding;
\r
285 setHandler(new StreamResult(writer), null);
\r
286 } catch (UnsupportedEncodingException e) {
\r
287 throw new MarcException(e.getMessage(), e);
\r
289 writeStartDocument();
\r
293 * Constructs an instance with the specified result.
\r
296 * @throws SAXException
\r
298 public MarcXmlWriter(Result result) {
\r
299 if (result == null)
\r
300 throw new NullPointerException("null Result");
\r
301 setHandler(result, null);
\r
302 writeStartDocument();
\r
306 * Constructs an instance with the specified stylesheet location and result.
\r
309 * @throws SAXException
\r
311 public MarcXmlWriter(Result result, String stylesheetUrl) {
\r
312 this(result, new StreamSource(stylesheetUrl));
\r
316 * Constructs an instance with the specified stylesheet source and result.
\r
319 * @throws SAXException
\r
321 public MarcXmlWriter(Result result, Source stylesheet) {
\r
322 if (stylesheet == null)
\r
323 throw new NullPointerException("null Source");
\r
324 if (result == null)
\r
325 throw new NullPointerException("null Result");
\r
326 setHandler(result, stylesheet);
\r
327 writeStartDocument();
\r
330 public void close() {
\r
331 writeEndDocument();
\r
333 if (writer != null)
\r
335 } catch (IOException e) {
\r
336 throw new MarcException(e.getMessage(), e);
\r
341 * Returns the character converter.
\r
343 * @return CharConverter the character converter
\r
345 public CharConverter getConverter() {
\r
350 * Sets the character converter.
\r
353 * the character converter
\r
355 public void setConverter(CharConverter converter) {
\r
356 this.converter = converter;
\r
360 * If set to true this writer will perform Unicode normalization on data
\r
361 * elements using normalization form C (NFC). The default is false.
\r
363 * The implementation used is ICU4J 2.6. This version is based on Unicode
\r
367 * true if this writer performs Unicode normalization, false
\r
370 public void setUnicodeNormalization(boolean normalize) {
\r
371 this.normalize = normalize;
\r
375 * Returns true if this writer will perform Unicode normalization, false
\r
378 * @return boolean - true if this writer performs Unicode normalization,
\r
381 public boolean getUnicodeNormalization() {
\r
385 protected void setHandler(Result result, Source stylesheet)
\r
386 throws MarcException {
\r
388 TransformerFactory factory = TransformerFactory.newInstance();
\r
389 if (!factory.getFeature(SAXTransformerFactory.FEATURE))
\r
390 throw new UnsupportedOperationException(
\r
391 "SAXTransformerFactory is not supported");
\r
393 SAXTransformerFactory saxFactory = (SAXTransformerFactory) factory;
\r
394 //saxFactory.setFeature("http://xml.org/sax/features/namespaces", false);
\r
395 if (stylesheet == null)
\r
396 handler = saxFactory.newTransformerHandler();
\r
398 handler = saxFactory.newTransformerHandler(stylesheet);
\r
399 handler.getTransformer()
\r
400 .setOutputProperty(OutputKeys.METHOD, "xml");
\r
401 handler.setResult(result);
\r
403 } catch (Exception e) {
\r
404 throw new MarcException(e.getMessage(), e);
\r
409 * Writes the root start tag to the result.
\r
411 * @throws SAXException
\r
413 protected void writeStartDocument() {
\r
415 AttributesImpl atts = new AttributesImpl();
\r
416 handler.startDocument();
\r
417 // The next line duplicates the namespace declaration for Marc XML
\r
418 handler.startPrefixMapping("marc", Constants.MARCXML_NS_URI);
\r
419 // add namespace declaration using attribute - need better solution
\r
420 atts.addAttribute(Constants.MARCXML_NS_URI, "xmlns", "xmlns:marc",
\r
421 "CDATA", Constants.MARCXML_NS_URI);
\r
422 handler.startElement(Constants.MARCXML_NS_URI, COLLECTION, Q_COLLECTION, atts);
\r
423 } catch (SAXException e) {
\r
424 throw new MarcException(
\r
425 "SAX error occured while writing start document", e);
\r
430 * Writes the root end tag to the result.
\r
432 * @throws SAXException
\r
434 protected void writeEndDocument() {
\r
437 handler.ignorableWhitespace("\n".toCharArray(), 0, 1);
\r
440 .endElement(Constants.MARCXML_NS_URI, COLLECTION,
\r
442 handler.endPrefixMapping("");
\r
443 handler.endDocument();
\r
444 } catch (SAXException e) {
\r
445 throw new MarcException(
\r
446 "SAX error occured while writing end document", e);
\r
451 * Writes a Record object to the result.
\r
454 * the <code>Record</code> object
\r
455 * @throws SAXException
\r
457 public void write(Record record) {
\r
460 } catch (SAXException e) {
\r
461 throw new MarcException("SAX error occured while writing record", e);
\r
466 * Returns true if indentation is active, false otherwise.
\r
470 public boolean hasIndent() {
\r
475 * Activates or deactivates indentation. Default value is false.
\r
479 public void setIndent(boolean indent) {
\r
480 this.indent = indent;
\r
483 protected void toXml(Record record) throws SAXException {
\r
485 AttributesImpl atts = new AttributesImpl();
\r
487 handler.ignorableWhitespace("\n ".toCharArray(), 0, 3);
\r
489 handler.startElement(Constants.MARCXML_NS_URI, RECORD, Q_RECORD, atts);
\r
492 handler.ignorableWhitespace("\n ".toCharArray(), 0, 5);
\r
494 handler.startElement(Constants.MARCXML_NS_URI, LEADER, Q_LEADER, atts);
\r
495 Leader leader = record.getLeader();
\r
496 temp = leader.toString().toCharArray();
\r
497 handler.characters(temp, 0, temp.length);
\r
498 handler.endElement(Constants.MARCXML_NS_URI, LEADER, Q_LEADER);
\r
500 Iterator<ControlField> ci = record.getControlFields().iterator();
\r
501 while (ci.hasNext()) {
\r
502 ControlField field = (ControlField) ci.next();
\r
503 atts = new AttributesImpl();
\r
504 atts.addAttribute("", "tag", "tag", "CDATA", field.getTag());
\r
507 handler.ignorableWhitespace("\n ".toCharArray(), 0, 5);
\r
509 handler.startElement(Constants.MARCXML_NS_URI, CONTROL_FIELD,
\r
510 Q_CONTROL_FIELD, atts);
\r
511 temp = getDataElement(field.getData());
\r
512 handler.characters(temp, 0, temp.length);
\r
513 handler.endElement(Constants.MARCXML_NS_URI, CONTROL_FIELD,
\r
517 Iterator<DataField> di = record.getDataFields().iterator();
\r
518 while (di.hasNext()) {
\r
519 DataField field = di.next();
\r
520 atts = new AttributesImpl();
\r
521 atts.addAttribute("", "tag", "tag", "CDATA", field.getTag());
\r
522 atts.addAttribute("", "ind1", "ind1", "CDATA", String.valueOf(field
\r
523 .getIndicator1()));
\r
524 atts.addAttribute("", "ind2", "ind2", "CDATA", String.valueOf(field
\r
525 .getIndicator2()));
\r
528 handler.ignorableWhitespace("\n ".toCharArray(), 0, 5);
\r
530 handler.startElement(Constants.MARCXML_NS_URI, DATA_FIELD,
\r
531 Q_DATA_FIELD, atts);
\r
532 Iterator<Subfield> si = field.getSubfields().iterator();
\r
533 while (si.hasNext()) {
\r
534 Subfield subfield = (Subfield) si.next();
\r
535 atts = new AttributesImpl();
\r
536 atts.addAttribute("", "code", "code", "CDATA", String
\r
537 .valueOf(subfield.getCode()));
\r
540 handler.ignorableWhitespace("\n ".toCharArray(), 0, 7);
\r
542 handler.startElement(Constants.MARCXML_NS_URI, SUBFIELD,
\r
544 temp = getDataElement(subfield.getData());
\r
545 handler.characters(temp, 0, temp.length);
\r
547 .endElement(Constants.MARCXML_NS_URI, SUBFIELD,
\r
552 handler.ignorableWhitespace("\n ".toCharArray(), 0, 5);
\r
555 .endElement(Constants.MARCXML_NS_URI, DATA_FIELD,
\r
560 handler.ignorableWhitespace("\n ".toCharArray(), 0, 3);
\r
562 handler.endElement(Constants.MARCXML_NS_URI, RECORD, Q_RECORD);
\r
565 protected char[] getDataElement(String data) {
\r
566 String dataElement = null;
\r
567 if (converter == null)
\r
568 return data.toCharArray();
\r
569 dataElement = converter.convert(data);
\r
571 dataElement = Normalizer.normalize(dataElement, Normalizer.NFC);
\r
572 return dataElement.toCharArray();
\r