// $Id: Iso6937ToUnicode.java,v 1.3 2008/10/17 06:47:06 haschart Exp $
/**
 * Copyright (C) 2002 Bas Peters (mail@bpeters.com)
 * Copyright (C) 2002 Yves Pratter (ypratter@club-internet.fr)
 *
 * This file is part of MARC4J
 *
 * MARC4J is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * MARC4J is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with MARC4J; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
\r
22 package org.marc4j.converter.impl;
\r
24 import org.marc4j.converter.CharConverter;
\r
/**
 * A utility to convert ISO 6937 data to UCS/Unicode.
 *
 * @author Bas Peters
 * @author Yves Pratter
 * @version $Revision: 1.3 $
 */
\r
35 public class Iso6937ToUnicode extends CharConverter {
\r
39 * Converts ISO 6937 data to UCS/Unicode.
\r
42 * @param data - the ISO 6937 data in an array of char
\r
43 * @return {@link String}- the UCS/Unicode data
\r
45 public String convert(char data[]) {
\r
46 StringBuffer sb = new StringBuffer();
\r
48 for (int i = 0; i < data.length; i++) {
\r
50 int len = data.length;
\r
53 else if (isCombining(c) && hasNext(i, len)) {
\r
54 char d = getCombiningChar(c * 256 + data[i + 1]);
\r
59 sb.append(getChar(c));
\r
62 sb.append(getChar(c));
\r
64 return sb.toString();
\r
67 private boolean hasNext(int pos, int len) {
\r
68 if (pos < (len - 1))
\r
73 private boolean isAscii(int i) {
\r
74 if (i >= 0x00 && i <= 0x7F)
\r
79 private boolean isCombining(int i) {
\r
80 if (i >= 0xC0 && i <= 0xDF)
\r
85 // Source : http://anubis.dkuug.dk/JTC1/SC2/WG3/docs/6937cd.pdf
\r
86 private char getChar(int i) {
\r
89 return 0x00A0; // 10/00 NO-BREAK SPACE
\r
91 return 0x00A1; // 10/01 INVERTED EXCLAMATION MARK
\r
93 return 0x00A2; // 10/02 CENT SIGN
\r
95 return 0x00A3; // 10/03 POUND SIGN
\r
96 // 10/04 (This position shall not be used)
\r
98 return 0x00A5; // 10/05 YEN SIGN
\r
99 // 10/06 (This position shall not be used)
\r
101 return 0x00A7; // 10/07 SECTION SIGN
\r
103 return 0x00A4; // 10/08 CURRENCY SIGN
\r
105 return 0x2018; // 10/09 LEFT SINGLE QUOTATION MARK
\r
107 return 0x201C; // 10/10 LEFT DOUBLE QUOTATION MARK
\r
109 return 0x00AB; // 10/11 LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
\r
111 return 0x2190; // 10/12 LEFTWARDS ARROW
\r
113 return 0x2191; // 10/13 UPWARDS ARROW
\r
115 return 0x2192; // 10/14 RIGHTWARDS ARROW
\r
117 return 0x2193; // 10/15 DOWNWARDS ARROW
\r
120 return 0x00B0; // 11/00 DEGREE SIGN
\r
122 return 0x00B1; // 11/01 PLUS-MINUS SIGN
\r
124 return 0x00B2; // 11/02 SUPERSCRIPT TWO
\r
126 return 0x00B3; // 11/03 SUPERSCRIPT THREE
\r
128 return 0x00D7; // 11/04 MULTIPLICATION SIGN
\r
130 return 0x00B5; // 11/05 MICRO SIGN
\r
132 return 0x00B6; // 11/06 PILCROW SIGN
\r
134 return 0x00B7; // 11/07 MIDDLE DOT
\r
136 return 0x00F7; // 11/08 DIVISION SIGN
\r
138 return 0x2019; // 11/09 RIGHT SINGLE QUOTATION MARK
\r
140 return 0x201D; // 11/10 RIGHT DOUBLE QUOTATION MARK
\r
142 return 0x00BB; // 11/11 RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
\r
144 return 0x00BC; // 11/12 VULGAR FRACTION ONE QUARTER
\r
146 return 0x00BD; // 11/13 VULGAR FRACTION ONE HALF
\r
148 return 0x00BE; // 11/14 VULGAR FRACTION THREE QUARTERS
\r
150 return 0x00BF; // 11/15 INVERTED QUESTION MARK
\r
152 // 4/0 to 5/15 diacritic characters
\r
155 return 0x2015; // 13/00 HORIZONTAL BAR
\r
157 return 0x00B9; // 13/01 SUPERSCRIPT ONE
\r
159 return 0x2117; // 13/02 REGISTERED SIGN
\r
161 return 0x00A9; // 13/03 COPYRIGHT SIGN
\r
163 return 0x00AE; // 13/04 TRADE MARK SIGN
\r
165 return 0x266A; // 13/05 EIGHTH NOTE
\r
167 return 0x00AC; // 13/06 NOT SIGN
\r
169 return 0x00A6; // 13/07 BROKEN BAR
\r
170 // 13/08 (This position shall not be used)
\r
171 // 13/09 (This position shall not be used)
\r
172 // 13/10 (This position shall not be used)
\r
173 // 13/11 (This position shall not be used)
\r
175 return 0x215B; // 13/12 VULGAR FRACTION ONE EIGHTH
\r
177 return 0x215E; // 13/15 VULGAR FRACTION SEVEN EIGHTHS
\r
180 return 0x2126; // 14/00 OHM SIGN
\r
182 return 0x00C6; // 14/01 LATIN CAPITAL LETTER AE
\r
184 return 0x0110; // 14/02 LATIN CAPITAL LETTER D WITH STROKE
\r
186 return 0x00AA; // 14/03 FEMININE ORDINAL INDICATOR
\r
188 return 0x0126; // 14/04 LATIN CAPITAL LETTER H WITH STROKE
\r
189 // 14/05 (This position shall not be used)
\r
191 return 0x0132; // 14/06 LATIN CAPITAL LIGATURE IJ
\r
193 return 0x013F; // 14/07 LATIN CAPITAL LETTER L WITH MIDDLE DOT
\r
195 return 0x0141; // 14/08 LATIN CAPITAL LETTER L WITH STROKE
\r
197 return 0x00D8; // 14/09 LATIN CAPITAL LETTER O WITH STROKE
\r
199 return 0x0152; // 14/10 LATIN CAPITAL LIGATURE OE
\r
201 return 0x00BA; // 14/11 MASCULINE ORDINAL INDICATOR
\r
203 return 0x00DE; // 14/12 LATIN CAPITAL LETTER THORN
\r
205 return 0x0166; // 14/13 LATIN CAPITAL LETTER T WITH STROKE
\r
207 return 0x014A; // 14/14 LATIN CAPITAL LETTER ENG
\r
209 return 0x0149; // 14/15 LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
\r
212 return 0x0138; // 15/00 LATIN SMALL LETTER KRA
\r
214 return 0x00E6; // 15/01 LATIN SMALL LETTER AE
\r
216 return 0x0111; // 15/02 LATIN SMALL LETTER D WITH STROKE
\r
218 return 0x00F0; // 15/03 LATIN SMALL LETTER ETH
\r
220 return 0x0127; // 15/04 LATIN SMALL LETTER H WITH STROKE
\r
222 return 0x0131; // 15/05 LATIN SMALL LETTER DOTLESS I
\r
224 return 0x0133; // 15/06 LATIN SMALL LIGATURE IJ
\r
226 return 0x0140; // 15/07 LATIN SMALL LETTER L WITH MIDDLE DOT
\r
228 return 0x0142; // 15/08 LATIN SMALL LETTER L WITH STROKE
\r
230 return 0x00F8; // 15/09 LATIN SMALL LETTER O WITH STROKE
\r
232 return 0x0153; // 15/10 LATIN SMALL LIGATURE OE
\r
234 return 0x00DF; // 15/11 LATIN SMALL LETTER SHARP S
\r
236 return 0x00FE; // 15/12 LATIN SMALL LETTER THORN
\r
238 return 0x0167; // 15/13 LATIN SMALL LETTER T WITH STROKE
\r
240 return 0x014B; // 15/14 LATIN SMALL LETTER ENG
\r
242 return 0x00AD; // 15/15 SOFT HYPHEN$
\r
249 private char getCombiningChar(int i) {
\r
251 // 12/00 (This position shall not be used)
\r
253 // 12/01 non-spacing grave accent
\r
255 return 0x00C0; // LATIN CAPITAL LETTER A WITH GRAVE
\r
257 return 0x00C8; // LATIN CAPITAL LETTER E WITH GRAVE
\r
259 return 0x00CC; // LATIN CAPITAL LETTER I WITH GRAVE
\r
261 return 0x00D2; // LATIN CAPITAL LETTER O WITH GRAVE
\r
263 return 0x00D9; // LATIN CAPITAL LETTER U WITH GRAVE
\r
265 return 0x00E0; // LATIN SMALL LETTER A WITH GRAVE
\r
267 return 0x00E8; // LATIN SMALL LETTER E WITH GRAVE
\r
269 return 0x00EC; // LATIN SMALL LETTER I WITH GRAVE
\r
271 return 0x00F2; // LATIN SMALL LETTER O WITH GRAVE
\r
273 return 0x00F9; // LATIN SMALL LETTER U WITH GRAVE
\r
275 // 12/02 non-spacing acute accent
\r
277 return 0x00B4; // ACUTE ACCENT
\r
279 return 0x00C1; // LATIN CAPITAL LETTER A WITH ACUTE
\r
281 return 0x0106; // LATIN CAPITAL LETTER C WITH ACUTE
\r
283 return 0x00C9; // LATIN CAPITAL LETTER E WITH ACUTE
\r
285 return 0x00CD; // LATIN CAPITAL LETTER I WITH ACUTE
\r
287 return 0x0139; // LATIN CAPITAL LETTER L WITH ACUTE
\r
289 return 0x0143; // LATIN CAPITAL LETTER N WITH ACUTE
\r
291 return 0x00D3; // LATIN CAPITAL LETTER O WITH ACUTE
\r
293 return 0x0154; // LATIN CAPITAL LETTER R WITH ACUTE
\r
295 return 0x015A; // LATIN CAPITAL LETTER S WITH ACUTE
\r
297 return 0x00DA; // LATIN CAPITAL LETTER U WITH ACUTE
\r
299 return 0x00DD; // LATIN CAPITAL LETTER Y WITH ACUTE
\r
301 return 0x0179; // LATIN CAPITAL LETTER Z WITH ACUTE
\r
303 return 0x00E1; // LATIN SMALL LETTER A WITH ACUTE
\r
305 return 0x0107; // LATIN SMALL LETTER C WITH ACUTE
\r
307 return 0x00E9; // LATIN SMALL LETTER E WITH ACUTE
\r
309 return 0x01F5; // LATIN SMALL LETTER G WITH CEDILLA(4)
\r
311 return 0x00ED; // LATIN SMALL LETTER I WITH ACUTE
\r
313 return 0x013A; // LATIN SMALL LETTER L WITH ACUTE
\r
315 return 0x0144; // LATIN SMALL LETTER N WITH ACUTE
\r
317 return 0x00F3; // LATIN SMALL LETTER O WITH ACUTE
\r
319 return 0x0155; // LATIN SMALL LETTER R WITH ACUTE
\r
321 return 0x015B; // LATIN SMALL LETTER S WITH ACUTE
\r
323 return 0x00FA; // LATIN SMALL LETTER U WITH ACUTE
\r
325 return 0x00FD; // LATIN SMALL LETTER Y WITH ACUTE
\r
327 return 0x017A; // LATIN SMALL LETTER Z WITH ACUTE
\r
329 // 12/03 non-spacing circumflex accent
\r
331 return 0x00C2; // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
\r
333 return 0x0108; // LATIN CAPITAL LETTER C WITH CIRCUMFLEX
\r
335 return 0x00CA; // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
\r
337 return 0x011C; // LATIN CAPITAL LETTER G WITH CIRCUMFLEX
\r
339 return 0x0124; // LATIN CAPITAL LETTER H WITH CIRCUMFLEX
\r
341 return 0x00CE; // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
\r
343 return 0x0134; // LATIN CAPITAL LETTER J WITH CIRCUMFLEX
\r
345 return 0x00D4; // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
\r
347 return 0x015C; // LATIN CAPITAL LETTER S WITH CIRCUMFLEX
\r
349 return 0x00DB; // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
\r
351 return 0x0174; // LATIN CAPITAL LETTER W WITH CIRCUMFLEX
\r
353 return 0x0176; // LATIN CAPITAL LETTER Y WITH CIRCUMFLEX
\r
355 return 0x00E2; // LATIN SMALL LETTER A WITH CIRCUMFLEX
\r
357 return 0x0109; // LATIN SMALL LETTER C WITH CIRCUMFLEX
\r
359 return 0x00EA; // LATIN SMALL LETTER E WITH CIRCUMFLEX
\r
361 return 0x011D; // LATIN SMALL LETTER G WITH CIRCUMFLEX
\r
363 return 0x0125; // LATIN SMALL LETTER H WITH CIRCUMFLEX
\r
365 return 0x00EE; // LATIN SMALL LETTER I WITH CIRCUMFLEX
\r
367 return 0x0135; // LATIN SMALL LETTER J WITH CIRCUMFLEX
\r
369 return 0x00F4; // LATIN SMALL LETTER O WITH CIRCUMFLEX
\r
371 return 0x015D; // LATIN SMALL LETTER S WITH CIRCUMFLEX
\r
373 return 0x00FB; // LATIN SMALL LETTER U WITH CIRCUMFLEX
\r
375 return 0x0175; // LATIN SMALL LETTER W WITH CIRCUMFLEX
\r
377 return 0x0177; // LATIN SMALL LETTER Y WITH CIRCUMFLEX
\r
379 // 12/04 non-spacing tilde
\r
381 return 0x00C3; // LATIN CAPITAL LETTER A WITH TILDE
\r
383 return 0x0128; // LATIN CAPITAL LETTER I WITH TILDE
\r
385 return 0x00D1; // LATIN CAPITAL LETTER N WITH TILDE
\r
387 return 0x00D5; // LATIN CAPITAL LETTER O WITH TILDE
\r
389 return 0x0168; // LATIN CAPITAL LETTER U WITH TILDE
\r
391 return 0x00E3; // LATIN SMALL LETTER A WITH TILDE
\r
393 return 0x0129; // LATIN SMALL LETTER I WITH TILDE
\r
395 return 0x00F1; // LATIN SMALL LETTER N WITH TILDE
\r
397 return 0x00F5; // LATIN SMALL LETTER O WITH TILDE
\r
399 return 0x0169; // LATIN SMALL LETTER U WITH TILDE
\r
401 // 12/05 non-spacing macron
\r
403 return 0x0100; // LATIN CAPITAL LETTER A WITH MACRON
\r
405 return 0x0112; // LATIN CAPITAL LETTER E WITH MACRON
\r
407 return 0x012A; // LATIN CAPITAL LETTER I WITH MACRON
\r
409 return 0x014C; // LATIN CAPITAL LETTER O WITH MACRON
\r
411 return 0x016A; // LATIN CAPITAL LETTER U WITH MACRON
\r
413 return 0x0101; // LATIN SMALL LETTER A WITH MACRON
\r
415 return 0x0113; // LATIN SMALL LETTER E WITH MACRON
\r
417 return 0x012B; // LATIN SMALL LETTER I WITH MACRON
\r
419 return 0x014D; // LATIN SMALL LETTER O WITH MACRON
\r
421 return 0x016B; // LATIN SMALL LETTER U WITH MACRON
\r
423 // 12/06 non-spacing breve
\r
425 return 0x02D8; // BREVE
\r
427 return 0x0102; // LATIN CAPITAL LETTER A WITH BREVE
\r
429 return 0x011E; // LATIN CAPITAL LETTER G WITH BREVE
\r
431 return 0x016C; // LATIN CAPITAL LETTER U WITH BREVE
\r
433 return 0x0103; // LATIN SMALL LETTER A WITH BREVE
\r
435 return 0x011F; // LATIN SMALL LETTER G WITH BREVE
\r
437 return 0x016D; // LATIN SMALL LETTER U WITH BREVE
\r
439 // 12/07 non-spacing dot above
\r
441 return 0x010A; // LATIN CAPITAL LETTER C WITH DOT ABOVE
\r
443 return 0x0116; // LATIN CAPITAL LETTER E WITH DOT ABOVE
\r
445 return 0x0120; // LATIN CAPITAL LETTER G WITH DOT ABOVE
\r
447 return 0x0130; // LATIN CAPITAL LETTER I WITH DOT ABOVE
\r
449 return 0x017B; // LATIN CAPITAL LETTER Z WITH DOT ABOVE
\r
451 return 0x010B; // LATIN SMALL LETTER C WITH DOT ABOVE
\r
453 return 0x0117; // LATIN SMALL LETTER E WITH DOT ABOVE
\r
455 return 0x0121; // LATIN SMALL LETTER G WITH DOT ABOVE
\r
457 return 0x017C; // LATIN SMALL LETTER Z WITH DOT ABOVE
\r
459 // 12/08 non-spacing diaeresis
\r
461 return 0x00A8; // DIAERESIS
\r
463 return 0x00C4; // LATIN CAPITAL LETTER A WITH DIAERESIS
\r
465 return 0x00CB; // LATIN CAPITAL LETTER E WITH DIAERESIS
\r
467 return 0x00CF; // LATIN CAPITAL LETTER I WITH DIAERESIS
\r
469 return 0x00D6; // LATIN CAPITAL LETTER O WITH DIAERESIS
\r
471 return 0x00DC; // LATIN CAPITAL LETTER U WITH DIAERESIS
\r
473 return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS
\r
475 return 0x00E4; // LATIN SMALL LETTER A WITH DIAERESIS
\r
477 return 0x00EB; // LATIN SMALL LETTER E WITH DIAERESIS
\r
479 return 0x00EF; // LATIN SMALL LETTER I WITH DIAERESIS
\r
481 return 0x00F6; // LATIN SMALL LETTER O WITH DIAERESIS
\r
483 return 0x00FC; // LATIN SMALL LETTER U WITH DIAERESIS
\r
485 return 0x00FF; // LATIN SMALL LETTER Y WITH DIAERESIS
\r
487 // 12/09 (This position shall not be used)
\r
489 // 12/10 non-spacing ring above
\r
491 return 0x02DA; // RING ABOVE
\r
493 return 0x00C5; // LATIN CAPITAL LETTER A WITH RING ABOVE
\r
495 return 0x016E; // LATIN CAPITAL LETTER U WITH RING ABOVE
\r
497 return 0x00E5; // LATIN SMALL LETTER A WITH RING ABOVE
\r
499 return 0x016F; // LATIN SMALL LETTER U WITH RING ABOVE
\r
501 // 12/11 non-spacing cedilla
\r
503 return 0x00B8; // CEDILLA
\r
505 return 0x00C7; // LATIN CAPITAL LETTER C WITH CEDILLA
\r
507 return 0x0122; // LATIN CAPITAL LETTER G WITH CEDILLA
\r
509 return 0x0136; // LATIN CAPITAL LETTER K WITH CEDILLA
\r
511 return 0x013B; // LATIN CAPITAL LETTER L WITH CEDILLA
\r
513 return 0x0145; // LATIN CAPITAL LETTER N WITH CEDILLA
\r
515 return 0x0156; // LATIN CAPITAL LETTER R WITH CEDILLA
\r
517 return 0x015E; // LATIN CAPITAL LETTER S WITH CEDILLA
\r
519 return 0x0162; // LATIN CAPITAL LETTER T WITH CEDILLA
\r
521 return 0x00E7; // LATIN SMALL LETTER C WITH CEDILLA
\r
522 // case 0xCB67: return 0x0123; // small g with cedilla
\r
524 return 0x0137; // LATIN SMALL LETTER K WITH CEDILLA
\r
526 return 0x013C; // LATIN SMALL LETTER L WITH CEDILLA
\r
528 return 0x0146; // LATIN SMALL LETTER N WITH CEDILLA
\r
530 return 0x0157; // LATIN SMALL LETTER R WITH CEDILLA
\r
532 return 0x015F; // LATIN SMALL LETTER S WITH CEDILLA
\r
534 return 0x0163; // LATIN SMALL LETTER T WITH CEDILLA
\r
536 // 12/12 (This position shall not be used)
\r
538 // 12/13 non-spacing double acute accent
\r
540 return 0x0150; // LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
\r
542 return 0x0170; // LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
\r
544 return 0x0151; // LATIN SMALL LETTER O WITH DOUBLE ACUTE
\r
546 return 0x0171; // LATIN SMALL LETTER U WITH DOUBLE ACUTE
\r
548 // 12/14 non-spacing ogonek
\r
550 return 0x02DB; // ogonek
\r
552 return 0x0104; // LATIN CAPITAL LETTER A WITH OGONEK
\r
554 return 0x0118; // LATIN CAPITAL LETTER E WITH OGONEK
\r
556 return 0x012E; // LATIN CAPITAL LETTER I WITH OGONEK
\r
558 return 0x0172; // LATIN CAPITAL LETTER U WITH OGONEK
\r
560 return 0x0105; // LATIN SMALL LETTER A WITH OGONEK
\r
562 return 0x0119; // LATIN SMALL LETTER E WITH OGONEK
\r
564 return 0x012F; // LATIN SMALL LETTER I WITH OGONEK
\r
566 return 0x0173; // LATIN SMALL LETTER U WITH OGONEK
\r
568 // 12/15 non-spacing caron
\r
570 return 0x02C7; // CARON
\r
572 return 0x010C; // LATIN CAPITAL LETTER C WITH CARON
\r
574 return 0x010E; // LATIN CAPITAL LETTER D WITH CARON
\r
576 return 0x011A; // LATIN CAPITAL LETTER E WITH CARON
\r
578 return 0x013D; // LATIN CAPITAL LETTER L WITH CARON
\r
580 return 0x0147; // LATIN CAPITAL LETTER N WITH CARON
\r
582 return 0x0158; // LATIN CAPITAL LETTER R WITH CARON
\r
584 return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
\r
586 return 0x0164; // LATIN CAPITAL LETTER T WITH CARON
\r
588 return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
\r
590 return 0x010D; // LATIN SMALL LETTER C WITH CARON
\r
592 return 0x010F; // LATIN SMALL LETTER D WITH CARON
\r
594 return 0x011B; // LATIN SMALL LETTER E WITH CARON
\r
596 return 0x013E; // LATIN SMALL LETTER L WITH CARON
\r
598 return 0x0148; // LATIN SMALL LETTER N WITH CARON
\r
600 return 0x0159; // LATIN SMALL LETTER R WITH CARON
\r
602 return 0x0161; // LATIN SMALL LETTER S WITH CARON
\r
604 return 0x0165; // LATIN SMALL LETTER T WITH CARON
\r
606 return 0x017E; // LATIN SMALL LETTER Z WITH CARON
\r