2 * Copyright (C) 1995-2007, Index Data ApS
3 * See the file LICENSE for details.
5 * $Id: siconv.c,v 1.35 2007-03-12 10:59:59 adam Exp $
9 * \brief Implements simple ICONV
11 * This implements an interface similar to that of iconv and
12 * is used by YAZ to interface with iconv (if present).
13 * For systems where iconv is not present, this layer
14 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
17 * http://www.loc.gov/marc/specifications/speccharmarc8.html
37 #include <yaz/yaz-util.h>
/* Forward declarations of the MARC-8 decoding helpers (tables 1..9).
   They are not defined in this file; yaz_read_marc8_comb selects one of
   them according to the current escape mode.  Each call receives the input
   bytes and the number of bytes available, reports the number of bytes it
   consumed through no_read and returns a character value (0 with no_read=1
   when the input does not match, per the note in yaz_read_marc8).
   NOTE(review): combining is presumably set for combining characters;
   confirm against the generated code. */
39 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
40 size_t *no_read, int *combining);
41 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
42 size_t *no_read, int *combining);
43 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
44 size_t *no_read, int *combining);
45 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
46 size_t *no_read, int *combining);
47 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
48 size_t *no_read, int *combining);
49 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
50 size_t *no_read, int *combining);
51 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
52 size_t *no_read, int *combining);
53 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
54 size_t *no_read, int *combining);
55 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
56 size_t *no_read, int *combining);
/* Forward declarations of the reverse (to MARC-8) lookup helpers, tables
   1..9.  They are not defined in this file.  lookup_marc8 feeds each of them
   the UTF-8 encoding of a single character and uses the returned value as
   the MARC-8 code for that character.
   NOTE(review): combining is presumably set when the result is a combining
   character; confirm against the generated code. */
59 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
60 size_t *no_read, int *combining);
61 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
62 size_t *no_read, int *combining);
63 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
64 size_t *no_read, int *combining);
65 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
66 size_t *no_read, int *combining);
67 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
68 size_t *no_read, int *combining);
69 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
70 size_t *no_read, int *combining);
71 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
72 size_t *no_read, int *combining);
73 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
74 size_t *no_read, int *combining);
75 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
76 size_t *no_read, int *combining);
/* Conversion state shared by the built-in readers and writers.
   NOTE(review): several members (for example my_errno, iconv_cd,
   marc8_esc_mode, comb_offset, comb_size, no_read_x, which are used
   elsewhere in this file) and the closing brace are not visible in this
   listing; only the members shown below are described. */
78 struct yaz_iconv_struct {
/* optional handler run on the input before the first character is read;
   yaz_iconv_open sets it for UTF-8 input only */
81 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
82 size_t inbytesleft, size_t *no_read);
/* decodes one character from the input and reports the bytes consumed */
83 unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
84 size_t inbytesleft, size_t *no_read);
/* encodes one character x into the output buffer */
85 size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
86 char **outbuf, size_t *outbytesleft,
/* optional handler that emits pending output state; set for MARC-8 output */
88 size_t (*flush_handle)(yaz_iconv_t cd,
89 char **outbuf, size_t *outbytesleft);
/* characters buffered by yaz_read_marc8 and the byte count of each */
94 unsigned long comb_x[8];
95 size_t comb_no_read[8];
/* NOTE(review): presumably the character kept by yaz_iconv when a write
   fails for lack of room; its assignment is not visible in this listing */
97 unsigned long unget_x;
/* character held back by yaz_write_ISO8859_1 until the next character shows
   whether the two can be merged into one Latin-1 character */
101 unsigned long compose_char;
/* MARC-8 writer state: pending combining characters and their count, the
   pending closing "second half" byte, the last base character not yet
   written, and the escape sequence of the page currently selected */
103 unsigned long write_marc8_comb_ch[8];
104 size_t write_marc8_comb_no;
105 unsigned write_marc8_second_half_char;
106 unsigned long write_marc8_last;
107 const char *write_marc8_page_chr;
/* latin1_comb: table of { x1, x2, y } entries where x1 is a base letter,
   x2 is a Unicode combining mark and y is the single ISO-8859-1 character
   equivalent to the pair.  It is scanned linearly by yaz_read_marc8s,
   yaz_write_ISO8859_1 and yaz_write_marc8, all of which stop at the first
   entry whose x1 is 0.
   NOTE(review): the start of the declaration, the y member and the
   terminating entry are not visible in this listing. */
111 unsigned long x1, x2;
114 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
115 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
116 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
117 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
118 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
119 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
120 /* no need for 0xc6 LATIN CAPITAL LETTER AE */
121 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
122 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
123 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
124 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
125 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
126 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
127 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
128 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
129 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
130 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
131 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
132 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
133 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
134 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
135 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
136 /* omitted: 0xd7 MULTIPLICATION SIGN */
137 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
138 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
139 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
140 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
141 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
142 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
143 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
144 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
145 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
146 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
147 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
148 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
149 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
150 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
151 /* omitted: 0xe6 LATIN SMALL LETTER AE */
152 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
153 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
154 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
155 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
156 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
157 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
158 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
159 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
160 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
161 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
162 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
163 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
164 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
165 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
166 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
167 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
168 /* omitted: 0xf7 DIVISION SIGN */
169 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
170 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
171 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
172 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
173 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
174 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
175 /* omitted: 0xfe LATIN SMALL LETTER THORN */
176 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
/* Read handler for ISO-8859-1 input: the character value is simply the
   first input byte.
   NOTE(review): the statements that set *no_read and return are not visible
   in this listing. */
181 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
182 size_t inbytesleft, size_t *no_read)
184 unsigned long x = inp[0];
/* Init handler for UTF-8 input (installed by yaz_iconv_open): inspects the
   start of the input for a byte order mark and reports through *no_read how
   many leading bytes the caller should skip.  Sets cd->my_errno to
   YAZ_ICONV_EINVAL on the path shown below.
   NOTE(review): the checks on inp[0] and inbytesleft and the statements
   guarded by the test below are not visible in this listing. */
190 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
191 size_t inbytesleft, size_t *no_read)
200 cd->my_errno = YAZ_ICONV_EINVAL;
/* The UTF-8 byte order mark is EF BB BF, so BOTH trailing bytes must match.
   The previous test (inp[1] != 0xbb && inp[2] == 0xbf) could never be true
   for a real BOM.
   NOTE(review): this assumes the guarded statement is the "skip the three
   BOM bytes" path; confirm against the full file. */
203 if (inp[1] == 0xbb && inp[2] == 0xbf)
/* Decode one UTF-8 sequence from inp (at most inbytesleft bytes).
   The lead byte selects the sequence length: 2 bytes up to 0xdf, 3 up to
   0xef, 4 up to 0xf7, 5 up to 0xfb and 6 up to 0xfd.  Each of those branches
   is only taken when enough input bytes are available.  A lead byte that is
   a continuation byte (<= 0xbf after the single-byte case) or 0xfe/0xff is
   reported as YAZ_ICONV_EILSEQ through *error.
   NOTE(review): the single-byte branch, the conditions under which the
   multi-byte branches report YAZ_ICONV_EILSEQ, the *no_read updates and the
   return are not visible in this listing.  The final YAZ_ICONV_EINVAL is
   presumably the fall-through for incomplete input. */
210 unsigned long yaz_read_UTF8_char(unsigned char *inp,
211 size_t inbytesleft, size_t *no_read,
221 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
224 *error = YAZ_ICONV_EILSEQ;
226 else if (inp[0] <= 0xdf && inbytesleft >= 2)
228 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
234 *error = YAZ_ICONV_EILSEQ;
237 else if (inp[0] <= 0xef && inbytesleft >= 3)
239 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
246 *error = YAZ_ICONV_EILSEQ;
249 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
251 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
252 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
258 *error = YAZ_ICONV_EILSEQ;
261 else if (inp[0] <= 0xfb && inbytesleft >= 5)
263 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
264 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
271 *error = YAZ_ICONV_EILSEQ;
274 else if (inp[0] <= 0xfd && inbytesleft >= 6)
276 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
277 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
278 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
284 *error = YAZ_ICONV_EILSEQ;
290 *error = YAZ_ICONV_EINVAL;
/* Read handler for UTF-8 input: delegates to yaz_read_UTF8_char and lets it
   store any error code directly in cd->my_errno. */
295 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
296 size_t inbytesleft, size_t *no_read)
298 return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
/* Read handler for big-endian UCS-4 input: combines four input bytes, most
   significant first, into one character value.  Sets cd->my_errno to
   YAZ_ICONV_EINVAL when the input is incomplete.
   NOTE(review): the length check guarding the error path, the *no_read
   update and the return are not visible in this listing. */
301 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
302 size_t inbytesleft, size_t *no_read)
308 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* inp[0] is promoted to (signed) int before the shift; for bytes >= 0x80
   the shift by 24 overflows int, and the negative result would be
   sign-extended when widened to a 64-bit unsigned long.  Widen first. */
313 x = ((unsigned long) inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
/* Read handler for little-endian UCS-4 input: combines four input bytes,
   least significant first, into one character value.  Sets cd->my_errno to
   YAZ_ICONV_EINVAL when the input is incomplete.
   NOTE(review): the length check guarding the error path, the *no_read
   update and the return are not visible in this listing. */
319 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
320 size_t inbytesleft, size_t *no_read)
326 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* inp[3] is promoted to (signed) int before the shift; for bytes >= 0x80
   the shift by 24 overflows int, and the negative result would be
   sign-extended when widened to a 64-bit unsigned long.  Widen first. */
331 x = ((unsigned long) inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
/* Read handler for native wchar_t input: copies sizeof(wchar_t) bytes from
   the input into a wchar_t and consumes exactly that many bytes.  Input
   shorter than one wchar_t is reported as YAZ_ICONV_EINVAL.
   NOTE(review): the declaration of wch and the return statement are not
   visible in this listing. */
338 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
339 size_t inbytesleft, size_t *no_read)
343 if (inbytesleft < sizeof(wchar_t))
345 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* memcpy avoids an unaligned wchar_t access on the raw input buffer */
351 memcpy (&wch, inp, sizeof(wch));
353 *no_read = sizeof(wch);
/* Read handler for the "advancegreek" encoding.  It loops over the input
   while bytes remain and gives the bytes 0x9e and 0x9f special treatment;
   both are among the prefix bytes emitted by yaz_write_advancegreek.
   Running out of input is reported as YAZ_ICONV_EINVAL.
   NOTE(review): the handling of each prefix, the mapping of the final byte
   and the return are not visible in this listing. */
359 static unsigned long yaz_read_advancegreek(yaz_iconv_t cd, unsigned char *inp,
360 size_t inbytesleft, size_t *no_read)
368 while (inbytesleft > 0)
374 else if (*inp == 0x9e)
378 else if (*inp == 0x9f)
388 if (inbytesleft == 0)
390 cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
/* Write handler for the "advancegreek" encoding: maps one Unicode Greek
   character x to 1-3 output bytes.  Prefix bytes seen in the table:
   0x9d for letters with tonos, 0x9e for letters with dialytika and 0x9f for
   capital letters; plain small letters are a single byte 0x81..0x99.
   At least 3 bytes of output room are required up front (the longest
   sequence), otherwise YAZ_ICONV_E2BIG is set.  YAZ_ICONV_EILSEQ is set on
   the path shown at the end of the table.
   NOTE(review): the declaration of k, the switch header, the default branch
   and the output pointer updates are not visible in this listing. */
618 static size_t yaz_write_advancegreek(yaz_iconv_t cd, unsigned long x,
619 char **outbuf, size_t *outbytesleft,
623 unsigned char *out = (unsigned char*) *outbuf;
624 if (*outbytesleft < 3)
626 cd->my_errno = YAZ_ICONV_E2BIG; /* not room for output */
631 case 0x03ac : out[k++]=0x9d; out[k++]=0x81; break;
632 case 0x03ad : out[k++]=0x9d; out[k++]=0x85; break;
633 case 0x03ae : out[k++]=0x9d; out[k++]=0x87; break;
634 case 0x03af : out[k++]=0x9d; out[k++]=0x89; break;
635 case 0x03cc : out[k++]=0x9d; out[k++]=0x8f; break;
636 case 0x03cd : out[k++]=0x9d; out[k++]=0x95; break;
637 case 0x03ce : out[k++]=0x9d; out[k++]=0x99; break;
638 case 0x0390 : out[k++]=0x9d; out[k++]=0x9e; out[k++]=0x89; break;
639 case 0x03b0 : out[k++]=0x9d; out[k++]=0x9e; out[k++]=0x95; break;
640 case 0x0386 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x81; break;
641 case 0x0388 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x85; break;
642 case 0x0389 : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x87; break;
643 case 0x038a : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x89; break;
644 case 0x038c : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x8f; break;
645 case 0x038e : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x95; break;
646 case 0x038f : out[k++]=0x9d; out[k++]=0x9f; out[k++]=0x99; break;
647 case 0x03ca : out[k++]=0x9e; out[k++]=0x89; break;
648 case 0x03cb : out[k++]=0x9e; out[k++]=0x95; break;
649 case 0x03aa : out[k++]=0x9e; out[k++]=0x9f; out[k++]=0x89; break;
650 case 0x03ab : out[k++]=0x9e; out[k++]=0x9f; out[k++]=0x95; break;
651 case 0x0391 : out[k++]=0x9f; out[k++]=0x81; break;
652 case 0x0392 : out[k++]=0x9f; out[k++]=0x82; break;
653 case 0x0393 : out[k++]=0x9f; out[k++]=0x83; break;
654 case 0x0394 : out[k++]=0x9f; out[k++]=0x84; break;
655 case 0x0395 : out[k++]=0x9f; out[k++]=0x85; break;
656 case 0x0396 : out[k++]=0x9f; out[k++]=0x86; break;
657 case 0x0397 : out[k++]=0x9f; out[k++]=0x87; break;
658 case 0x0398 : out[k++]=0x9f; out[k++]=0x88; break;
659 case 0x0399 : out[k++]=0x9f; out[k++]=0x89; break;
660 case 0x039a : out[k++]=0x9f; out[k++]=0x8a; break;
661 case 0x039b : out[k++]=0x9f; out[k++]=0x8b; break;
662 case 0x039c : out[k++]=0x9f; out[k++]=0x8c; break;
663 case 0x039d : out[k++]=0x9f; out[k++]=0x8d; break;
664 case 0x039e : out[k++]=0x9f; out[k++]=0x8e; break;
665 case 0x039f : out[k++]=0x9f; out[k++]=0x8f; break;
666 case 0x03a0 : out[k++]=0x9f; out[k++]=0x90; break;
667 case 0x03a1 : out[k++]=0x9f; out[k++]=0x91; break;
668 case 0x03a3 : out[k++]=0x9f; out[k++]=0x93; break;
669 case 0x03a4 : out[k++]=0x9f; out[k++]=0x94; break;
670 case 0x03a5 : out[k++]=0x9f; out[k++]=0x95; break;
671 case 0x03a6 : out[k++]=0x9f; out[k++]=0x96; break;
672 case 0x03a7 : out[k++]=0x9f; out[k++]=0x97; break;
673 case 0x03a8 : out[k++]=0x9f; out[k++]=0x98; break;
674 case 0x03a9 : out[k++]=0x9f; out[k++]=0x99; break;
675 case 0x03b1 : out[k++]=0x81; break;
676 case 0x03b2 : out[k++]=0x82; break;
677 case 0x03b3 : out[k++]=0x83; break;
678 case 0x03b4 : out[k++]=0x84; break;
679 case 0x03b5 : out[k++]=0x85; break;
680 case 0x03b6 : out[k++]=0x86; break;
681 case 0x03b7 : out[k++]=0x87; break;
682 case 0x03b8 : out[k++]=0x88; break;
683 case 0x03b9 : out[k++]=0x89; break;
684 case 0x03ba : out[k++]=0x8a; break;
685 case 0x03bb : out[k++]=0x8b; break;
686 case 0x03bc : out[k++]=0x8c; break;
687 case 0x03bd : out[k++]=0x8d; break;
688 case 0x03be : out[k++]=0x8e; break;
689 case 0x03bf : out[k++]=0x8f; break;
690 case 0x03c0 : out[k++]=0x90; break;
691 case 0x03c1 : out[k++]=0x91; break;
692 case 0x03c2 : out[k++]=0x92; break;
693 case 0x03c3 : out[k++]=0x93; break;
694 case 0x03c4 : out[k++]=0x94; break;
695 case 0x03c5 : out[k++]=0x95; break;
696 case 0x03c6 : out[k++]=0x96; break;
/* chi: was 0x96, which collided with phi (0x03c6) above; 0x97 matches the
   capital form 0x03a7 -> 9f 97 and the otherwise sequential numbering */
697 case 0x03c7 : out[k++]=0x97; break;
698 case 0x03c8 : out[k++]=0x98; break;
699 case 0x03c9 : out[k++]=0x99; break;
703 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Forward declaration: yaz_read_marc8 calls this helper before its
   definition appears further down in the file.
   NOTE(review): the last parameter line is not visible in this listing. */
715 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
716 size_t inbytesleft, size_t *no_read,
/* Read handler for MARC-8 input.  Characters obtained from
   yaz_read_marc8_comb are buffered in cd->comb_x together with the number of
   input bytes each one used (cd->comb_no_read), up to 8 entries.  While
   cd->comb_offset is below cd->comb_size a buffered entry is handed out
   instead of reading new input.  Input that ends while entries are still
   being collected is reported as YAZ_ICONV_EINVAL.
   NOTE(review): the loop exit conditions, the handling of the comb flag and
   the return statements are not visible in this listing. */
719 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
720 size_t inbytesleft, size_t *no_read)
723 if (cd->comb_offset < cd->comb_size)
725 *no_read = cd->comb_no_read[cd->comb_offset];
726 x = cd->comb_x[cd->comb_offset];
728 /* special case for double-diacritic combining characters,
729 INVERTED BREVE and DOUBLE TILDE.
730 We'll increment the no_read counter by 1, since we want to skip over
731 the processing of the closing ligature character
733 /* this code is no longer necessary.. our handlers code in
734 yaz_marc8_?_conv (generated by charconv.tcl) now returns
735 0 and no_read=1 when a sequence does not match the input.
736 The SECOND HALFs in codetables.xml produces a non-existant
737 entry in the conversion trie.. Hence when met, the input byte is
738 skipped as it should (in yaz_iconv)
741 if (x == 0x0361 || x == 0x0360)
749 for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
753 if (inbytesleft == 0 && cd->comb_size)
755 cd->my_errno = YAZ_ICONV_EINVAL;
760 x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
763 cd->comb_x[cd->comb_size] = x;
764 cd->comb_no_read[cd->comb_size] = *no_read;
766 inbytesleft = inbytesleft - *no_read;
/* Read handler for "MARC8s" input: like yaz_read_marc8, but when exactly
   one character is buffered and it forms a known pair with the character
   just read, the pair is replaced by the single Latin-1 character from
   latin1_comb and the byte count of the buffered character is added to
   *no_read.
   NOTE(review): the state update after a match and the return are not
   visible in this listing. */
771 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
772 size_t inbytesleft, size_t *no_read)
774 unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
775 if (x && cd->comb_size == 1)
777 /* For MARC8s we try to get a Latin-1 page code out of it */
779 for (i = 0; latin1_comb[i].x1; i++)
/* x is the base letter (x1); the buffered character is the mark (x2) */
780 if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
782 *no_read += cd->comb_no_read[0];
784 x = latin1_comb[i].y;
/* Decode one MARC-8 character.  Leading escape sequences (ESC, optional
   intermediate characters from "(,$!)-", then a final character) are
   consumed first; the final character is stored in cd->marc8_esc_mode and
   the consumed bytes are added to *no_read.  The character itself is then
   decoded with the yaz_marc8_N_conv table selected by the escape mode.
   An escape sequence cut short by the end of input sets YAZ_ICONV_EINVAL;
   YAZ_ICONV_EILSEQ is set on the path shown after the table cases
   (presumably an unknown escape mode; the default label is not visible).
   NOTE(review): pointer/counter updates inside the loops, the break
   statements and the return are not visible in this listing. */
791 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
792 size_t inbytesleft, size_t *no_read,
796 while(inbytesleft >= 1 && inp[0] == 27)
798 size_t inbytesleft0 = inbytesleft;
/* strchr() also matches the terminating NUL of the set, so a 0 byte in the
   input used to be accepted as an intermediate character; test for it
   explicitly so that it is treated as the (invalid) final character */
801 while(inbytesleft > 0 && *inp && strchr("(,$!)-", *inp))
806 if (inbytesleft <= 0)
809 cd->my_errno = YAZ_ICONV_EINVAL;
812 cd->marc8_esc_mode = *inp++;
814 (*no_read) += inbytesleft0 - inbytesleft;
816 if (inbytesleft <= 0)
821 size_t no_read_sub = 0;
824 switch(cd->marc8_esc_mode)
826 case 'B': /* Basic ASCII */
827 case 'E': /* ANSEL */
828 case 's': /* ASCII */
829 x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
831 case 'g': /* Greek */
832 x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
834 case 'b': /* Subscripts */
835 x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
837 case 'p': /* Superscripts */
838 x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
840 case '2': /* Basic Hebrew */
841 x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
843 case 'N': /* Basic Cyrillic */
844 case 'Q': /* Extended Cyrillic */
845 x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
847 case '3': /* Basic Arabic */
848 case '4': /* Extended Arabic */
849 x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
851 case 'S': /* Greek */
852 x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
854 case '1': /* Chinese, Japanese, Korean (EACC) */
855 x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
859 cd->my_errno = YAZ_ICONV_EILSEQ;
/* bytes used by the table lookup come on top of any escape bytes */
862 *no_read += no_read_sub;
/* Write handler for UTF-8 output: delegates to yaz_write_UTF8_char and lets
   it store any error code directly in cd->my_errno. */
867 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
868 char **outbuf, size_t *outbytesleft,
871 return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
/* Encode character x as UTF-8 at *outbuf.  The sequence length follows from
   the value: 1 byte up to 0x7f, 2 up to 0x7ff, 3 up to 0xffff, 4 up to
   0x1fffff, 5 up to 0x3ffffff and 6 bytes otherwise.  A branch is only taken
   when *outbytesleft has room for the whole sequence; *outbytesleft is
   reduced by the number of bytes written and *outbuf is advanced past them.
   When no branch applies, *error is set to YAZ_ICONV_E2BIG.
   NOTE(review): the last parameter line, the single-byte count update and
   the return statements are not visible in this listing. */
874 size_t yaz_write_UTF8_char(unsigned long x,
875 char **outbuf, size_t *outbytesleft,
878 unsigned char *outp = (unsigned char *) *outbuf;
880 if (x <= 0x7f && *outbytesleft >= 1)
882 *outp++ = (unsigned char) x;
885 else if (x <= 0x7ff && *outbytesleft >= 2)
887 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
888 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
889 (*outbytesleft) -= 2;
891 else if (x <= 0xffff && *outbytesleft >= 3)
893 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
894 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
895 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
896 (*outbytesleft) -= 3;
898 else if (x <= 0x1fffff && *outbytesleft >= 4)
900 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
901 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
902 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
903 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
904 (*outbytesleft) -= 4;
906 else if (x <= 0x3ffffff && *outbytesleft >= 5)
908 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
909 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
910 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
911 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
912 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
913 (*outbytesleft) -= 5;
/* no upper bound on x here: this branch also catches smaller values whose
   own branch was skipped for lack of room but where 6 bytes are available
   -- NOTE(review): cannot happen, since 6 bytes of room satisfy every
   earlier length test */
915 else if (*outbytesleft >= 6)
917 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
918 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
919 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
920 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
921 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
922 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
923 (*outbytesleft) -= 6;
927 *error = YAZ_ICONV_E2BIG; /* not room for output */
930 *outbuf = (char *) outp;
/* Write handler for ISO-8859-1 output with one character of look-behind.
   A printable ASCII character (33..126) that is not the last one is held in
   cd->compose_char instead of being written.  On the next call the pair
   (held character, x) is looked up in latin1_comb; a match replaces x with
   the single Latin-1 character, otherwise the held character is written
   first.  Values outside 1..255 set YAZ_ICONV_EILSEQ; lack of output room
   sets YAZ_ICONV_E2BIG.
   NOTE(review): the last parameter line, the break after a match, the
   *outbytesleft updates and the return statements are not visible in this
   listing. */
935 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
936 char **outbuf, size_t *outbytesleft,
939 /* list of two char unicode sequence that, when combined, are
940 equivalent to single unicode chars that can be represented in
942 Regular iconv on Linux at least does not seem to convert these,
943 but since MARC-8 to UTF-8 generates these composed sequence
944 we get a better chance of a successful MARC-8 -> ISO-8859-1
946 unsigned char *outp = (unsigned char *) *outbuf;
948 if (cd->compose_char)
951 for (i = 0; latin1_comb[i].x1; i++)
952 if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
954 x = latin1_comb[i].y;
957 if (*outbytesleft < 1)
958 { /* no room. Retain compose_char and bail out */
959 cd->my_errno = YAZ_ICONV_E2BIG;
/* i stopped on the terminating entry, so the loop found no pair */
962 if (!latin1_comb[i].x1)
963 { /* not found. Just write compose_char */
964 *outp++ = (unsigned char) cd->compose_char;
966 *outbuf = (char *) outp;
968 /* compose_char used so reset it. x now holds current char */
969 cd->compose_char = 0;
/* hold back a possible base letter unless this is the final character */
972 if (!last && x > 32 && x < 127 && cd->compose_char == 0)
974 cd->compose_char = x;
977 else if (x > 255 || x < 1)
979 cd->my_errno = YAZ_ICONV_EILSEQ;
982 else if (*outbytesleft < 1)
984 cd->my_errno = YAZ_ICONV_E2BIG;
987 *outp++ = (unsigned char) x;
989 *outbuf = (char *) outp;
/* Write handler for big-endian UCS-4 output: stores x as four bytes, most
   significant first, and advances *outbuf.  With fewer than 4 bytes of room
   it sets YAZ_ICONV_E2BIG and returns (size_t)(-1).
   NOTE(review): the last parameter line and the success return are not
   visible in this listing. */
994 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
995 char **outbuf, size_t *outbytesleft,
998 unsigned char *outp = (unsigned char *) *outbuf;
999 if (*outbytesleft >= 4)
1001 *outp++ = (unsigned char) (x>>24);
1002 *outp++ = (unsigned char) (x>>16);
1003 *outp++ = (unsigned char) (x>>8);
1004 *outp++ = (unsigned char) x;
1005 (*outbytesleft) -= 4;
1009 cd->my_errno = YAZ_ICONV_E2BIG;
1010 return (size_t)(-1);
1012 *outbuf = (char *) outp;
/* Write handler for little-endian UCS-4 output: stores x as four bytes,
   least significant first, and advances *outbuf.  With fewer than 4 bytes of
   room it sets YAZ_ICONV_E2BIG and returns (size_t)(-1).
   NOTE(review): the last parameter line and the success return are not
   visible in this listing. */
1016 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
1017 char **outbuf, size_t *outbytesleft,
1020 unsigned char *outp = (unsigned char *) *outbuf;
1021 if (*outbytesleft >= 4)
1023 *outp++ = (unsigned char) x;
1024 *outp++ = (unsigned char) (x>>8);
1025 *outp++ = (unsigned char) (x>>16);
1026 *outp++ = (unsigned char) (x>>24);
1027 (*outbytesleft) -= 4;
1031 cd->my_errno = YAZ_ICONV_E2BIG;
1032 return (size_t)(-1);
1034 *outbuf = (char *) outp;
/* Find the MARC-8 code for Unicode character x.  The character is first
   encoded as UTF-8 into a local buffer (failure sets YAZ_ICONV_EILSEQ), the
   buffer is NUL-terminated, and the reverse tables yaz_marc8r_1_conv ..
   yaz_marc8r_9_conv are consulted in that order.  *page_chr receives the
   escape sequence that selects the page of the table used.  If the end of
   the chain is reached, YAZ_ICONV_EILSEQ is set.
   NOTE(review): the declaration of utf8_buf, the tests on each lookup result
   and the return statements are not visible in this listing; presumably the
   first table that yields a non-zero value wins. */
1038 static unsigned long lookup_marc8(yaz_iconv_t cd,
1039 unsigned long x, int *comb,
1040 const char **page_chr)
1043 char *utf8_outbuf = utf8_buf;
/* one byte is kept in reserve for the terminating NUL written below */
1044 size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
1046 r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
1047 if (r == (size_t)(-1))
1049 cd->my_errno = YAZ_ICONV_EILSEQ;
1055 size_t inbytesleft, no_read_sub = 0;
1058 *utf8_outbuf = '\0';
1059 inp = (unsigned char *) utf8_buf;
1060 inbytesleft = strlen(utf8_buf);
1062 x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
1065 *page_chr = "\033(B";
1068 x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
1071 *page_chr = "\033g";
1074 x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
1077 *page_chr = "\033b";
1080 x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
1083 *page_chr = "\033p";
1086 x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
1089 *page_chr = "\033(2";
1092 x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
1095 *page_chr = "\033(N";
1098 x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
1101 *page_chr = "\033(3";
1104 x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
1107 *page_chr = "\033(S";
1110 x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
1113 *page_chr = "\033$1";
1116 cd->my_errno = YAZ_ICONV_EILSEQ;
/* Write the pending MARC-8 base character together with its combining
   characters.  The base character cd->write_marc8_last is split into up to
   three bytes (bits 16-23, 8-15, 0-7).  Output order is: all pending
   combining bytes, then the base character bytes, then the pending
   "second half" byte if one is set.  If the output lacks room for all of it
   (one extra byte is reserved), YAZ_ICONV_E2BIG is set and (size_t)(-1) is
   returned before anything is written.  On success the pending state is
   cleared.
   NOTE(review): the declarations of byte and out_buf, the tests deciding
   which of the three bytes are stored, several counter updates and the
   return are not visible in this listing. */
1121 static size_t flush_combos(yaz_iconv_t cd,
1122 char **outbuf, size_t *outbytesleft)
1124 unsigned long y = cd->write_marc8_last;
1127 size_t i, out_no = 0;
1132 byte = (unsigned char )((y>>16) & 0xff);
1134 out_buf[out_no++] = byte;
1135 byte = (unsigned char)((y>>8) & 0xff);
1137 out_buf[out_no++] = byte;
1138 byte = (unsigned char )(y & 0xff);
1140 out_buf[out_no++] = byte;
/* the +1 leaves room for a possible second-half byte */
1142 if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
1144 cd->my_errno = YAZ_ICONV_E2BIG;
1145 return (size_t) (-1);
1148 for (i = 0; i < cd->write_marc8_comb_no; i++)
1150 /* all MARC-8 combined characters are simple bytes */
1151 byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
1152 *(*outbuf)++ = byte;
1155 memcpy(*outbuf, out_buf, out_no);
1157 (*outbytesleft) -= out_no;
1158 if (cd->write_marc8_second_half_char)
1160 *(*outbuf)++ = cd->write_marc8_second_half_char;
1164 cd->write_marc8_last = 0;
1165 cd->write_marc8_comb_no = 0;
1166 cd->write_marc8_second_half_char = 0;
/* Core MARC-8 write step for one Unicode character x.
   The character is translated with lookup_marc8.  Combining characters are
   queued in cd->write_marc8_comb_ch (at most 6 are kept); for the double
   diacritics a closing "second half" byte (0xEC or 0xFB) is remembered.
   Otherwise the previously pending character is flushed with flush_combos,
   a page switch is written when the page of the new character differs from
   the current one (8 bytes of room are required for that), and the new
   code becomes the pending character cd->write_marc8_last.
   NOTE(review): the last parameter line, most branch conditions (including
   the test paired with "else if (x == 0x0360)"), the byte written when
   leaving a technique-1 page and the return statements are not visible in
   this listing. */
1170 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
1171 char **outbuf, size_t *outbytesleft,
1175 const char *page_chr = 0;
1176 unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
1179 return (size_t) (-1);
1184 cd->write_marc8_second_half_char = 0xEC;
1185 else if (x == 0x0360)
1186 cd->write_marc8_second_half_char = 0xFB;
/* the array holds 8 entries; anything beyond 6 queued marks is dropped */
1188 if (cd->write_marc8_comb_no < 6)
1189 cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
1193 size_t r = flush_combos(cd, outbuf, outbytesleft);
1194 const char *old_page_chr = cd->write_marc8_page_chr;
/* strcmp is non-zero when the page changes */
1197 if (strcmp(page_chr, old_page_chr))
1200 const char *page_out = page_chr;
1202 if (*outbytesleft < 8)
1204 cd->my_errno = YAZ_ICONV_E2BIG;
1206 return (size_t) (-1);
1208 cd->write_marc8_page_chr = page_chr;
1210 if (!strcmp(old_page_chr, "\033p")
1211 || !strcmp(old_page_chr, "\033g")
1212 || !strcmp(old_page_chr, "\033b"))
1214 /* Technique 1 leave */
1216 if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
1218 /* Must leave script + enter new page */
1219 plen = strlen(page_out);
1220 memcpy(*outbuf, page_out, plen);
1222 (*outbytesleft) -= plen;
1223 page_out = page_chr;
1226 plen = strlen(page_out);
1227 memcpy(*outbuf, page_out, plen);
1229 (*outbytesleft) -= plen;
1231 cd->write_marc8_last = y;
1235 size_t r = flush_combos(cd, outbuf, outbytesleft);
1239 cd->write_marc8_comb_no--;
1241 cd->write_marc8_last = 0;
/* Flush handler for MARC-8 output: when the current page is not the ASCII
   page ("\033(B"), write that three-byte escape sequence so the output ends
   in ASCII.  With fewer than 3 bytes of room it sets YAZ_ICONV_E2BIG and
   returns (size_t)(-1).
   NOTE(review): the pointer/counter updates after the memcpy, any update of
   cd->write_marc8_page_chr and the return are not visible in this
   listing. */
1248 static size_t yaz_flush_marc8(yaz_iconv_t cd,
1249 char **outbuf, size_t *outbytesleft)
1251 if (strcmp(cd->write_marc8_page_chr, "\033(B"))
1253 if (*outbytesleft < 3)
1255 cd->my_errno = YAZ_ICONV_E2BIG;
1256 return (size_t) (-1);
1258 memcpy(*outbuf, "\033(B", 3);
/* Write handler for MARC-8 output.  A character that equals the composed
   value (y) of a latin1_comb entry is written as its two parts: the base
   letter x1 followed by the combining mark x2, each through
   yaz_write_marc8_2.  If the second part fails with YAZ_ICONV_E2BIG the
   output position and the pending character are rolled back so the caller
   can retry the whole character.  Any other character goes straight to
   yaz_write_marc8_2.
   NOTE(review): the last parameter line, the test after the first write,
   the restore of *outbuf and the returns inside the loop are not visible in
   this listing. */
1265 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
1266 char **outbuf, size_t *outbytesleft,
1270 for (i = 0; latin1_comb[i].x1; i++)
1272 if (x == latin1_comb[i].y)
1275 /* save the output pointers .. */
1276 char *outbuf0 = *outbuf;
1277 size_t outbytesleft0 = *outbytesleft;
/* same type as cd->write_marc8_last so the saved value is restored without
   narrowing (it was an int) */
1278 unsigned long last_ch = cd->write_marc8_last;
1280 r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
1281 outbuf, outbytesleft, 0);
1284 r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
1285 outbuf, outbytesleft, last);
1286 if (r && cd->my_errno == YAZ_ICONV_E2BIG)
1288 /* not enough room. reset output to original values */
1290 *outbytesleft = outbytesleft0;
1291 cd->write_marc8_last = last_ch;
1296 return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
/* Write handler for native wchar_t output: copies one wchar_t into the
   output and advances by sizeof(wchar_t).  With less room than that it sets
   YAZ_ICONV_E2BIG and returns (size_t)(-1).
   NOTE(review): the last parameter line, the declaration and assignment of
   wch and the success return are not visible in this listing. */
1301 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
1302 char **outbuf, size_t *outbytesleft,
1305 unsigned char *outp = (unsigned char *) *outbuf;
1307 if (*outbytesleft >= sizeof(wchar_t))
/* memcpy avoids an unaligned wchar_t store into the raw output buffer */
1310 memcpy(outp, &wch, sizeof(wch));
1311 outp += sizeof(wch);
1312 (*outbytesleft) -= sizeof(wch);
1316 cd->my_errno = YAZ_ICONV_E2BIG;
1317 return (size_t)(-1);
1319 *outbuf = (char *) outp;
/* Returns non-zero when both a read and a write handler are set on cd,
   i.e. when the conversion is done by the handlers in this file. */
1324 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
1326 return cd->read_handle && cd->write_handle;
/* Create a conversion handle from fromcode to tocode.
   The handle is allocated with xmalloc and all handlers start out unset.
   fromcode and tocode are matched with yaz_matchstr against the built-in
   names (UTF8, ISO88591, UCS4, UCS4LE, MARC8, MARC8s, advancegreek,
   WCHAR_T) to pick a read and a write handler; UTF-8 input also gets an
   init handler and MARC-8 output a flush handler.  If either handler is
   still unset, the system iconv is opened with iconv_open.
   NOTE(review): the handling of a leading '@', the preprocessor conditions
   around WCHAR_T and iconv, the failure paths and the return are not
   visible in this listing. */
1329 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
1331 yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
1333 cd->write_handle = 0;
1334 cd->read_handle = 0;
1335 cd->init_handle = 0;
1336 cd->flush_handle = 0;
1337 cd->my_errno = YAZ_ICONV_UNKNOWN;
1339 /* a useful hack: if fromcode has leading @,
1340 the library will not use YAZ's own conversions .. */
1341 if (fromcode[0] == '@')
1345 if (!yaz_matchstr(fromcode, "UTF8"))
1347 cd->read_handle = yaz_read_UTF8;
1348 cd->init_handle = yaz_init_UTF8;
1350 else if (!yaz_matchstr(fromcode, "ISO88591"))
1351 cd->read_handle = yaz_read_ISO8859_1;
1352 else if (!yaz_matchstr(fromcode, "UCS4"))
1353 cd->read_handle = yaz_read_UCS4;
1354 else if (!yaz_matchstr(fromcode, "UCS4LE"))
1355 cd->read_handle = yaz_read_UCS4LE;
1356 else if (!yaz_matchstr(fromcode, "MARC8"))
1357 cd->read_handle = yaz_read_marc8;
1358 else if (!yaz_matchstr(fromcode, "MARC8s"))
1359 cd->read_handle = yaz_read_marc8s;
1360 else if (!yaz_matchstr(fromcode, "advancegreek"))
1361 cd->read_handle = yaz_read_advancegreek;
1363 else if (!yaz_matchstr(fromcode, "WCHAR_T"))
1364 cd->read_handle = yaz_read_wchar_t;
1367 if (!yaz_matchstr(tocode, "UTF8"))
1368 cd->write_handle = yaz_write_UTF8;
1369 else if (!yaz_matchstr(tocode, "ISO88591"))
1370 cd->write_handle = yaz_write_ISO8859_1;
1371 else if (!yaz_matchstr (tocode, "UCS4"))
1372 cd->write_handle = yaz_write_UCS4;
1373 else if (!yaz_matchstr(tocode, "UCS4LE"))
1374 cd->write_handle = yaz_write_UCS4LE;
1375 else if (!yaz_matchstr(tocode, "MARC8"))
1377 cd->write_handle = yaz_write_marc8;
1378 cd->flush_handle = yaz_flush_marc8;
/* MARC8s output uses the same writer and flush handler as MARC8 */
1380 else if (!yaz_matchstr(tocode, "MARC8s"))
1382 cd->write_handle = yaz_write_marc8;
1383 cd->flush_handle = yaz_flush_marc8;
1385 else if (!yaz_matchstr(tocode, "advancegreek"))
1387 cd->write_handle = yaz_write_advancegreek;
1390 else if (!yaz_matchstr(tocode, "WCHAR_T"))
1391 cd->write_handle = yaz_write_wchar_t;
/* fall back to the system iconv unless both directions are built in */
1396 if (!cd->read_handle || !cd->write_handle)
1398 cd->iconv_cd = iconv_open (tocode, fromcode);
1399 if (cd->iconv_cd == (iconv_t) (-1))
1406 if (!cd->read_handle || !cd->write_handle)
/* Convert input to output, iconv(3)-style.
   When the system iconv is in use the call is forwarded to iconv() and a
   failure is translated into one of the YAZ_ICONV_* codes via yaz_errno().
   For built-in conversions the function resets the per-conversion state
   (escape mode 'B', empty combining buffers, ASCII page "\033(B"), runs the
   optional init handler on the input, and then reads characters with
   read_handle and writes them with write_handle, advancing *inbuf and
   reducing *inbytesleft by the bytes each character used.  write_handle is
   told whether the character is the last one of the input.  With no input
   buffer the optional flush handler is called instead.
   NOTE(review): the loop structure, the conditions selecting these paths,
   the case labels of the errno switch and the return are not visible in
   this listing. */
1416 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1417 char **outbuf, size_t *outbytesleft)
1426 iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1427 if (r == (size_t)(-1))
1429 switch (yaz_errno())
1432 cd->my_errno = YAZ_ICONV_E2BIG;
1435 cd->my_errno = YAZ_ICONV_EINVAL;
1438 cd->my_errno = YAZ_ICONV_EILSEQ;
1441 cd->my_errno = YAZ_ICONV_UNKNOWN;
1453 cd->my_errno = YAZ_ICONV_UNKNOWN;
1454 cd->marc8_esc_mode = 'B';
1456 cd->comb_offset = cd->comb_size = 0;
1457 cd->compose_char = 0;
1459 cd->write_marc8_comb_no = 0;
1460 cd->write_marc8_second_half_char = 0;
1461 cd->write_marc8_last = 0;
1462 cd->write_marc8_page_chr = "\033(B";
1470 if (cd->init_handle && inbuf && *inbuf)
1473 size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1474 *inbytesleft, &no_read);
1477 if (cd->my_errno == YAZ_ICONV_EINVAL)
1482 *inbytesleft -= no_read;
1496 no_read = cd->no_read_x;
1498 else if (inbuf && *inbuf)
1500 if (*inbytesleft == 0)
1502 r = *inbuf - inbuf0;
1505 x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1516 if (cd->flush_handle && outbuf && *outbuf)
1517 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
1524 r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1525 (*inbytesleft - no_read) == 0 ? 1 : 0);
1528 /* unable to write it. save it because read_handle cannot
1530 if (cd->my_errno == YAZ_ICONV_E2BIG)
1533 cd->no_read_x = no_read;
1539 *inbytesleft -= no_read;
1540 (*inbuf) += no_read;
/* Returns the YAZ_ICONV_* code currently stored in cd->my_errno. */
1545 int yaz_iconv_error (yaz_iconv_t cd)
1547 return cd->my_errno;
/* Release a conversion handle; the system iconv descriptor is closed with
   iconv_close.
   NOTE(review): the condition guarding iconv_close, the release of cd
   itself and the return value are not visible in this listing. */
1550 int yaz_iconv_close (yaz_iconv_t cd)
1554 iconv_close (cd->iconv_cd);
1563 * indent-tabs-mode: nil
1565 * vim: shiftwidth=4 tabstop=8 expandtab