X-Git-Url: http://lists.indexdata.dk/cgi-bin?a=blobdiff_plain;f=src%2Fsiconv.c;h=525574843cb8c572dc5f7d5b251049c5ddcbb1bd;hb=6d61fec3c06d8ed1d648ec7da417a1a8aaa52691;hp=bdb20ee374efdc4619aecbe0bfb06870dbe999b9;hpb=cd6aeaa68dceee3c268dbb354fd32aadbc9fc942;p=yaz-moved-to-github.git diff --git a/src/siconv.c b/src/siconv.c index bdb20ee..5255748 100644 --- a/src/siconv.c +++ b/src/siconv.c @@ -2,7 +2,7 @@ * Copyright (C) 1995-2007, Index Data ApS * See the file LICENSE for details. * - * $Id: siconv.c,v 1.40 2007-05-03 22:20:45 adam Exp $ + * $Id: siconv.c,v 1.44 2007-09-22 18:49:55 adam Exp $ */ /** * \file siconv.c @@ -36,44 +36,56 @@ #include -unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8_42_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8_45_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8_67_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8_62_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8_70_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8_32_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8_4E_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8_51_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft, +unsigned long yaz_marc8_33_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8_34_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8_53_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8_31_conv(unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining); -unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); -unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft, - size_t *no_read, int *combining); +unsigned long yaz_marc8r_42_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8r_45_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8r_67_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8r_62_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8r_70_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8r_32_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8r_4E_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8r_51_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8r_33_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8r_34_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8r_53_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); +unsigned long yaz_marc8r_31_conv(unsigned char *inp, size_t inbytesleft, + size_t *no_read, int *combining); struct yaz_iconv_struct { int my_errno; @@ -212,6 +224,7 @@ unsigned long yaz_read_UTF8_char(unsigned char *inp, { unsigned long x = 0; + *no_read = 0; /* by default */ if (inp[0] <= 0x7f) { x = inp[0]; @@ -219,75 +232,86 @@ unsigned long yaz_read_UTF8_char(unsigned char *inp, } else if (inp[0] <= 0xbf || inp[0] >= 0xfe) { - *no_read = 0; *error = YAZ_ICONV_EILSEQ; } else if (inp[0] <= 0xdf && inbytesleft >= 2) { - x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f); - if (x >= 0x80) - *no_read = 2; - else + if ((inp[1] & 0xc0) == 0x80) { - *no_read = 0; - *error = YAZ_ICONV_EILSEQ; + x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f); + if (x >= 0x80) + *no_read = 2; + else + *error = YAZ_ICONV_EILSEQ; } + else + *error = YAZ_ICONV_EILSEQ; } else if (inp[0] <= 0xef && inbytesleft >= 3) { - x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) | - (inp[2] & 0x3f); - if (x >= 0x800) - *no_read = 3; - else + if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80) { - *no_read = 0; - *error = YAZ_ICONV_EILSEQ; + x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) | + (inp[2] & 0x3f); + if (x >= 0x800) + *no_read = 3; + else + *error = YAZ_ICONV_EILSEQ; } - } + else + *error = YAZ_ICONV_EILSEQ; + } else if (inp[0] <= 0xf7 && inbytesleft >= 4) { - x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) | - ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f); - if (x >= 0x10000) - *no_read = 4; - else + if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80 + && (inp[3] & 0xc0) == 0x80) { - *no_read = 0; - *error = YAZ_ICONV_EILSEQ; + x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) | + ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f); + if (x >= 0x10000) + *no_read = 4; + else + *error = YAZ_ICONV_EILSEQ; } + else + *error = YAZ_ICONV_EILSEQ; } else if (inp[0] <= 0xfb && inbytesleft >= 5) { - x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) | - ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) | - (inp[4] & 0x3f); - if (x >= 0x200000) - *no_read = 5; - else + if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80 + && (inp[3] & 0xc0) == 0x80 && (inp[4] & 0xc0) == 0x80) { - *no_read = 0; - *error = YAZ_ICONV_EILSEQ; + x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) | + ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) | + (inp[4] & 0x3f); + if (x >= 0x200000) + *no_read = 5; + else + *error = YAZ_ICONV_EILSEQ; } + else + *error = YAZ_ICONV_EILSEQ; } else if (inp[0] <= 0xfd && inbytesleft >= 6) { - x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) | - ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) | - ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f); - if (x >= 0x4000000) - *no_read = 6; - else + if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80 + && (inp[3] & 0xc0) == 0x80 && (inp[4] & 0xc0) == 0x80 + && (inp[5] & 0xc0) == 0x80) { - *no_read = 0; - *error = YAZ_ICONV_EILSEQ; + x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) | + ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) | + ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f); + if (x >= 0x4000000) + *no_read = 6; + else + *error = YAZ_ICONV_EILSEQ; } + else + *error = YAZ_ICONV_EILSEQ; } else - { - *no_read = 0; - *error = YAZ_ICONV_EINVAL; - } + *error = YAZ_ICONV_EINVAL; /* incomplete sentence */ + return x; } @@ -1158,6 +1182,11 @@ static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp, } if (inbytesleft <= 0) return 0; + else if (*inp == ' ') + { + *no_read += 1; + return ' '; + } else { unsigned long x; @@ -1167,35 +1196,44 @@ static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp, switch(cd->marc8_esc_mode) { case 'B': /* Basic ASCII */ - case 'E': /* ANSEL */ case 's': /* ASCII */ - x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb); + case 'E': /* ANSEL */ + x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb); + if (!x) + { + no_read_sub = 0; + x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb); + } break; case 'g': /* Greek */ - x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb); break; case 'b': /* Subscripts */ - x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb); break; case 'p': /* Superscripts */ - x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb); break; case '2': /* Basic Hebrew */ - x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb); break; case 'N': /* Basic Cyrillic */ + x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb); + break; case 'Q': /* Extended Cyrillic */ - x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb); break; case '3': /* Basic Arabic */ + x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb); + break; case '4': /* Extended Arabic */ - x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb); break; case 'S': /* Greek */ - x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb); break; case '1': /* Chinese, Japanese, Korean (EACC) */ - x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb); break; default: *no_read = 0; @@ -1414,56 +1452,77 @@ static unsigned long lookup_marc8(yaz_iconv_t cd, *utf8_outbuf = '\0'; inp = (unsigned char *) utf8_buf; inbytesleft = strlen(utf8_buf); + + if (x == ' ') + return x; - x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb); + if (x) + { + *page_chr = "\033(B"; + return x; + } + x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb); if (x) { *page_chr = "\033(B"; return x; } - x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_67_conv(inp, inbytesleft, &no_read_sub, comb); if (x) { *page_chr = "\033g"; return x; } - x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb); if (x) { *page_chr = "\033b"; return x; } - x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb); if (x) { *page_chr = "\033p"; return x; } - x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb); if (x) { *page_chr = "\033(2"; return x; } - x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb); if (x) { *page_chr = "\033(N"; return x; } - x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb); + if (x) + { + *page_chr = "\033(Q"; + return x; + } + x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb); if (x) { *page_chr = "\033(3"; return x; } - x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb); + if (x) + { + *page_chr = "\033(4"; + return x; + } + x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb); if (x) { *page_chr = "\033(S"; return x; } - x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb); + x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb); if (x) { *page_chr = "\033$1"; @@ -1592,9 +1651,12 @@ static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x, if (r) return r; - r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, page_chr); - if (r) - return r; + if (page_chr) + { + r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, page_chr); + if (r) + return r; + } cd->write_marc8_last = y; } return 0; @@ -1706,6 +1768,8 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) cd->read_handle = yaz_read_advancegreek; else if (!yaz_matchstr(fromcode, "iso54281984")) cd->read_handle = yaz_read_iso5428_1984; + else if (!yaz_matchstr(fromcode, "iso5428:1984")) + cd->read_handle = yaz_read_iso5428_1984; #if HAVE_WCHAR_H else if (!yaz_matchstr(fromcode, "WCHAR_T")) cd->read_handle = yaz_read_wchar_t; @@ -1740,6 +1804,10 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) { cd->write_handle = yaz_write_iso5428_1984; } + else if (!yaz_matchstr(tocode, "iso5428:1984")) + { + cd->write_handle = yaz_write_iso5428_1984; + } #if HAVE_WCHAR_H else if (!yaz_matchstr(tocode, "WCHAR_T")) cd->write_handle = yaz_write_wchar_t;