From: Wolfram Schneider Date: Mon, 17 Nov 2008 14:17:16 +0000 (+0100) Subject: add copy of iconv_decode_marc8.c X-Git-Tag: v3.0.40~64 X-Git-Url: http://lists.indexdata.dk/?a=commitdiff_plain;h=e6d6af3d3fe9fca9fa78dfc62f5c6887778744ba;p=yaz-moved-to-github.git add copy of iconv_decode_marc8.c --- diff --git a/src/iconv_decode_iso5426.c b/src/iconv_decode_iso5426.c new file mode 100644 index 0000000..eecee04 --- /dev/null +++ b/src/iconv_decode_iso5426.c @@ -0,0 +1,287 @@ +/* This file is part of the YAZ toolkit. + * Copyright (C) 1995-2008 Index Data + * See the file LICENSE for details. + */ +/** + * \file + * \brief MARC-8 decoding + * + * MARC-8 reference: + * http://www.loc.gov/marc/specifications/speccharmarc8.html + */ + +#if HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include + +#include +#include "iconv-p.h" + +struct decoder_data { + int g0_mode; + int g1_mode; + + int comb_offset; + int comb_size; + unsigned long comb_x[8]; + size_t comb_no_read[8]; +}; + +yaz_conv_func_t yaz_marc8_42_conv; +yaz_conv_func_t yaz_marc8_45_conv; +yaz_conv_func_t yaz_marc8_67_conv; +yaz_conv_func_t yaz_marc8_62_conv; +yaz_conv_func_t yaz_marc8_70_conv; +yaz_conv_func_t yaz_marc8_32_conv; +yaz_conv_func_t yaz_marc8_4E_conv; +yaz_conv_func_t yaz_marc8_51_conv; +yaz_conv_func_t yaz_marc8_33_conv; +yaz_conv_func_t yaz_marc8_34_conv; +yaz_conv_func_t yaz_marc8_53_conv; +yaz_conv_func_t yaz_marc8_31_conv; + + +static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, + struct decoder_data *data, + unsigned char *inp, + size_t inbytesleft, size_t *no_read, + int *comb); + +static unsigned long read_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inp, + size_t inbytesleft, size_t *no_read) +{ + struct decoder_data *data = (struct decoder_data *) d->data; + unsigned long x; + if (data->comb_offset < data->comb_size) + { + *no_read = data->comb_no_read[data->comb_offset]; + x = data->comb_x[data->comb_offset]; + + /* special case for double-diacritic combining characters, + INVERTED BREVE and DOUBLE TILDE. + We'll increment the no_read counter by 1, since we want to skip over + the processing of the closing ligature character + */ + /* this code is no longer necessary.. our handlers code in + yaz_marc8_?_conv (generated by charconv.tcl) now returns + 0 and no_read=1 when a sequence does not match the input. + The SECOND HALFs in codetables.xml produces a non-existant + entry in the conversion trie.. Hence when met, the input byte is + skipped as it should (in yaz_iconv) + */ +#if 0 + if (x == 0x0361 || x == 0x0360) + *no_read += 1; +#endif + data->comb_offset++; + return x; + } + + data->comb_offset = 0; + for (data->comb_size = 0; data->comb_size < 8; data->comb_size++) + { + int comb = 0; + + if (inbytesleft == 0 && data->comb_size) + { + yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL); + x = 0; + *no_read = 0; + break; + } + x = yaz_read_marc8_comb(cd, data, inp, inbytesleft, no_read, &comb); + if (!comb || !x) + break; + data->comb_x[data->comb_size] = x; + data->comb_no_read[data->comb_size] = *no_read; + inp += *no_read; + inbytesleft = inbytesleft - *no_read; + } + return x; +} + +static unsigned long read_marc8s(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inp, + size_t inbytesleft, size_t *no_read) +{ + struct decoder_data *data = (struct decoder_data *) d->data; + unsigned long x = read_marc8(cd, d, inp, inbytesleft, no_read); + if (x && data->comb_size == 1) + { + if (yaz_iso_8859_1_lookup_x12(x, data->comb_x[0], &x)) + { + *no_read += data->comb_no_read[0]; + data->comb_size = 0; + } + } + return x; +} + +static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, + struct decoder_data *data, + unsigned char *inp, + size_t inbytesleft, size_t *no_read, + int *comb) +{ + *no_read = 0; + while (inbytesleft > 0 && *inp == 27) + { + int *modep = &data->g0_mode; + size_t inbytesleft0 = inbytesleft; + + inbytesleft--; + inp++; + if (inbytesleft == 0) + goto incomplete; + if (*inp == '$') /* set with multiple bytes */ + { + inbytesleft--; + inp++; + } + if (inbytesleft == 0) + goto incomplete; + if (*inp == '(' || *inp == ',') /* G0 */ + { + inbytesleft--; + inp++; + } + else if (*inp == ')' || *inp == '-') /* G1 */ + { + inbytesleft--; + inp++; + modep = &data->g1_mode; + } + if (inbytesleft == 0) + goto incomplete; + if (*inp == '!') /* ANSEL is a special case */ + { + inbytesleft--; + inp++; + } + if (inbytesleft == 0) + goto incomplete; + *modep = *inp++; /* Final character */ + inbytesleft--; + + (*no_read) += inbytesleft0 - inbytesleft; + } + if (inbytesleft == 0) + return 0; + else if (*inp == ' ') + { + *no_read += 1; + return ' '; + } + else + { + unsigned long x; + size_t no_read_sub = 0; + int mode = *inp < 128 ? data->g0_mode : data->g1_mode; + *comb = 0; + + switch(mode) + { + case 'B': /* Basic ASCII */ + case 's': /* ASCII */ + x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case 'E': /* ANSEL */ + x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128); + break; + case 'g': /* Greek */ + x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case 'b': /* Subscripts */ + x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case 'p': /* Superscripts */ + x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case '2': /* Basic Hebrew */ + x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case 'N': /* Basic Cyrillic */ + x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case 'Q': /* Extended Cyrillic */ + x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case '3': /* Basic Arabic */ + x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case '4': /* Extended Arabic */ + x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case 'S': /* Greek */ + x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case '1': /* Chinese, Japanese, Korean (EACC) */ + x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + default: + *no_read = 0; + yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ); + return 0; + } + *no_read += no_read_sub; + return x; + } +incomplete: + *no_read = 0; + yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL); + return 0; +} + + +static size_t init_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inp, + size_t inbytesleft, size_t *no_read) +{ + struct decoder_data *data = (struct decoder_data *) d->data; + data->g0_mode = 'B'; + data->g1_mode = 'E'; + data->comb_offset = data->comb_size = 0; + return 0; +} + +void destroy_marc8(yaz_iconv_decoder_t d) +{ + struct decoder_data *data = (struct decoder_data *) d->data; + xfree(data); +} + +yaz_iconv_decoder_t yaz_marc8_decoder(const char *fromcode, + yaz_iconv_decoder_t d) +{ + if (!yaz_matchstr(fromcode, "MARC8") || !yaz_matchstr(fromcode, "ANSEL")) + d->read_handle = read_marc8; + else if (!yaz_matchstr(fromcode, "ISO5426")) + d->read_handle = read_marc8; + else if (!yaz_matchstr(fromcode, "MARC8s")) + d->read_handle = read_marc8s; + else + return 0; + { + struct decoder_data *data = (struct decoder_data *) + xmalloc(sizeof(*data)); + d->data = data; + d->init_handle = init_marc8; + d->destroy_handle = destroy_marc8; + } + return d; +} + + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */