From: Adam Dickmeiss Date: Mon, 17 Mar 2008 14:32:59 +0000 (+0100) Subject: MARC-8 ANSEL fix and proper better handling of incompl. sequences. X-Git-Tag: v3.0.30~81 X-Git-Url: http://lists.indexdata.dk/?a=commitdiff_plain;h=af3fd923edc18c8ba6a0b629632e4a11ea30e8eb;p=yaz-moved-to-github.git MARC-8 ANSEL fix and proper better handling of incompl. sequences. The routine yaz_read_marc8_comb was modified to handle the escape sequence for ANSEL properly . Thanks to Gary Anderson for pointing out the problem with ANSEL. A few tests were added to test for the use of incomplete sequences. --- diff --git a/src/siconv.c b/src/siconv.c index 17e3d89..1639759 100644 --- a/src/siconv.c +++ b/src/siconv.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 1995-2007, Index Data ApS + * Copyright (C) 1995-2008, Index Data ApS * See the file LICENSE for details. * * $Id: siconv.c,v 1.50 2008-03-12 08:53:28 adam Exp $ @@ -1167,41 +1167,48 @@ static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp, int *comb) { *no_read = 0; - while(inbytesleft >= 1 && inp[0] == 27) + while (inbytesleft > 0 && *inp == 27) { - int ch; + int *modep = &cd->g0_mode; size_t inbytesleft0 = inbytesleft; - inp++; + inbytesleft--; - if (inbytesleft > 0 && *inp == '$') + inp++; + if (inbytesleft == 0) + goto incomplete; + if (*inp == '$') /* set with multiple bytes */ { inbytesleft--; inp++; } - if (inbytesleft <= 1) + if (inbytesleft == 0) + goto incomplete; + if (*inp == '(' || *inp == ',') /* G0 */ { - *no_read = 0; - cd->my_errno = YAZ_ICONV_EINVAL; - return 0; + inbytesleft--; + inp++; } - inbytesleft--; - ch = *inp++; - if (inbytesleft > 0 && (ch == '(' || ch == ',')) + else if (*inp == ')' || *inp == '-') /* G1 */ { inbytesleft--; - cd->g0_mode = *inp++; + inp++; + modep = &cd->g1_mode; } - else if (inbytesleft > 0 && (ch == ')' || ch == '-')) + if (inbytesleft == 0) + goto incomplete; + if (*inp == '!') /* ANSEL is a special case */ { inbytesleft--; - cd->g1_mode = *inp++; + inp++; } - else - cd->g0_mode = ch; + if (inbytesleft == 0) + goto incomplete; + *modep = *inp++; /* Final character */ + inbytesleft--; (*no_read) += inbytesleft0 - inbytesleft; } - if (inbytesleft <= 0) + if (inbytesleft == 0) return 0; else if (*inp == ' ') { @@ -1265,6 +1272,10 @@ static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp, *no_read += no_read_sub; return x; } +incomplete: + *no_read = 0; + cd->my_errno = YAZ_ICONV_EINVAL; + return 0; } static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x, diff --git a/test/tsticonv.c b/test/tsticonv.c index 14129db..f31b6d6 100644 --- a/test/tsticonv.c +++ b/test/tsticonv.c @@ -89,9 +89,10 @@ static int tst_convert_l(yaz_iconv_t cd, size_t in_len, const char *in_buf, outbuf - outbuf0, outbuf0); } -static int tst_convert(yaz_iconv_t cd, const char *buf, const char *cmpbuf) +static int tst_convert_x(yaz_iconv_t cd, const char *buf, const char *cmpbuf, + int expect_error) { - int ret = 0; + int ret = 1; WRBUF b = wrbuf_alloc(); char outbuf[12]; size_t inbytesleft = strlen(buf); @@ -108,7 +109,12 @@ static int tst_convert(yaz_iconv_t cd, const char *buf, const char *cmpbuf) { int e = yaz_iconv_error(cd); if (e != YAZ_ICONV_E2BIG) + { + if (expect_error != -1) + if (e != expect_error) + ret = 0; break; + } } else { @@ -116,16 +122,20 @@ static int tst_convert(yaz_iconv_t cd, const char *buf, const char *cmpbuf) char *outp = outbuf; r = yaz_iconv(cd, 0, 0, &outp, &outbytesleft); wrbuf_write(b, outbuf, outp - outbuf); + if (expect_error != -1) + if (expect_error) + ret = 0; break; } } if (wrbuf_len(b) == strlen(cmpbuf) && !memcmp(cmpbuf, wrbuf_buf(b), wrbuf_len(b))) - ret = 1; + ; else { WRBUF w = wrbuf_alloc(); + ret = 0; wrbuf_rewind(w); wrbuf_puts_escaped(w, buf); yaz_log(YLOG_LOG, "input %s", wrbuf_cstr(w)); @@ -145,6 +155,10 @@ static int tst_convert(yaz_iconv_t cd, const char *buf, const char *cmpbuf) return ret; } +static int tst_convert(yaz_iconv_t cd, const char *buf, const char *cmpbuf) +{ + return tst_convert_x(cd, buf, cmpbuf, 0); +} /* some test strings in ISO-8859-1 format */ static const char *iso_8859_1_a[] = { @@ -388,19 +402,26 @@ static void tst_marc8_to_utf8(void) if (!cd) return; - /* bug #2115 */ - YAZ_CHECK(tst_convert(cd, ESC "(N" ESC ")Qp" ESC "(B", "\xd0\x9f")); - - YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math")); /* COMBINING ACUTE ACCENT */ YAZ_CHECK(tst_convert(cd, "Cours de mathâe", "Cours de mathe\xcc\x81")); - YAZ_CHECK(tst_convert(cd, "a\xea\x1e", "a\x1e\xcc\x8a")); + YAZ_CHECK(tst_convert(cd, "\xea" "a", "a\xcc\x8a")); + YAZ_CHECK(tst_convert(cd, "a" "\xea" "\x1e", "a" "\x1e\xcc\x8a")); + YAZ_CHECK(tst_convert(cd, "a" "\xea" "p", "a" "p\xcc\x8a")); + + YAZ_CHECK(tst_convert_x(cd, "a\xea", "a", YAZ_ICONV_EINVAL)); + YAZ_CHECK(tst_convert(cd, "p", "\xcc\x8a")); /* note: missing p */ + yaz_iconv(cd, 0, 0, 0, 0); /* incomplete. so we have to reset */ + + /* bug #2115 */ + YAZ_CHECK(tst_convert(cd, ESC "(N" ESC ")Qp" ESC "(B", "\xd0\x9f")); - YAZ_CHECK(tst_convert(cd, "a\xea", "a")); + YAZ_CHECK(tst_convert_x(cd, ESC , "", YAZ_ICONV_EINVAL)); + YAZ_CHECK(tst_convert_x(cd, ESC "(", "", YAZ_ICONV_EINVAL)); + YAZ_CHECK(tst_convert_x(cd, ESC "(B", "", 0)); yaz_iconv_close(cd); }