Fixes for MARC-8 in yaz_iconv character set utilities. The MARC-8
authorAdam Dickmeiss <adam@indexdata.dk>
Sat, 7 Aug 2004 08:18:19 +0000 (08:18 +0000)
committerAdam Dickmeiss <adam@indexdata.dk>
Sat, 7 Aug 2004 08:18:19 +0000 (08:18 +0000)
to UTF-8/UCS conversion is now only based on codetables.xml.
Thanks to Larry Dixson for reporting this error.

NEWS
src/Makefile.am
src/siconv.c
test/tsticonv.c
util/Makefile.am
util/marcdump.c
win/makefile
win/yaz.nsi

diff --git a/NEWS b/NEWS
index c578b8d..a00dae8 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,10 @@
 Possible compatibility problems with earlier versions marked with '*'.
 
+Fixes for MARC-8 in yaz_iconv character set utilities. The MARC-8
+to UTF-8/UCS conversion is now only based on codetables.xml.
+
+yaz_marc_decode_buf sets leader pos 9 to "a" for MARCXML output.
+
 --- 2.0.22 2004/08/06
 
 Add support for more "commit changes" in ZOOM (uses Extended Services).
diff --git a/src/Makefile.am b/src/Makefile.am
index 8ba3ba7..6a9ebce 100644 (file)
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,6 +1,6 @@
 ## Copyright (C) 1994-2004, Index Data
 ## All rights reserved.
-## $Id: Makefile.am,v 1.14 2004-08-07 08:06:57 adam Exp $
+## $Id: Makefile.am,v 1.15 2004-08-07 08:18:19 adam Exp $
 
 if ISTHR
 thrlib=libyazthread.la
@@ -20,7 +20,7 @@ illdatadir=$(pkgdatadir)/ill
 illdata_DATA=ill9702.asn item-req.asn ill.tcl
 
 EXTRA_DIST=$(tabdata_DATA) $(illdata_DATA) \
- charconv.tcl codetables.xml charconv.sgm
+ charconv.tcl codetables.xml
 
 YAZCOMP = $(top_srcdir)/util/yaz-asncomp
 YAZCOMPLINE = $(YAZCOMP) -d z.tcl -i yaz -I../include $(YCFLAGS)
@@ -29,8 +29,8 @@ AM_CPPFLAGS=-I$(top_srcdir)/include $(XML2_CFLAGS)
 AM_YFLAGS=-p cql_
 THREADED_FLAGS = @CFLAGSTHREADS@
 
-# MARC8 conversion is generated from charconv.sgm + codetables.xml
-marc8.c: charconv.tcl charconv.sgm codetables.xml
+# MARC8 conversion is generated from codetables.xml
+marc8.c: charconv.tcl codetables.xml
        cd $(srcdir); ./charconv.tcl -p marc8 codetables.xml -o marc8.c
 
 libyaz_la_SOURCES=version.c options.c log.c marcdisp.c oid.c wrbuf.c \
diff --git a/src/siconv.c b/src/siconv.c
index 1eb66c4..7d31a00 100644 (file)
--- a/src/siconv.c
+++ b/src/siconv.c
@@ -2,7 +2,7 @@
  * Copyright (c) 1997-2004, Index Data
  * See the file LICENSE for details.
  *
- * $Id: siconv.c,v 1.5 2004-03-16 13:12:43 adam Exp $
+ * $Id: siconv.c,v 1.6 2004-08-07 08:18:19 adam Exp $
  */
 
 /* mini iconv and wrapper for system iconv library (if present) */
 
 #include <yaz/yaz-util.h>
 
-unsigned long yaz_marc8_conv (unsigned char *inp, size_t inbytesleft,
-                             size_t *no_read);
+unsigned long yaz_marc8_1_conv (unsigned char *inp, size_t inbytesleft,
+                             size_t *no_read, int *combining);
 unsigned long yaz_marc8_2_conv (unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read);
+                               size_t *no_read, int *combining);
 unsigned long yaz_marc8_3_conv (unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read);
+                               size_t *no_read, int *combining);
 unsigned long yaz_marc8_4_conv (unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read);
+                               size_t *no_read, int *combining);
 unsigned long yaz_marc8_5_conv (unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read);
+                               size_t *no_read, int *combining);
 unsigned long yaz_marc8_6_conv (unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read);
+                               size_t *no_read, int *combining);
 unsigned long yaz_marc8_7_conv (unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read);
+                               size_t *no_read, int *combining);
 unsigned long yaz_marc8_8_conv (unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read);
+                               size_t *no_read, int *combining);
 unsigned long yaz_marc8_9_conv (unsigned char *inp, size_t inbytesleft,
-                               size_t *no_read);
+                               size_t *no_read, int *combining);
     
 struct yaz_iconv_struct {
     int my_errno;
@@ -53,6 +53,8 @@ struct yaz_iconv_struct {
     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
                            char **outbuf, size_t *outbytesleft);
     int marc8_esc_mode;
+    int marc8_comb_x;
+    int marc8_comb_no_read;
 #if HAVE_ICONV_H
     iconv_t iconv_cd;
 #endif
@@ -233,6 +235,13 @@ static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
                                      size_t inbytesleft, size_t *no_read)
 {
+    if (cd->marc8_comb_x)
+    {
+       unsigned long x = cd->marc8_comb_x;
+       *no_read = cd->marc8_comb_no_read;
+       cd->marc8_comb_x = 0;
+       return x;
+    }
     *no_read = 0;
     while(inbytesleft >= 1 && inp[0] == 27)
     {
@@ -259,6 +268,7 @@ static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
     else
     {
        unsigned long x;
+       int comb = 0;
        size_t no_read_sub = 0;
 
        switch(cd->marc8_esc_mode)
@@ -266,40 +276,59 @@ static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
        case 'B':  /* Basic ASCII */
        case 'E':  /* ANSEL */
        case 's':  /* ASCII */
-           x = yaz_marc8_conv(inp, inbytesleft, &no_read_sub);
+           x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, &comb);
            break;
        case 'g':  /* Greek */
-           x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub);
+           x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, &comb);
            break;
        case 'b':  /* Subscripts */
-           x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub);
+           x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, &comb);
            break;
        case 'p':  /* Superscripts */
-           x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub);
+           x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, &comb);
            break;
        case '2':  /* Basic Hebrew */
-           x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub);
+           x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, &comb);
            break;
        case 'N':  /* Basic Cyrillic */
        case 'Q':  /* Extended Cyrillic */
-           x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub);
+           x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, &comb);
            break;
        case '3':  /* Basic Arabic */
        case '4':  /* Extended Arabic */
-           x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub);
+           x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, &comb);
            break;
        case 'S':  /* Greek */
-           x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub);
+           x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, &comb);
            break;
        case '1':  /* Chinese, Japanese, Korean (EACC) */
-           x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub);
+           x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, &comb);
            break;
        default:
            *no_read = 0;
            cd->my_errno = YAZ_ICONV_EILSEQ;
            return 0;
        }
+#if 0
+       printf ("esc mode=%c x=%04lX comb=%d\n", cd->marc8_esc_mode, x, comb);
+#endif
        *no_read += no_read_sub;
+
+       if (comb && cd->marc8_comb_x == 0)
+       {
+           size_t tmp_read = 0;
+           unsigned long next_x;
+
+           /* read next char .. */
+           next_x = yaz_read_marc8(cd, inp + *no_read,
+                                   inbytesleft - *no_read, &tmp_read);
+           /* save this x for later .. */
+           cd->marc8_comb_x = x;
+           /* save next read for later .. */
+           cd->marc8_comb_no_read = tmp_read;
+           /* return next x - thereby swap */
+           x = next_x;
+       }
        return x;
     }
 }
@@ -465,6 +494,7 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
     cd->init_handle = 0;
     cd->my_errno = YAZ_ICONV_UNKNOWN;
     cd->marc8_esc_mode = 'B';
+    cd->marc8_comb_x = 0;
 
     /* a useful hack: if fromcode has leading @,
        the library not use YAZ's own conversions .. */
diff --git a/test/tsticonv.c b/test/tsticonv.c
index 1799002..15c869d 100644 (file)
--- a/test/tsticonv.c
+++ b/test/tsticonv.c
@@ -2,7 +2,7 @@
  * Copyright (c) 2002-2004, Index Data
  * See the file LICENSE for details.
  *
- * $Id: tsticonv.c,v 1.2 2004-03-15 21:39:06 adam Exp $
+ * $Id: tsticonv.c,v 1.3 2004-08-07 08:18:19 adam Exp $
  */
 
 #if HAVE_CONFIG_H
@@ -20,9 +20,6 @@ static const char *iso_8859_1_a[] = {
     "ax" ,
     "\330",
     "eneb\346r",
-    "\xfc",
-    "\xfb",
-    "\xfbr",
     0 };
 
 /* same test strings in MARC-8 format */
@@ -30,9 +27,6 @@ static const char *marc8_a[] = {
     "ax",   
     "\xa2",          /* latin capital letter o with stroke */
     "eneb\xb5r",     /* latin small letter ae */
-    "\xe8\x75",      /* latin small letter u with umlaut */
-    "\xe3\x75",      /* latin small letter u with circumflex */
-    "\xe3\x75r",     /* latin small letter u with circumflex */
     0
 };
 
@@ -79,16 +73,25 @@ static void marc8_tst_a()
 static void marc8_tst_b()
 {
     static const char *marc8_b[] = {
+       /* 0 */ 
        "\033$1" "\x21\x2B\x3B" /* FF1F */ "\033(B" "o",
+       /* 1 */ 
        "\033$1" "\x6F\x77\x29" /* AE0E */ "\x6F\x52\x7C" /* c0F4 */ "\033(B",
+       /* 2 */ 
        "\033$1"
-       "\x21\x50\x6E"  /* 7CFB */
-       "\x21\x51\x31"  /* 7D71 */
-       "\x21\x3A\x67"  /* 5B89 */
-       "\x21\x33\x22"  /* 5168 */
-       "\x21\x33\x53"  /* 5206 */
-       "\x21\x44\x2B"  /* 6790 */
+       "\x21\x50\x6E"  /* UCS 7CFB */
+       "\x21\x51\x31"  /* UCS 7D71 */
+       "\x21\x3A\x67"  /* UCS 5B89 */
+       "\x21\x33\x22"  /* UCS 5168 */
+       "\x21\x33\x53"  /* UCS 5206 */
+       "\x21\x44\x2B"  /* UCS 6790 */
        "\033(B",
+       /* 3 */
+       "\xB0\xB2",     /* AYN and oSLASH */
+       /* 4 */
+       "\xF6\x61",     /* a underscore */
+       /* 5 */
+       "\x61\xC2",     /* a, phonorecord mark */
        0
     };
     static const char *ucs4_b[] = {
@@ -100,6 +103,9 @@ static void marc8_tst_b()
        "\x00\x00\x51\x68"
        "\x00\x00\x52\x06"
        "\x00\x00\x67\x90",
+       "\x00\x00\x02\xBB"  "\x00\x00\x00\xF8",
+       "\x00\x00\x00\x61"  "\x00\x00\x03\x32",
+       "\x00\x00\x00\x61"  "\x00\x00\x21\x17",
        0
     };
     int i;
@@ -115,7 +121,7 @@ static void marc8_tst_b()
     {
         size_t r;
        size_t len;
-       size_t expect_len = (i == 2 ? 24 : 8);
+       size_t expect_len = i == 2 ? 24 : 8;
         char *inbuf= (char*) marc8_b[i];
         size_t inbytesleft = strlen(inbuf);
         char outbuf0[24];
@@ -257,6 +263,7 @@ static void dconvert(int mandatory, const char *tmpcode)
        
 int main (int argc, char **argv)
 {
+    yaz_log_init_file("tsticonv.log");
     dconvert(1, "UTF-8");
     dconvert(1, "ISO-8859-1");
     dconvert(1, "UCS4");
diff --git a/util/Makefile.am b/util/Makefile.am
index af3919f..eca8ee3 100644 (file)
--- a/util/Makefile.am
+++ b/util/Makefile.am
@@ -1,8 +1,6 @@
 ## Copyright (C) 1994-2004, Index Data
 ## All rights reserved.
-## $Id: Makefile.am,v 1.27 2004-05-01 23:32:20 adam Exp $
-
-TESTS = $(check_PROGRAMS)
+## $Id: Makefile.am,v 1.28 2004-08-07 08:18:20 adam Exp $
 
 bin_SCRIPTS = yaz-asncomp yaz-config
 
@@ -10,7 +8,7 @@ EXTRA_DIST = yaz-asncomp
 
 DISTCLEANFILES = yaz-config
 
-AM_CPPFLAGS=-I$(top_srcdir)/include
+AM_CPPFLAGS=-I$(top_srcdir)/include $(XML2_CFLAGS)
 
 bin_PROGRAMS = yaz-marcdump yaz-iconv
 noinst_PROGRAMS = cclsh cql2pqf cql2xcql srwtst yaz-benchmark
diff --git a/util/marcdump.c b/util/marcdump.c
index 3e97bad..d441855 100644 (file)
--- a/util/marcdump.c
+++ b/util/marcdump.c
@@ -2,17 +2,27 @@
  * Copyright (c) 1995-2004, Index Data
  * See the file LICENSE for details.
  *
- * $Id: marcdump.c,v 1.24 2004-08-04 09:30:30 adam Exp $
+ * $Id: marcdump.c,v 1.25 2004-08-07 08:18:20 adam Exp $
  */
 
 #if HAVE_CONFIG_H
 #include <config.h>
 #endif
 
+#if HAVE_XML2
+#include <libxml/parser.h>
+#include <libxml/tree.h>
+
+#include <libxml/xpath.h>
+#include <libxml/xpathInternals.h>
+
+#endif
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
+#include <assert.h>
 
 #if HAVE_LOCALE_H
 #include <locale.h>
@@ -39,9 +49,60 @@ static void usage(const char *prog)
              prog);
 } 
 
+#if HAVE_XML2
+void print_xpath_nodes(xmlNodeSetPtr nodes, FILE* output) {
+    xmlNodePtr cur;
+    int size;
+    int i;
+    
+    assert(output);
+    size = (nodes) ? nodes->nodeNr : 0;
+    
+    fprintf(output, "Result (%d nodes):\n", size);
+    for(i = 0; i < size; ++i) {
+       assert(nodes->nodeTab[i]);
+       
+       if(nodes->nodeTab[i]->type == XML_NAMESPACE_DECL)
+       {
+           xmlNsPtr ns;
+           
+           ns = (xmlNsPtr)nodes->nodeTab[i];
+           cur = (xmlNodePtr)ns->next;
+           if(cur->ns) { 
+               fprintf(output, "= namespace \"%s\"=\"%s\" for node %s:%s\n", 
+                   ns->prefix, ns->href, cur->ns->href, cur->name);
+           } else {
+               fprintf(output, "= namespace \"%s\"=\"%s\" for node %s\n", 
+                   ns->prefix, ns->href, cur->name);
+           }
+       } 
+       else if(nodes->nodeTab[i]->type == XML_ELEMENT_NODE)
+       {
+           cur = nodes->nodeTab[i];        
+           if(cur->ns) { 
+               fprintf(output, "= element node \"%s:%s\"\n", 
+                   cur->ns->href, cur->name);
+           } 
+           else
+           {
+               fprintf(output, "= element node \"%s\"\n", 
+                   cur->name);
+           }
+       }
+       else
+       {
+           cur = nodes->nodeTab[i];    
+           fprintf(output, "= node \"%s\": type %d\n", cur->name, cur->type);
+       }
+    }
+}
+#endif
+
 int main (int argc, char **argv)
 {
     int r;
+    int libxml_dom_test = 0;
+    int print_offset = 0;
     char *arg;
     int verbose = 0;
     FILE *inf;
@@ -51,7 +112,7 @@ int main (int argc, char **argv)
     int xml = 0;
     FILE *cfile = 0;
     char *from = 0, *to = 0;
-
+    int num = 1;
     
 #if HAVE_LOCALE_H
     setlocale(LC_CTYPE, "");
@@ -62,7 +123,7 @@ int main (int argc, char **argv)
 #endif
 #endif
 
-    while ((r = options("vc:xOXIf:t:", argv, argc, &arg)) != -2)
+    while ((r = options("pvc:xOXIf:t:2", argv, argc, &arg)) != -2)
     {
        int count;
        no++;
@@ -91,6 +152,12 @@ int main (int argc, char **argv)
        case 'I':
            xml = YAZ_MARC_ISO2709;
            break;
+       case 'p':
+           print_offset = 1;
+           break;
+       case '2':
+           libxml_dom_test = 1;
+           break;
         case 0:
            inf = fopen (arg, "rb");
            count = 0;
@@ -128,7 +195,16 @@ int main (int argc, char **argv)
                     
                     r = fread (buf, 1, 5, inf);
                     if (r < 5)
+                   {
+                       if (r && print_offset)
+                           printf ("Extra %d bytes", r);
                         break;
+                   }
+                   if (print_offset)
+                   {
+                       long off = ftell(inf);
+                       printf ("Record %d offset %ld\n", num, (long) off);
+                   }
                     len = atoi_n(buf, 5);
                     if (len < 25 || len > 100000)
                         break;
@@ -140,6 +216,43 @@ int main (int argc, char **argv)
                     if (r <= 0)
                         break;
                    fwrite (result, rlen, 1, stdout);
+#if HAVE_XML2
+                   if (libxml_dom_test)
+                   {
+                       xmlDocPtr doc = xmlParseMemory(result, rlen);
+                       if (!doc)
+                           fprintf(stderr, "xmLParseMemory failed\n");
+                       else
+                       {
+                           int i;
+                           xmlXPathContextPtr xpathCtx; 
+                           xmlXPathObjectPtr xpathObj; 
+                           static const char *xpathExpr[] = {
+                               "/record/datafield[@tag='245']/subfield[@code='a']",
+                               "/record/datafield[@tag='100']/subfield",
+                               "/record/datafield[@tag='245']/subfield[@code='a']",
+                               "/record/datafield[@tag='650']/subfield",
+                               "/record/datafield[@tag='650']",
+                               0};
+                           
+                           xpathCtx = xmlXPathNewContext(doc);
+
+                           for (i = 0; xpathExpr[i]; i++) {
+                               xpathObj = xmlXPathEvalExpression(xpathExpr[i], xpathCtx);
+                               if(xpathObj == NULL) {
+                                   fprintf(stderr,"Error: unable to evaluate xpath expression \"%s\"\n", xpathExpr[i]);
+                               }
+                               else
+                               {
+                                   print_xpath_nodes(xpathObj->nodesetval, stdout);
+                                   xmlXPathFreeObject(xpathObj);
+                               }
+                           }
+                           xmlXPathFreeContext(xpathCtx); 
+                           xmlFreeDoc(doc);
+                       }
+                   }
+#endif
                     if (cfile)
                     {
                         char *p = buf;
@@ -159,6 +272,7 @@ int main (int argc, char **argv)
                        }
                         fprintf (cfile, "\"\n");
                     }
+                   num++;
                 }
                 count++;
                 if (cd)
diff --git a/win/makefile b/win/makefile
index e8cf92a..40868dd 100644 (file)
--- a/win/makefile
+++ b/win/makefile
@@ -1,6 +1,6 @@
 # Copyright (C) 1994-2004, Index Data
 # All rights reserved.
-# $Id: makefile,v 1.78 2004-05-10 11:56:33 adam Exp $
+# $Id: makefile,v 1.79 2004-08-07 08:18:20 adam Exp $
 #
 # Programmed by
 #  HL: Heikki Levanto, Index Data
@@ -620,9 +620,9 @@ $(ITEM_REQ_FILES): $(SRCDIR)\item-req.asn
        $(TCL) $(TCLOPT) -d ill.tcl item-req.asn
        @cd $(WINDIR)
 
-$(SRCDIR)\marc8.c: $(SRCDIR)\charconv.sgm $(SRCDIR)\codetables.xml $(SRCDIR)\charconv.tcl
+$(SRCDIR)\marc8.c: $(SRCDIR)\codetables.xml $(SRCDIR)\charconv.tcl
        @cd $(SRCDIR)
-       $(TCL) charconv.tcl -O 1 -p marc8 charconv.sgm codetables.xml -o marc8.c
+       $(TCL) charconv.tcl -p marc8 codetables.xml -o marc8.c
 
 !endif
 
diff --git a/win/yaz.nsi b/win/yaz.nsi
index 0fd503a..591fd7f 100644 (file)
--- a/win/yaz.nsi
+++ b/win/yaz.nsi
@@ -1,4 +1,4 @@
-; $Id: yaz.nsi,v 1.49 2004-08-06 08:31:03 adam Exp $
+; $Id: yaz.nsi,v 1.50 2004-08-07 08:18:20 adam Exp $
 
 !define VERSION "2.0.22"
 
@@ -132,7 +132,6 @@ Section "YAZ Source" YAZ_Source
        File ..\src\*.y
        File ..\src\*.tcl
        File ..\src\*.asn
-       File ..\src\charconv.sgm
        File ..\src\codetables.xml
        SetOutPath $INSTDIR\zoom
        File ..\zoom\*.c