Added turbo marcxml read and write
author Dennis Schafroth <dennis@indexdata.com>
Thu, 4 Mar 2010 16:10:12 +0000 (17:10 +0100)
committer Dennis Schafroth <dennis@indexdata.com>
Thu, 4 Mar 2010 16:10:12 +0000 (17:10 +0100)
src/marc_read_xml.c
src/marcdisp.c
src/record_conv.c

index b755c2e..650106c 100644 (file)
@@ -90,6 +90,103 @@ int yaz_marc_read_xml_subfields(yaz_marc_t mt, const xmlNode *ptr)
     return 0;
 }
 
+/*
+ * Extracts the MARC tag encoded in a turbo MARCXML element name.
+ * The callers pass the element name with its one-letter prefix skipped.
+ * A name of exactly three characters is copied into tag_buffer and
+ * tag_buffer is returned; a name of any other length yields 0.
+ */
+const char *tag_value_extract(const char *name, char tag_buffer[5])
+{
+    if (strlen(name) != 3)
+        return 0;
+    strcpy(tag_buffer, name);
+    return tag_buffer;
+}
+
+/* Returns the value (0-15) of one hexadecimal digit, or -1 if c is not one. */
+static int turbo_hex_digit(char c)
+{
+    if (c >= '0' && c <= '9')
+        return c - '0';
+    if (c >= 'A' && c <= 'F')
+        return c - 'A' + 10;
+    if (c >= 'a' && c <= 'f')
+        return c - 'a' + 10;
+    return -1;
+}
+
+/*
+ * Decodes the subfield code that follows the one-letter prefix of a turbo
+ * MARCXML element name.  Two forms are accepted:
+ *   - a single character, which is returned as-is (the result points
+ *     into name, tag_buffer is not touched);
+ *   - '-' followed by two or four hexadecimal digits (-AB or -ABCD), each
+ *     pair giving one byte of the code.  The decoded bytes are stored
+ *     NUL-terminated in tag_buffer and tag_buffer is returned.
+ * Returns 0 for anything else: wrong length, missing '-', a character
+ * that is not a hexadecimal digit, or an encoded zero byte.
+ */
+const char *code_value_extract(const char *name, char tag_buffer[5])
+{
+    size_t length = strlen(name);
+    size_t n_bytes;
+    size_t i;
+
+    if (length == 1)
+        return name;
+    /* the hex form is '-' plus complete digit pairs for one or two bytes */
+    if (name[0] != '-' || (length != 3 && length != 5))
+        return 0;
+    n_bytes = (length - 1) / 2;
+    for (i = 0; i < n_bytes; i++)
+    {
+        int hi = turbo_hex_digit(name[1 + 2 * i]);
+        int lo = turbo_hex_digit(name[2 + 2 * i]);
+
+        /* a zero byte would end the decoded code string prematurely */
+        if (hi < 0 || lo < 0 || (hi == 0 && lo == 0))
+            return 0;
+        tag_buffer[i] = (char) (hi * 16 + lo);
+    }
+    tag_buffer[n_bytes] = '\0';
+    return tag_buffer;
+}
+
+
+/*
+ * Reads the children of a turbo MARCXML datafield element.
+ *
+ * Elements whose name starts with 'i' carry an indicator: the digits after
+ * the 'i' give the indicator position (1..9) and the first character of
+ * the element's first text node is stored in indicators[position].
+ * Elements whose name starts with 's' carry a subfield: the rest of the
+ * name is decoded with code_value_extract() and the code followed by the
+ * element's text is added to the record with yaz_marc_add_subfield().
+ * Nodes that are not elements are skipped.
+ *
+ * mt          MARC handle; its NMEM supplies all allocations made here
+ * ptr         first child node of the datafield element (may be NULL)
+ * indicators  11-byte buffer; only positions 1..9 are ever written
+ *
+ * Returns 0 on success.  Returns -1, after recording a message with
+ * yaz_marc_cprintf(), for an indicator position outside 1..9, an
+ * undecodable subfield code or an unexpected element.
+ */
+int yaz_marc_read_turbo_xml_subfields(yaz_marc_t mt, const xmlNode *ptr, char indicators[11])
+{
+    NMEM nmem = yaz_marc_get_nmem(mt);
+
+    for (; ptr; ptr = ptr->next)
+    {
+        const char *name;
+        const xmlNode *p;
+
+        if (ptr->type != XML_ELEMENT_NODE)
+            continue;
+        name = (const char *) ptr->name;
+        if (name[0] == 'i')
+        {
+            char *end = 0;
+            long position;
+
+            if (name[1] == '\0')
+                continue; /* bare <i>: no position, nothing to record */
+            position = strtol(name + 1, &end, 10);
+            /* position 0 is unused and 10 holds the terminator, so only
+               1..9 may be written without corrupting the buffer */
+            if (*end != '\0' || position < 1 || position > 9)
+            {
+                yaz_marc_cprintf(
+                    mt, "Bad indicator element '%.80s'", name);
+                return -1;
+            }
+            for (p = ptr->children; p; p = p->next)
+                if (p->type == XML_TEXT_NODE)
+                {
+                    indicators[position] = ((const char *) p->content)[0];
+                    break;
+                }
+        }
+        else if (name[0] == 's')
+        {
+            char *code_buf = (char *) nmem_malloc(nmem, 5);
+            const char *code = code_value_extract(name + 1, code_buf);
+            size_t code_data_len;
+            char *code_data;
+
+            if (!code)
+            {
+                yaz_marc_cprintf(
+                    mt, "Missing 'code' value for 'subfield'" );
+                return -1;
+            }
+            /* total length: the code plus every text node of the element */
+            code_data_len = strlen(code);
+            for (p = ptr->children; p; p = p->next)
+                if (p->type == XML_TEXT_NODE)
+                    code_data_len += strlen((const char *) p->content);
+            /* build "<code><data>" in one buffer */
+            code_data = (char *) nmem_malloc(nmem, code_data_len + 1);
+            strcpy(code_data, code);
+            for (p = ptr->children; p; p = p->next)
+                if (p->type == XML_TEXT_NODE)
+                    strcat(code_data, (const char *) p->content);
+            yaz_marc_add_subfield(mt, code_data, code_data_len);
+        }
+        else
+        {
+            yaz_marc_cprintf(
+                mt, "Expected element 'subfield', got '%.80s'", ptr->name);
+            return -1;
+        }
+    }
+    return 0;
+}
+
+
 static int yaz_marc_read_xml_leader(yaz_marc_t mt, const xmlNode **ptr_p)
 {
     int indicator_length;
@@ -104,7 +201,8 @@ static int yaz_marc_read_xml_leader(yaz_marc_t mt, const xmlNode **ptr_p)
     for(; ptr; ptr = ptr->next)
         if (ptr->type == XML_ELEMENT_NODE)
         {
-            if (!strcmp((const char *) ptr->name, "leader"))
+               if ( !strcmp( (const char *) ptr->name, "leader") ||
+                       (!strncmp((const char *) ptr->name, "l", 1) ))
             {
                 xmlNode *p = ptr->children;
                 for(; p; p = p->next)
@@ -145,7 +243,7 @@ static int yaz_marc_read_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
     for(; ptr; ptr = ptr->next)
         if (ptr->type == XML_ELEMENT_NODE)
         {
-            if (!strcmp((const char *) ptr->name, "controlfield"))
+               if (!strcmp( (const char *) ptr->name, "controlfield"))
             {
                 const xmlNode *ptr_tag = 0;
                 struct _xmlAttr *attr;
@@ -215,6 +313,61 @@ static int yaz_marc_read_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
         }
     return 0;
 }
+
+/* Prototypes for the datafield helpers called below; they mirror the
+   helpers' definitions (the tag is stored in the node, hence char *). */
+struct yaz_marc_node* yaz_marc_add_datafield_turbo_xml(yaz_marc_t mt, char *tag_value);
+void yaz_marc_datafield_set_indicators(struct yaz_marc_node *n, char *indicator);
+
+/*
+ * Reads the field elements of a turbo MARCXML record, starting at ptr and
+ * following the sibling chain.
+ *
+ * An element whose name starts with 'c' is a controlfield and one whose
+ * name starts with 'd' is a datafield; in both cases the remainder of the
+ * name must be a three-character tag.  Datafield children are read with
+ * yaz_marc_read_turbo_xml_subfields(), which also collects the
+ * indicators.  Nodes that are not elements are skipped.
+ *
+ * Returns 0 on success.  Returns -1, after recording a message with
+ * yaz_marc_cprintf(), for a bad tag, an unexpected element or a failure
+ * while reading subfields.
+ */
+static int yaz_marc_read_turbo_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
+{
+    for (; ptr; ptr = ptr->next)
+    {
+        const char *name;
+        NMEM nmem;
+        char *tag_buffer;
+
+        if (ptr->type != XML_ELEMENT_NODE)
+            continue;
+        name = (const char *) ptr->name;
+        if (name[0] == 'c')
+        {
+            const char *tag_value;
+
+            nmem = yaz_marc_get_nmem(mt);
+            tag_buffer = (char *) nmem_malloc(nmem, 5);
+            /* the tag is whatever follows the one-letter prefix */
+            tag_value = tag_value_extract(name + 1, tag_buffer);
+            if (!tag_value)
+            {
+                yaz_marc_cprintf(
+                    mt, "Missing attribute 'tag' for 'controlfield'" );
+                return -1;
+            }
+            yaz_marc_add_controlfield_turbo_xml(mt, tag_value, ptr->children);
+        }
+        else if (name[0] == 'd')
+        {
+            struct yaz_marc_node *n;
+            char *indstr;
+            int rc;
+
+            nmem = yaz_marc_get_nmem(mt);
+            /* 0(unused), 1,....9, + zero term */
+            indstr = (char *) nmem_malloc(nmem, 11);
+            /* Zero-fill so that indstr+1 is a terminated string whatever
+               indicators the record supplies.  The string ends at the
+               first position that was not set. */
+            memset(indstr, 0, 11);
+            tag_buffer = (char *) nmem_malloc(nmem, 5);
+            if (!tag_value_extract(name + 1, tag_buffer))
+            {
+                yaz_marc_cprintf(
+                    mt, "Missing attribute 'tag' for 'datafield'" );
+                return -1;
+            }
+            n = yaz_marc_add_datafield_turbo_xml(mt, tag_buffer);
+
+            rc = yaz_marc_read_turbo_xml_subfields(mt, ptr->children, indstr);
+            /* note that indstr[0] is unused so we use indstr[1..] */
+            yaz_marc_datafield_set_indicators(n, indstr + 1);
+            if (rc)
+                return -1;
+        }
+        else
+        {
+            yaz_marc_cprintf(mt,
+                             "Expected element controlfield or datafield,"
+                             " got %.80s", ptr->name);
+            return -1;
+        }
+    }
+    return 0;
+}
+
+
 #endif
 
 #if YAZ_HAVE_XML2
@@ -244,7 +397,14 @@ int yaz_marc_read_xml(yaz_marc_t mt, const xmlNode *ptr)
     ptr = ptr->children;
     if (yaz_marc_read_xml_leader(mt, &ptr))
         return -1;
-    return yaz_marc_read_xml_fields(mt, ptr->next);
+
+    switch (yaz_marc_get_read_format(mt)) {
+               case YAZ_MARC_MARCXML:
+                       return yaz_marc_read_xml_fields(mt, ptr->next);
+               case YAZ_MARC_TMARCXML:
+                       return yaz_marc_read_turbo_xml_fields(mt, ptr->next);
+    }
+       return -1;
 }
 #endif
 
index 10927cd..bd03c00 100644 (file)
@@ -87,7 +87,8 @@ struct yaz_marc_subfield {
 struct yaz_marc_t_ {
     WRBUF m_wr;
     NMEM nmem;
-    int xml;
+    int input_format;
+    int output_format;
     int debug;
     int write_using_libxml2;
     int turbo_format;
@@ -104,7 +105,7 @@ struct yaz_marc_t_ {
 yaz_marc_t yaz_marc_create(void)
 {
     yaz_marc_t mt = (yaz_marc_t) xmalloc(sizeof(*mt));
-    mt->xml = YAZ_MARC_LINE;
+    mt->output_format = YAZ_MARC_LINE;
     mt->debug = 0;
     mt->write_using_libxml2 = 0;
     mt->enable_collection = no_collection;
@@ -245,6 +246,25 @@ void yaz_marc_add_datafield_xml(yaz_marc_t mt, const xmlNode *ptr_tag,
     /* make subfield_pp the current (last one) */
     mt->subfield_pp = &n->u.datafield.subfields;
 }
+
+/*
+ * Obtains a new node from yaz_marc_add_node() and sets it up as a
+ * datafield with the given tag, no indicators and no subfields.  The
+ * node keeps the tag_value pointer itself; no copy is made.  The node's
+ * subfield list becomes the current one in mt.
+ * Returns the new node.
+ */
+struct yaz_marc_node* yaz_marc_add_datafield_turbo_xml(yaz_marc_t mt, char *tag_value)
+{
+    struct yaz_marc_node *node = yaz_marc_add_node(mt);
+
+    node->which = YAZ_MARC_DATAFIELD;
+    node->u.datafield.subfields = 0;
+    node->u.datafield.indicator = 0;
+    node->u.datafield.tag = tag_value;
+
+    /* make subfield_pp the current (last one) */
+    mt->subfield_pp = &node->u.datafield.subfields;
+    return node;
+}
+
+/*
+ * Attaches an indicator string to datafield node n.  Only the pointer is
+ * stored; the string is not copied.
+ */
+void yaz_marc_datafield_set_indicators(struct yaz_marc_node *n,
+                                       char *indicator)
+{
+    n->u.datafield.indicator = indicator;
+}
+
 #endif
 
 void yaz_marc_add_subfield(yaz_marc_t mt,
@@ -504,7 +524,7 @@ int yaz_marc_write_trailer(yaz_marc_t mt, WRBUF wr)
 {
     if (mt->enable_collection == collection_second)
     {
-        switch(mt->xml)
+        switch(mt->output_format)
         {
         case YAZ_MARC_MARCXML:
             wrbuf_printf(wr, "</collection>\n");
@@ -524,7 +544,7 @@ void yaz_marc_enable_collection(yaz_marc_t mt)
 
 int yaz_marc_write_mode(yaz_marc_t mt, WRBUF wr)
 {
-    switch(mt->xml)
+    switch(mt->output_format)
     {
     case YAZ_MARC_LINE:
         return yaz_marc_write_line(mt, wr);
@@ -699,7 +719,10 @@ int yaz_marc_write_marcxml(yaz_marc_t mt, WRBUF wr)
     /* http://www.loc.gov/marc/bibliographic/ecbdldrd.html#mrcblea */
     if (!mt->leader_spec)
         yaz_marc_modify_leader(mt, 9, "a");
-    return yaz_marc_write_marcxml_ns(mt, wr, "http://www.loc.gov/MARC21/slim",
+    char *name_space = "http://www.loc.gov/MARC21/slim";
+    if (mt->output_format == YAZ_MARC_TMARCXML)
+       name_space = "http://www.indexdata.com/MARC21/turboxml";
+    return yaz_marc_write_marcxml_ns(mt, wr, name_space,
                                      0, 0);
 }
 
@@ -714,7 +737,7 @@ int yaz_marc_write_marcxchange(yaz_marc_t mt, WRBUF wr,
 
 #if YAZ_HAVE_XML2
 
-void add_marc_datafield_xml2(yaz_marc_t mt, struct yaz_marc_node *n, xmlNode *record_ptr, xmlNsPtr ns_record, WRBUF wr_cdata, int identifier_length)
+void add_marc_datafield_turbo_xml(yaz_marc_t mt, struct yaz_marc_node *n, xmlNode *record_ptr, xmlNsPtr ns_record, WRBUF wr_cdata, int identifier_length)
 {
     xmlNode *ptr;
     struct yaz_marc_subfield *s;
@@ -738,10 +761,16 @@ void add_marc_datafield_xml2(yaz_marc_t mt, struct yaz_marc_node *n, xmlNode *re
             char ind_str[6];
             char ind_val[2];
             
-            sprintf(ind_str, "ind%d", i+1);
             ind_val[0] = n->u.datafield.indicator[i];
             ind_val[1] = '\0';
-            xmlNewProp(ptr, BAD_CAST ind_str, BAD_CAST ind_val);
+            if (!turbo) {
+                sprintf(ind_str, "ind%d", i+1);
+               xmlNewProp(ptr, BAD_CAST ind_str, BAD_CAST ind_val);
+            }
+            else {
+                sprintf(ind_str, "i%d", i+1);
+               xmlNewTextChild(ptr, ns_record, BAD_CAST ind_str, BAD_CAST ind_val);
+            }
         }
     }
        WRBUF subfield_name = wrbuf_alloc();
@@ -772,13 +801,21 @@ void add_marc_datafield_xml2(yaz_marc_t mt, struct yaz_marc_node *n, xmlNode *re
                                (s->code_data[0] >= 'A' && s->code_data[0] <= 'Z'))
                {
                        wrbuf_iconv_write(subfield_name, mt->iconv_cd,s->code_data, using_code_len);
-               ptr_subfield = xmlNewTextChild(ptr, ns_record,
-                               BAD_CAST wrbuf_cstr(subfield_name),
-                               BAD_CAST wrbuf_cstr(wr_cdata));
                }
-               else
-                       //TODO FIX
-                               yaz_log(YLOG_WARN, "Dropping subfield: %s", s->code_data);
+               else {
+                               char buffer[2*using_code_len + 1];
+                               int index;
+                               for (index = 0; index < using_code_len; index++) {
+                                       sprintf(buffer + 2*index, "%02X", (unsigned char) s->code_data[index] & 0xFF);
+                               };
+                               buffer[2*(index+1)] = 0;
+                               wrbuf_puts(subfield_name, "-");
+                               wrbuf_puts(subfield_name, buffer);
+                       yaz_log(YLOG_WARN, "Using numeric value in element name: %s", buffer);
+               }
+               ptr_subfield = xmlNewTextChild(ptr, ns_record,
+                               BAD_CAST wrbuf_cstr(subfield_name),
+                               BAD_CAST wrbuf_cstr(wr_cdata));
         }
     }
        wrbuf_destroy(subfield_name);
@@ -828,7 +865,7 @@ int yaz_marc_write_turbo_xml(yaz_marc_t mt, xmlNode **root_ptr,
         switch(n->which)
         {
         case YAZ_MARC_DATAFIELD:
-               add_marc_datafield_xml2(mt, n, record_ptr, ns_record, wr_cdata, identifier_length);
+               add_marc_datafield_turbo_xml(mt, n, record_ptr, ns_record, wr_cdata, identifier_length);
             break;
         case YAZ_MARC_CONTROLFIELD:
             wrbuf_rewind(wr_cdata);
@@ -1141,12 +1178,45 @@ int yaz_marc_decode_buf (yaz_marc_t mt, const char *buf, int bsize,
     return r;
 }
 
+/* Sets the input (read) format of mt; does nothing when mt is NULL. */
+void yaz_marc_set_read_format(yaz_marc_t mt, int format)
+{
+    if (!mt)
+        return;
+    mt->input_format = format;
+}
+
+/* Returns the input (read) format of mt, or -1 when mt is NULL. */
+int yaz_marc_get_read_format(yaz_marc_t mt)
+{
+    return mt ? mt->input_format : -1;
+}
+
+
+/* Sets the output (write) format of mt; does nothing when mt is NULL. */
+void yaz_marc_set_write_format(yaz_marc_t mt, int format)
+{
+    if (!mt)
+        return;
+    mt->output_format = format;
+}
+
+/* Returns the output (write) format of mt, or -1 when mt is NULL. */
+int yaz_marc_get_write_format(yaz_marc_t mt)
+{
+    return mt ? mt->output_format : -1;
+}
+
+
+/**
+ * Deprecated, use yaz_marc_set_write_format
+ */
 void yaz_marc_xml(yaz_marc_t mt, int xmlmode)
 {
     if (mt)
-        mt->xml = xmlmode;
+        mt->output_format = xmlmode;
 }
 
+
+
 void yaz_marc_debug(yaz_marc_t mt, int level)
 {
     if (mt)
@@ -1259,6 +1329,11 @@ void yaz_marc_write_turbo_format(yaz_marc_t mt, int enable)
     mt->turbo_format = enable;
 }
 
+/*
+ * Returns the turbo_format flag of mt.  mt is dereferenced without a
+ * NULL check.
+ */
+int yaz_marc_is_turbo_format(yaz_marc_t mt)
+{
+    int enabled = mt->turbo_format;
+
+    return enabled;
+}
+
 
 /*
  * Local variables:
index 6f65797..41e9881 100644 (file)
@@ -472,7 +472,8 @@ static int yaz_record_conv_record_rule(yaz_record_conv_t p,
                 else
                     ret = -1;
             }
-            else if (r->u.marc.input_format == YAZ_MARC_MARCXML)
+            else if (r->u.marc.input_format == YAZ_MARC_MARCXML ||
+                                        r->u.marc.input_format == YAZ_MARC_TMARCXML)
             {
                 xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
                                                wrbuf_len(record));
@@ -483,7 +484,7 @@ static int yaz_record_conv_record_rule(yaz_record_conv_t p,
                 }
                 else
                 {
-                    ret = yaz_marc_read_xml(mt, xmlDocGetRootElement(doc));
+                                       ret = yaz_marc_read_xml(mt, xmlDocGetRootElement(doc));
                     if (ret)
                         wrbuf_printf(p->wr_error, "yaz_marc_read_xml failed");
                 }