From 77f27a99f17bdd5e6fc4d8a202ecc6da4ef95660 Mon Sep 17 00:00:00 2001 From: Dennis Schafroth Date: Thu, 4 Mar 2010 17:10:12 +0100 Subject: [PATCH] Added turbo marcxml read and write --- src/marc_read_xml.c | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++- src/marcdisp.c | 107 ++++++++++++++++++++++++++++----- src/record_conv.c | 5 +- 3 files changed, 257 insertions(+), 21 deletions(-) diff --git a/src/marc_read_xml.c b/src/marc_read_xml.c index b755c2e..650106c 100644 --- a/src/marc_read_xml.c +++ b/src/marc_read_xml.c @@ -90,6 +90,103 @@ int yaz_marc_read_xml_subfields(yaz_marc_t mt, const xmlNode *ptr) return 0; } +const char *tag_value_extract(const char *name, char tag_buffer[5]) { + size_t length = strlen(name); + if (length == 3) { + strcpy(tag_buffer, name); + return tag_buffer; + } + return 0; +} + +// pattern 2 && length < 5) { + if (name[0] != '-') { + return 0; + } + length--; + const char *ptr = name+1; + int index = 0; + for (index = 0; index < length/2; index++) { + unsigned int value; + char temp[3]; + strncpy(temp, ptr + 2*index, 2); + sscanf(temp, "%02X", &value); + tag_buffer[index] = (unsigned char) value; + } + tag_buffer[index] = '\0'; + if (index > 0) + return tag_buffer; + } + return 0; +} + + +int yaz_marc_read_turbo_xml_subfields(yaz_marc_t mt, const xmlNode *ptr, char indicators[11]) +{ + NMEM nmem = yaz_marc_get_nmem(mt); + for (; ptr; ptr = ptr->next) + { + if (ptr->type == XML_ELEMENT_NODE) + { + xmlNode *p; + if (!strncmp((const char *) ptr->name, "i", 1)) { + int length = strlen(ptr->name+1); + if (length > 0) { + int index = (int)strtol(ptr->name+1, (char **)NULL, 10); + for (p = ptr->children; p ; p = p->next) + if (p->type == XML_TEXT_NODE) { + indicators[index] = ((const char *)p->content)[0]; + break; + } + } + } + else if (!strncmp((const char *) ptr->name, "s", 1)) + { + NMEM nmem = yaz_marc_get_nmem(mt); + char *buffer = (char *) nmem_malloc(nmem, 5); + const char *tag_value = code_value_extract((ptr->name+1), buffer); + if (!tag_value) + { + yaz_marc_cprintf( + mt, "Missing 'code' value for 'subfield'" ); + return -1; + } + + size_t ctrl_data_len = 0; + char *ctrl_data_buf = 0; + ctrl_data_len = strlen((const char *) tag_value); + // Extract (length) from CDATA + xmlNode *p; + for (p = ptr->children; p ; p = p->next) + if (p->type == XML_TEXT_NODE) + ctrl_data_len += strlen((const char *)p->content); + // Allocate memory for code value (1 character (can be multi-byte) and data + ctrl_data_buf = (char *) nmem_malloc(nmem, ctrl_data_len+1); + // Build a string with "" + strcpy(ctrl_data_buf, (const char *) tag_value); + for (p = ptr->children; p ; p = p->next) + if (p->type == XML_TEXT_NODE) + strcat(ctrl_data_buf, (const char *)p->content); + yaz_marc_add_subfield(mt, ctrl_data_buf, ctrl_data_len); + } + else + { + yaz_marc_cprintf( + mt, "Expected element 'subfield', got '%.80s'", ptr->name); + return -1; + } + } + } + return 0; +} + + static int yaz_marc_read_xml_leader(yaz_marc_t mt, const xmlNode **ptr_p) { int indicator_length; @@ -104,7 +201,8 @@ static int yaz_marc_read_xml_leader(yaz_marc_t mt, const xmlNode **ptr_p) for(; ptr; ptr = ptr->next) if (ptr->type == XML_ELEMENT_NODE) { - if (!strcmp((const char *) ptr->name, "leader")) + if ( !strcmp( (const char *) ptr->name, "leader") || + (!strncmp((const char *) ptr->name, "l", 1) )) { xmlNode *p = ptr->children; for(; p; p = p->next) @@ -145,7 +243,7 @@ static int yaz_marc_read_xml_fields(yaz_marc_t mt, const xmlNode *ptr) for(; ptr; ptr = ptr->next) if (ptr->type == XML_ELEMENT_NODE) { - if (!strcmp((const char *) ptr->name, "controlfield")) + if (!strcmp( (const char *) ptr->name, "controlfield")) { const xmlNode *ptr_tag = 0; struct _xmlAttr *attr; @@ -215,6 +313,61 @@ static int yaz_marc_read_xml_fields(yaz_marc_t mt, const xmlNode *ptr) } return 0; } + +struct yaz_marc_node* yaz_marc_add_datafield_turbo_xml(yaz_marc_t mt, const char *tag_value); + +static int yaz_marc_read_turbo_xml_fields(yaz_marc_t mt, const xmlNode *ptr) +{ + for(; ptr; ptr = ptr->next) + if (ptr->type == XML_ELEMENT_NODE) + { + if (!strncmp( (const char *) ptr->name, "c", 1)) + { + NMEM nmem = yaz_marc_get_nmem(mt); + char *buffer = (char *) nmem_malloc(nmem, 5); + //Extract the tag value out of the rest of the element name + const char *tag_value = tag_value_extract((const char *)(ptr->name+1), buffer); + if (!tag_value) + { + yaz_marc_cprintf( + mt, "Missing attribute 'tag' for 'controlfield'" ); + return -1; + } + yaz_marc_add_controlfield_turbo_xml(mt, tag_value, ptr->children); + //wrbuf_destroy(tag_value); + } + else if (!strncmp((const char *) ptr->name, "d",1)) + { + NMEM nmem = yaz_marc_get_nmem(mt); + char *indstr = nmem_malloc(nmem, 11); /* 0(unused), 1,....9, + zero term */ + char *buffer = (char *) nmem_malloc(nmem, 5); + const char *tag_value = tag_value_extract(ptr->name+1, buffer); + if (!tag_value) + { + yaz_marc_cprintf( + mt, "Missing attribute 'tag' for 'datafield'" ); + return -1; + } + /* note that indstr[0] is unused so we use indstr[1..] */ + struct yaz_marc_node *n = yaz_marc_add_datafield_turbo_xml(mt, tag_value); + + int rc = yaz_marc_read_turbo_xml_subfields(mt, ptr->children, indstr); + yaz_marc_datafield_set_indicators(n, indstr+1, strlen(indstr+1)); + if (rc) + return -1; + } + else + { + yaz_marc_cprintf(mt, + "Expected element controlfield or datafield," + " got %.80s", ptr->name); + return -1; + } + } + return 0; +} + + #endif #if YAZ_HAVE_XML2 @@ -244,7 +397,14 @@ int yaz_marc_read_xml(yaz_marc_t mt, const xmlNode *ptr) ptr = ptr->children; if (yaz_marc_read_xml_leader(mt, &ptr)) return -1; - return yaz_marc_read_xml_fields(mt, ptr->next); + + switch (yaz_marc_get_read_format(mt)) { + case YAZ_MARC_MARCXML: + return yaz_marc_read_xml_fields(mt, ptr->next); + case YAZ_MARC_TMARCXML: + return yaz_marc_read_turbo_xml_fields(mt, ptr->next); + } + return -1; } #endif diff --git a/src/marcdisp.c b/src/marcdisp.c index 10927cd..bd03c00 100644 --- a/src/marcdisp.c +++ b/src/marcdisp.c @@ -87,7 +87,8 @@ struct yaz_marc_subfield { struct yaz_marc_t_ { WRBUF m_wr; NMEM nmem; - int xml; + int input_format; + int output_format; int debug; int write_using_libxml2; int turbo_format; @@ -104,7 +105,7 @@ struct yaz_marc_t_ { yaz_marc_t yaz_marc_create(void) { yaz_marc_t mt = (yaz_marc_t) xmalloc(sizeof(*mt)); - mt->xml = YAZ_MARC_LINE; + mt->output_format = YAZ_MARC_LINE; mt->debug = 0; mt->write_using_libxml2 = 0; mt->enable_collection = no_collection; @@ -245,6 +246,25 @@ void yaz_marc_add_datafield_xml(yaz_marc_t mt, const xmlNode *ptr_tag, /* make subfield_pp the current (last one) */ mt->subfield_pp = &n->u.datafield.subfields; } + +struct yaz_marc_node* yaz_marc_add_datafield_turbo_xml(yaz_marc_t mt, char *tag_value) +{ + struct yaz_marc_node *n = yaz_marc_add_node(mt); + n->which = YAZ_MARC_DATAFIELD; + n->u.datafield.tag = tag_value; + n->u.datafield.indicator = 0; + n->u.datafield.subfields = 0; + + /* make subfield_pp the current (last one) */ + mt->subfield_pp = &n->u.datafield.subfields; + return n; +} + +void yaz_marc_datafield_set_indicators(struct yaz_marc_node *n, char *indicator) +{ + n->u.datafield.indicator = indicator; +} + #endif void yaz_marc_add_subfield(yaz_marc_t mt, @@ -504,7 +524,7 @@ int yaz_marc_write_trailer(yaz_marc_t mt, WRBUF wr) { if (mt->enable_collection == collection_second) { - switch(mt->xml) + switch(mt->output_format) { case YAZ_MARC_MARCXML: wrbuf_printf(wr, "\n"); @@ -524,7 +544,7 @@ void yaz_marc_enable_collection(yaz_marc_t mt) int yaz_marc_write_mode(yaz_marc_t mt, WRBUF wr) { - switch(mt->xml) + switch(mt->output_format) { case YAZ_MARC_LINE: return yaz_marc_write_line(mt, wr); @@ -699,7 +719,10 @@ int yaz_marc_write_marcxml(yaz_marc_t mt, WRBUF wr) /* http://www.loc.gov/marc/bibliographic/ecbdldrd.html#mrcblea */ if (!mt->leader_spec) yaz_marc_modify_leader(mt, 9, "a"); - return yaz_marc_write_marcxml_ns(mt, wr, "http://www.loc.gov/MARC21/slim", + char *name_space = "http://www.loc.gov/MARC21/slim"; + if (mt->output_format == YAZ_MARC_TMARCXML) + name_space = "http://www.indexdata.com/MARC21/turboxml"; + return yaz_marc_write_marcxml_ns(mt, wr, name_space, 0, 0); } @@ -714,7 +737,7 @@ int yaz_marc_write_marcxchange(yaz_marc_t mt, WRBUF wr, #if YAZ_HAVE_XML2 -void add_marc_datafield_xml2(yaz_marc_t mt, struct yaz_marc_node *n, xmlNode *record_ptr, xmlNsPtr ns_record, WRBUF wr_cdata, int identifier_length) +void add_marc_datafield_turbo_xml(yaz_marc_t mt, struct yaz_marc_node *n, xmlNode *record_ptr, xmlNsPtr ns_record, WRBUF wr_cdata, int identifier_length) { xmlNode *ptr; struct yaz_marc_subfield *s; @@ -738,10 +761,16 @@ void add_marc_datafield_xml2(yaz_marc_t mt, struct yaz_marc_node *n, xmlNode *re char ind_str[6]; char ind_val[2]; - sprintf(ind_str, "ind%d", i+1); ind_val[0] = n->u.datafield.indicator[i]; ind_val[1] = '\0'; - xmlNewProp(ptr, BAD_CAST ind_str, BAD_CAST ind_val); + if (!turbo) { + sprintf(ind_str, "ind%d", i+1); + xmlNewProp(ptr, BAD_CAST ind_str, BAD_CAST ind_val); + } + else { + sprintf(ind_str, "i%d", i+1); + xmlNewTextChild(ptr, ns_record, BAD_CAST ind_str, BAD_CAST ind_val); + } } } WRBUF subfield_name = wrbuf_alloc(); @@ -772,13 +801,21 @@ void add_marc_datafield_xml2(yaz_marc_t mt, struct yaz_marc_node *n, xmlNode *re (s->code_data[0] >= 'A' && s->code_data[0] <= 'Z')) { wrbuf_iconv_write(subfield_name, mt->iconv_cd,s->code_data, using_code_len); - ptr_subfield = xmlNewTextChild(ptr, ns_record, - BAD_CAST wrbuf_cstr(subfield_name), - BAD_CAST wrbuf_cstr(wr_cdata)); } - else - //TODO FIX - yaz_log(YLOG_WARN, "Dropping subfield: %s", s->code_data); + else { + char buffer[2*using_code_len + 1]; + int index; + for (index = 0; index < using_code_len; index++) { + sprintf(buffer + 2*index, "%02X", (unsigned char) s->code_data[index] & 0xFF); + }; + buffer[2*(index+1)] = 0; + wrbuf_puts(subfield_name, "-"); + wrbuf_puts(subfield_name, buffer); + yaz_log(YLOG_WARN, "Using numeric value in element name: %s", buffer); + } + ptr_subfield = xmlNewTextChild(ptr, ns_record, + BAD_CAST wrbuf_cstr(subfield_name), + BAD_CAST wrbuf_cstr(wr_cdata)); } } wrbuf_destroy(subfield_name); @@ -828,7 +865,7 @@ int yaz_marc_write_turbo_xml(yaz_marc_t mt, xmlNode **root_ptr, switch(n->which) { case YAZ_MARC_DATAFIELD: - add_marc_datafield_xml2(mt, n, record_ptr, ns_record, wr_cdata, identifier_length); + add_marc_datafield_turbo_xml(mt, n, record_ptr, ns_record, wr_cdata, identifier_length); break; case YAZ_MARC_CONTROLFIELD: wrbuf_rewind(wr_cdata); @@ -1141,12 +1178,45 @@ int yaz_marc_decode_buf (yaz_marc_t mt, const char *buf, int bsize, return r; } +void yaz_marc_set_read_format(yaz_marc_t mt, int format) +{ + if (mt) + mt->input_format = format; +} + +int yaz_marc_get_read_format(yaz_marc_t mt) +{ + if (mt) + return mt->input_format; + return -1; +} + + +void yaz_marc_set_write_format(yaz_marc_t mt, int format) +{ + if (mt) + mt->output_format = format; +} + +int yaz_marc_get_write_format(yaz_marc_t mt) +{ + if (mt) + return mt->output_format; + return -1; +} + + +/** + * Deprecated, use yaz_marc_set_write_format + */ void yaz_marc_xml(yaz_marc_t mt, int xmlmode) { if (mt) - mt->xml = xmlmode; + mt->output_format = xmlmode; } + + void yaz_marc_debug(yaz_marc_t mt, int level) { if (mt) @@ -1259,6 +1329,11 @@ void yaz_marc_write_turbo_format(yaz_marc_t mt, int enable) mt->turbo_format = enable; } +int yaz_marc_is_turbo_format(yaz_marc_t mt) +{ + return mt->turbo_format; +} + /* * Local variables: diff --git a/src/record_conv.c b/src/record_conv.c index 6f65797..41e9881 100644 --- a/src/record_conv.c +++ b/src/record_conv.c @@ -472,7 +472,8 @@ static int yaz_record_conv_record_rule(yaz_record_conv_t p, else ret = -1; } - else if (r->u.marc.input_format == YAZ_MARC_MARCXML) + else if (r->u.marc.input_format == YAZ_MARC_MARCXML || + r->u.marc.input_format == YAZ_MARC_TMARCXML) { xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record), wrbuf_len(record)); @@ -483,7 +484,7 @@ static int yaz_record_conv_record_rule(yaz_record_conv_t p, } else { - ret = yaz_marc_read_xml(mt, xmlDocGetRootElement(doc)); + ret = yaz_marc_read_xml(mt, xmlDocGetRootElement(doc)); if (ret) wrbuf_printf(p->wr_error, "yaz_marc_read_xml failed"); } -- 1.7.10.4