1 /* $Id: mod_dom.c,v 1.16 2007-02-18 21:53:22 adam Exp $
2 Copyright (C) 1995-2007
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
27 #include <yaz/diagbib1.h>
28 #include <yaz/tpath.h>
30 #include <libxml/xmlversion.h>
31 #include <libxml/parser.h>
32 #include <libxml/tree.h>
33 #include <libxml/xmlIO.h>
34 #include <libxml/xmlreader.h>
35 #include <libxslt/transform.h>
36 #include <libxslt/xsltutils.h>
39 #include <libexslt/exslt.h>
42 #include <idzebra/util.h>
43 #include <idzebra/recctrl.h>
45 /* DOM filter style indexing: namespace URI of the zebra indexing
   vocabulary.  process_xml_element_zebra_node() compares the namespace href
   of element nodes against zebra_dom_ns. */
46 #define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
47 static const char *zebra_dom_ns = ZEBRA_DOM_NS;
49 /* DOM filter style indexing: name of the processing instruction that
   process_xml_pi_node() looks for. */
50 #define ZEBRA_PI_NAME "zebra-2.0"
51 static const char *zebra_pi_name = ZEBRA_PI_NAME;
56 const char *stylesheet;
57 xsltStylesheetPtr stylesheet_xsp;
58 struct convert_s *next;
/* Settings of the extract section of the config file.  parse_dom() fills
   convert with the XSLT chain parsed from the children of the extract
   element.  Further members and the closing brace are not visible in this
   excerpt. */
61 struct filter_extract {
63 struct convert_s *convert;
67 struct convert_s *convert;
/* One retrieve section of the config file, kept in the singly linked list
   tinfo->retrieve_list.  lookup_retrieve() compares a requested element set
   name against the identifier and name of each entry; convert is the XSLT
   chain applied by filter_retrieve().  Not every member is visible in this
   excerpt. */
70 struct filter_retrieve {
72 const char *identifier;
73 struct convert_s *convert;
74 struct filter_retrieve *next;
/* Input kinds: passed as the type argument of new_input() and used as the
   case labels that select the extraction routine in filter_extract(). */
77 #define DOM_INPUT_XMLREADER 1
78 #define DOM_INPUT_MARC 2
82 struct convert_s *convert;
86 const char *input_charset;
91 xmlTextReaderPtr reader;
95 struct filter_input *next;
101 const char *profile_path;
104 xmlDocPtr doc_config;
105 struct filter_extract *extract;
106 struct filter_retrieve *retrieve_list;
107 struct filter_input *input_list;
108 struct filter_store *store;
/* Convenience wrappers that cast an xmlChar string to char* so that it can
   be handed to strcmp() and strlen().  The macro arguments are not
   parenthesised in the expansion, so pass simple expressions only. */
111 #define XML_STRCMP(a,b) strcmp((char*)a, b)
112 #define XML_STRLEN(a) strlen((char*)a)
/* Build the single-quoted form of value for the XSLT parameter called name.
   The buffer holds 3 + strlen(value) bytes (two quote characters plus the
   terminator) and is taken from odr.  The value is copied verbatim, without
   any escaping.
   The code that stores name and the quoted text in params is not visible in
   this excerpt. */
117 static void set_param_str(const char **params, const char *name,
118 const char *value, ODR odr)
120 char *quoted = odr_malloc(odr, 3 + strlen(value));
121 sprintf(quoted, "'%s'", value);
/* Integer counterpart of set_param_str(): formats value with ZINT_FORMAT
   between single quotes into a 30 byte buffer obtained with odr_malloc().
   The rest of the parameter list and the code that stores the result in
   params are not visible in this excerpt. */
129 static void set_param_int(const char **params, const char *name,
132 char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */
135 sprintf(quoted, "'" ZINT_FORMAT "'", value);
/* Allocate the filter state with xmalloc() and put it into its empty state:
   name, path, list and document pointers are cleared and two ODR memory
   handles are created, odr_record and odr_config.
   The return statement is not visible in this excerpt. */
141 static void *filter_init(Res res, RecType recType)
143 struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
145 tinfo->full_name = 0;
146 tinfo->profile_path = 0;
147 tinfo->odr_record = odr_createmem(ODR_ENCODE);
148 tinfo->odr_config = odr_createmem(ODR_ENCODE);
150 tinfo->retrieve_list = 0;
151 tinfo->input_list = 0;
153 tinfo->doc_config = 0;
/* Check whether attr is called name and has a text node as its first child.
   If so, *dst_content is pointed at the text of that node.  The pointer
   refers to memory owned by the XML tree and is not copied.
   Callers use the result as a boolean; the return statements themselves are
   not visible in this excerpt. */
162 static int attr_content(struct _xmlAttr *attr, const char *name,
163 const char **dst_content)
165 if (!XML_STRCMP(attr->name, name) && attr->children
166 && attr->children->type == XML_TEXT_NODE)
168 *dst_content = (const char *)(attr->children->content);
/* Release the compiled stylesheet of a convert step with
   xsltFreeStylesheet() when one is set.
   NOTE(review): presumably this walks the whole chain starting at c; the
   loop is not visible in this excerpt -- confirm. */
174 static void destroy_xsp(struct convert_s *c)
178 if (c->stylesheet_xsp)
179 xsltFreeStylesheet(c->stylesheet_xsp);
/* Release everything that was built from the config file: the stylesheets
   of the extract, store, input and retrieve chains, the XML text readers,
   the MARC handles with their iconv descriptors, and the parsed config
   document.  The list heads and doc_config are cleared afterwards, and
   odr_config is reset rather than destroyed. */
184 static void destroy_dom(struct filter_info *tinfo)
188 destroy_xsp(tinfo->extract->convert);
193 destroy_xsp(tinfo->store->convert);
196 if (tinfo->input_list)
198 struct filter_input *i_ptr;
199 for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
203 case DOM_INPUT_XMLREADER:
204 if (i_ptr->u.xmlreader.reader)
205 xmlFreeTextReader(i_ptr->u.xmlreader.reader);
208 yaz_iconv_close(i_ptr->u.marc.iconv);
209 yaz_marc_destroy(i_ptr->u.marc.handle);
212 destroy_xsp(i_ptr->convert);
214 tinfo->input_list = 0;
216 if (tinfo->retrieve_list)
218 struct filter_retrieve *r_ptr;
219 for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
220 destroy_xsp(r_ptr->convert);
221 tinfo->retrieve_list = 0;
224 if (tinfo->doc_config)
226 xmlFreeDoc(tinfo->doc_config);
227 tinfo->doc_config = 0;
229 odr_reset(tinfo->odr_config);
/* Parse the sibling nodes starting at ptr into convert steps delivered
   through l.  Non-element nodes are filtered out; the only element name
   tested for is xslt.  Each step is allocated from odr_config, takes its
   file name from the stylesheet attribute, has that name looked up with
   yaz_filepath_resolve() and is compiled with xsltParseStylesheetFile().
   Unknown attributes, a missing stylesheet attribute, an unresolvable file
   and a stylesheet that fails to parse are reported with YLOG_WARN.
   The return statements and the code that links steps into the chain are
   not visible in this excerpt. */
232 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
233 struct convert_s **l)
236 for(; ptr; ptr = ptr->next)
238 if (ptr->type != XML_ELEMENT_NODE)
240 if (!XML_STRCMP(ptr->name, "xslt"))
242 struct _xmlAttr *attr;
244 = odr_malloc(tinfo->odr_config, sizeof(*p));
248 p->stylesheet_xsp = 0;
250 for (attr = ptr->properties; attr; attr = attr->next)
251 if (attr_content(attr, "stylesheet", &p->stylesheet))
255 xmlChar *node_path = xmlGetNodePath(ptr);
256 yaz_log(YLOG_WARN, "%s: dom filter: "
257 "%s bad attribute @%s, "
258 "expected @stylesheet",
260 node_path, attr->name);
265 char tmp_xslt_full_name[1024];
266 if (!yaz_filepath_resolve(p->stylesheet,
271 yaz_log(YLOG_WARN, "%s: dom filter: "
272 "stylesheet %s not found in "
276 tinfo->profile_path);
281 = xsltParseStylesheetFile((const xmlChar*)
283 if (!p->stylesheet_xsp)
285 yaz_log(YLOG_WARN, "%s: dom filter: "
286 "could not parse xslt "
288 tinfo->fname, tmp_xslt_full_name);
294 xmlChar *node_path = xmlGetNodePath(ptr);
295 yaz_log(YLOG_WARN, "%s: dom filter: "
296 "%s missing attribute 'stylesheet' ",
297 tinfo->fname, node_path);
306 xmlChar *node_path = xmlGetNodePath(ptr);
310 tinfo->fname, node_path, ptr->name);
/* Run a document through every step of the convert chain in list order,
   applying the compiled stylesheet of each step with xsltApplyStylesheet()
   and recording that stylesheet in *last_xsp.
   Callers pass 0 for last_xsp when they do not need it (see
   convert_extract_doc()).  NOTE(review): the guard around the assignment to
   *last_xsp and the handling of res_doc are not visible in this excerpt --
   confirm. */
318 static ZEBRA_RES perform_convert(struct filter_info *tinfo,
319 struct convert_s *convert,
322 xsltStylesheetPtr *last_xsp)
324 for (; convert; convert = convert->next)
326 xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
329 *last_xsp = convert->stylesheet_xsp;
/* Append a new filter_input to the end of tinfo->input_list.  np follows
   the next links until it reaches the terminating null link, and the new
   node, allocated from odr_config, is stored there.
   The initialisation of the node and the return statement are not visible
   in this excerpt. */
336 static struct filter_input *new_input(struct filter_info *tinfo, int type)
338 struct filter_input *p;
339 struct filter_input **np = &tinfo->input_list;
340 for (;*np; np = &(*np)->next)
342 p = *np = odr_malloc(tinfo->odr_config, sizeof(*p));
/* Parse the children of an input element of the config file, starting at
   ptr.  Two element names are recognised:
     marc      - optional charset attribute (default "marc-8"); an iconv
                 descriptor converting from that charset to utf-8 is opened
                 and attached to a new MARC handle in a DOM_INPUT_MARC input.
     xmlreader - optional level attribute, converted with atoi() into
                 split_level of a new DOM_INPUT_XMLREADER input.
   Both branches call parse_convert() to fill the convert chain of the new
   input.  Unknown attributes, an unsupported charset and unknown elements
   are reported with YLOG_WARN.
   The remaining parameters and the return statements are not visible in
   this excerpt. */
351 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
355 for (; ptr; ptr = ptr->next)
357 if (ptr->type != XML_ELEMENT_NODE)
359 if (!XML_STRCMP(ptr->name, "marc"))
361 yaz_iconv_t iconv = 0;
362 const char *input_charset = "marc-8";
363 struct _xmlAttr *attr;
365 for (attr = ptr->properties; attr; attr = attr->next)
367 if (attr_content(attr, "charset", &input_charset))
371 xmlChar *node_path = xmlGetNodePath(ptr);
372 yaz_log(YLOG_WARN, "%s: dom filter: "
373 "%s bad attribute @%s,"
374 " expected @charset",
376 node_path, attr->name);
380 iconv = yaz_iconv_open("utf-8", input_charset);
383 xmlChar *node_path = xmlGetNodePath(ptr);
384 yaz_log(YLOG_WARN, "%s: dom filter: "
385 "%s unsupported @charset '%s'",
386 tinfo->fname, node_path,
393 struct filter_input *p
394 = new_input(tinfo, DOM_INPUT_MARC);
395 p->u.marc.handle = yaz_marc_create();
396 p->u.marc.iconv = iconv;
398 yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
402 parse_convert(tinfo, ptr, &p->convert);
407 else if (!XML_STRCMP(ptr->name, "xmlreader"))
409 struct filter_input *p
410 = new_input(tinfo, DOM_INPUT_XMLREADER);
411 struct _xmlAttr *attr;
412 const char *level_str = 0;
414 p->u.xmlreader.split_level = 0;
415 p->u.xmlreader.reader = 0;
417 for (attr = ptr->properties; attr; attr = attr->next)
419 if (attr_content(attr, "level", &level_str))
423 xmlChar *node_path = xmlGetNodePath(ptr);
424 yaz_log(YLOG_WARN, "%s: dom filter: "
425 "%s bad attribute @%s,"
427 tinfo->fname, node_path,
433 p->u.xmlreader.split_level = atoi(level_str);
437 parse_convert(tinfo, ptr, &p->convert);
442 xmlChar *node_path = xmlGetNodePath(ptr);
443 yaz_log(YLOG_WARN, "%s: dom filter: "
444 "%s bad element <%s>,"
445 " expected <marc>|<xmlreader>",
446 tinfo->fname, node_path, ptr->name);
/* Load and interpret the filter config file fname.
   The name is copied into odr_config and looked up with
   yaz_filepath_resolve() against profile_path; when that fails the name is
   used as given.  The file is parsed with xmlParseFile() and the document
   is kept in tinfo->doc_config because attribute strings stored in the
   filter structures point into it.  The root element must be called dom;
   its element children extract, retrieve, store and input are parsed in
   turn, each handing its children to parse_convert() or parse_input().
   Unknown attributes and elements are reported with YLOG_WARN.
   NOTE(review): the sample shown in the input branch uses an inputcharset
   attribute, while parse_input() only accepts charset -- confirm which one
   is intended.
   The return statements are not visible in this excerpt. */
454 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
456 char tmp_full_name[1024];
460 tinfo->fname = odr_strdup(tinfo->odr_config, fname);
462 if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path,
463 NULL, tmp_full_name))
464 tinfo->full_name = odr_strdup(tinfo->odr_config, tmp_full_name);
466 tinfo->full_name = odr_strdup(tinfo->odr_config, tinfo->fname);
468 yaz_log(YLOG_LOG, "%s dom filter: "
469 "loading config file %s", tinfo->fname, tinfo->full_name);
471 doc = xmlParseFile(tinfo->full_name);
474 yaz_log(YLOG_WARN, "%s: dom filter: "
475 "failed to parse config file %s",
476 tinfo->fname, tinfo->full_name);
479 /* save because we store ptrs to the content */
480 tinfo->doc_config = doc;
482 ptr = xmlDocGetRootElement(doc);
483 if (!ptr || ptr->type != XML_ELEMENT_NODE
484 || XML_STRCMP(ptr->name, "dom"))
/* NOTE(review): this branch is also entered when ptr is null, yet ptr is
   passed to xmlGetNodePath() and ptr->name is read in the log call below --
   confirm that a document without a root element cannot crash here. */
486 xmlChar *node_path = xmlGetNodePath(ptr);
487 yaz_log(YLOG_WARN, "%s: dom filter: "
488 "%s bad root element <%s>,"
489 " expected root element <dom>",
490 tinfo->fname, node_path, ptr->name);
495 for (ptr = ptr->children; ptr; ptr = ptr->next)
497 if (ptr->type != XML_ELEMENT_NODE)
499 if (!XML_STRCMP(ptr->name, "extract"))
502 <extract name="index">
503 <xslt stylesheet="first.xsl"/>
504 <xslt stylesheet="second.xsl"/>
507 struct _xmlAttr *attr;
508 struct filter_extract *f =
509 odr_malloc(tinfo->odr_config, sizeof(*f));
514 for (attr = ptr->properties; attr; attr = attr->next)
516 if (attr_content(attr, "name", &f->name))
520 xmlChar *node_path = xmlGetNodePath(ptr);
521 yaz_log(YLOG_WARN, "%s: dom filter: "
522 "%s bad attribute @%s"
525 node_path, attr->name);
529 parse_convert(tinfo, ptr->children, &f->convert);
531 else if (!XML_STRCMP(ptr->name, "retrieve"))
535 <xslt stylesheet="some.xsl"/>
536 <xslt stylesheet="some.xsl"/>
539 struct _xmlAttr *attr;
540 struct filter_retrieve **fp = &tinfo->retrieve_list;
541 struct filter_retrieve *f =
542 odr_malloc(tinfo->odr_config, sizeof(*f));
553 for (attr = ptr->properties; attr; attr = attr->next)
555 if (attr_content(attr, "identifier",
558 else if (attr_content(attr, "name", &f->name))
562 xmlChar *node_path = xmlGetNodePath(ptr);
563 yaz_log(YLOG_WARN, "%s: dom filter: "
564 "%s bad attribute @%s"
565 " expected @identifier|@name",
567 node_path, attr->name);
571 parse_convert(tinfo, ptr->children, &f->convert);
573 else if (!XML_STRCMP(ptr->name, "store"))
577 <xslt stylesheet="some.xsl"/>
578 <xslt stylesheet="some.xsl"/>
581 struct filter_store *f =
582 odr_malloc(tinfo->odr_config, sizeof(*f));
586 parse_convert(tinfo, ptr->children, &f->convert);
588 else if (!XML_STRCMP(ptr->name, "input"))
592 <xmlreader level="1"/>
594 <input syntax="usmarc">
595 <marc inputcharset="marc-8"/>
598 struct _xmlAttr *attr;
599 const char *syntax = 0;
600 const char *name = 0;
601 for (attr = ptr->properties; attr; attr = attr->next)
603 if (attr_content(attr, "syntax", &syntax))
605 else if (attr_content(attr, "name", &name))
609 xmlChar *node_path = xmlGetNodePath(ptr);
610 yaz_log(YLOG_WARN, "%s: dom filter: "
611 "%s bad attribute @%s"
612 " expected @syntax|@name",
614 node_path, attr->name);
618 parse_input(tinfo, ptr->children, syntax, name);
622 xmlChar *node_path = xmlGetNodePath(ptr);
623 yaz_log(YLOG_WARN, "%s: dom filter: "
624 "%s bad element <%s>,"
625 " expected <extract>|<input>|<retrieve>|<store>",
626 tinfo->fname, node_path, ptr->name);
/* Find the retrieve entry for the element set name est in
   tinfo->retrieve_list.  According to the comments below, the first entry
   is returned when no est is given; otherwise the list is searched for an
   entry whose identifier or name equals est.
   The second parameter and the return statements are not visible in this
   excerpt. */
634 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
637 struct filter_retrieve *f = tinfo->retrieve_list;
639 /* return first schema if no est is provided */
642 for (; f; f = f->next)
644 /* find requested schema */
647 if (f->identifier && !strcmp(f->identifier, est))
649 if (f->name && !strcmp(f->name, est))
/* Configure the filter from args, the name of the config file.  A warning
   is logged when a config file is required but none is given.  The name is
   compared with the one already stored in tinfo->fname; the action taken on
   a match is not visible in this excerpt.  Otherwise profile_path is read
   from the profilePath resource and the file is loaded with parse_dom(),
   whose result is returned. */
656 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
658 struct filter_info *tinfo = clientData;
661 yaz_log(YLOG_WARN, "dom filter: need config file");
665 if (tinfo->fname && !strcmp(args, tinfo->fname))
668 tinfo->profile_path = res_get(res, "profilePath");
671 return parse_dom(tinfo, args);
/* Destroy the filter state: both ODR handles created by filter_init() are
   destroyed.  Any further cleanup steps are not visible in this excerpt. */
674 static void filter_destroy(void *clientData)
676 struct filter_info *tinfo = clientData;
678 odr_destroy(tinfo->odr_config);
679 odr_destroy(tinfo->odr_record);
/* Read callback handed to libxml2 during extraction.  context is the
   recExtractCtrl of the current record; up to len bytes are read into
   buffer through the readf function of its stream, and the result of readf
   is returned unchanged. */
683 static int ioread_ex(void *context, char *buffer, int len)
685 struct recExtractCtrl *p = context;
686 return p->stream->readf(p->stream, buffer, len);
/* Close callback handed to libxml2 together with ioread_ex(); the body is
   not visible in this excerpt. */
689 static int ioclose_ex(void *context)
/* Variant of attr_content() that delivers the text as xmlChar*: when attr
   is called name and its first child is a text node, *dst_content is
   pointed at the content of that node (no copy is made).
   Callers use the result as a boolean; the return statements are not
   visible in this excerpt. */
695 /* DOM filter style indexing */
696 static int attr_content_xml(struct _xmlAttr *attr, const char *name,
697 xmlChar **dst_content)
699 if (0 == XML_STRCMP(attr->name, name) && attr->children
700 && attr->children->type == XML_TEXT_NODE)
702 *dst_content = (attr->children->content);
/* Index the text content of node under every index named in index_p.
   index_p is scanned as a list of entries separated by spaces, each being
   an index name optionally followed by a colon and an index type.  For
   every entry the name is copied into index and the type into type, the
   entry is logged at debug level and the text is passed to
   extctr->tokenAdd() through recword, with only the first character of
   type used as recword->index_type.
   NOTE(review): xmlNodeGetContent() may return NULL according to the
   libxml2 documentation, and strlen() is applied to its result directly --
   confirm.
   The remaining parameters, the declarations of index, type, bval and eval,
   and the release of text are not visible in this excerpt. */
709 /* DOM filter style indexing */
710 static void index_value_of(struct filter_info *tinfo,
711 struct recExtractCtrl *extctr,
716 xmlChar *text = xmlNodeGetContent(node);
717 size_t text_len = strlen((const char *)text);
720 /* if there is no text, we do not need to proceed */
723 xmlChar *look = index_p;
730 /* assigning text to be indexed */
731 recword->term_buf = (const char *)text;
732 recword->term_len = text_len;
734 /* parsing all index name/type pairs */
735 /* may not start with ' ' or ':' */
736 while (*look && ' ' != *look && ':' != *look)
738 /* setting name and type to zero */
742 /* parsing one index name */
744 while (*look && ':' != *look && ' ' != *look)
749 strncpy((char *)index, (const char *)bval, eval - bval);
750 index[eval - bval] = '\0';
753 /* parsing one index type, if existing */
759 while (*look && ' ' != *look)
764 strncpy((char *)type, (const char *)bval, eval - bval);
765 type[eval - bval] = '\0';
768 /* actually indexing the text given */
769 yaz_log(YLOG_DEBUG, "%s dom filter: "
770 "INDEX '%s:%s' '%s'",
771 tinfo->fname, index, type, text);
773 recword->index_name = (const char *)index;
775 recword->index_type = *type;
776 (extctr->tokenAdd)(recword);
778 /* eat whitespaces */
779 if (*look && ' ' == *look && *(look+1))
/* Transfer record level information to the extract control block: id_p is
   scanned with "%255s" into extctr->match_criteria and rank_p is converted
   with atozint() into extctr->staticrank.  type_p is only logged by the
   visible code; handling of the record type is commented out below.
   NOTE(review): "%255s" assumes match_criteria holds at least 256 bytes and
   stops at the first whitespace character -- confirm.
   The guards around the two assignments are not visible in this excerpt. */
790 /* DOM filter style indexing */
791 static void set_record_info(struct filter_info *tinfo,
792 struct recExtractCtrl *extctr,
797 yaz_log(YLOG_DEBUG, "%s dom filter: "
798 "RECORD id=%s rank=%s type=%s",
799 tinfo->fname, id_p, rank_p, type_p);
802 sscanf((const char *)id_p, "%255s", extctr->match_criteria);
805 extctr->staticrank = atozint((const char *)rank_p);
807 /* if (!strcmp("update", type_str)) */
808 /* index_node(tinfo, ctrl, ptr, recword); */
809 /* else if (!strcmp("delete", type_str)) */
810 /* yaz_log(YLOG_WARN, "dom filter delete: to be implemented"); */
812 /* yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'", */
818 /* DOM filter style indexing */
818 /* DOM filter style indexing */
819 static void process_xml_element_zebra_node(struct filter_info *tinfo,
820 struct recExtractCtrl *extctr,
824 if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href
825 && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns))
827 if (0 == XML_STRCMP(node->name, "index"))
829 xmlChar *index_p = 0;
831 struct _xmlAttr *attr;
832 for (attr = node->properties; attr; attr = attr->next)
834 if (attr_content_xml(attr, "name", &index_p))
836 index_value_of(tinfo, extctr, recword,node, index_p);
840 xmlChar *node_path = xmlGetNodePath(node);
841 yaz_log(YLOG_WARN,"%s dom filter: "
842 "%s bad attribute @%s, expected @name",
843 tinfo->fname, node_path, attr->name);
848 else if (0 == XML_STRCMP(node->name, "record"))
854 struct _xmlAttr *attr;
855 for (attr = node->properties; attr; attr = attr->next)
857 if (attr_content_xml(attr, "id", &id_p))
859 else if (attr_content_xml(attr, "rank", &rank_p))
861 else if (attr_content_xml(attr, "type", &type_p))
865 xmlChar *node_path = xmlGetNodePath(node);
866 yaz_log(YLOG_WARN,"%s dom filter: "
867 "%s bad attribute @%s,"
868 " expected @id|@rank|@type",
869 tinfo->fname, node_path, attr->name);
873 if (type_p && 0 != strcmp("update", (const char *)type_p))
875 xmlChar *node_path = xmlGetNodePath(node);
876 yaz_log(YLOG_WARN,"%s dom filter: "
878 " only implemented '@type='update'",
879 tinfo->fname, node_path, attr->name);
885 set_record_info(tinfo, extctr, id_p, rank_p, type_p);
889 xmlChar *node_path = xmlGetNodePath(node);
890 yaz_log(YLOG_WARN,"%s dom filter: "
891 "%s bad element <%s>,"
892 " expected <record>|<index> in namespace '%s'",
893 tinfo->fname, node_path,
894 node->name, zebra_dom_ns);
/* Interpret a processing instruction node whose name equals zebra_pi_name.
   The content is scanned by hand:
     record ...  - optional id= and rank= values, each running up to the
                   next space, are copied into id and rank and passed to
                   set_record_info(); trailing text that cannot be parsed is
                   reported.
     index ...   - the rest of the content is exported to the caller (see
                   the comment in that branch).
   Any other content is reported with YLOG_WARN.
   The remaining parameters and the declarations of id, rank, bval and eval
   are not visible in this excerpt. */
901 /* DOM filter style indexing */
902 static void process_xml_pi_node(struct filter_info *tinfo,
903 struct recExtractCtrl *extctr,
907 /* if right PI name, continue parsing PI */
908 if (0 == strcmp(zebra_pi_name, (const char *)node->name))
910 xmlChar *pi_p = node->content;
911 xmlChar *look = pi_p;
916 /* parsing PI record instructions */
917 if (0 == strncmp((const char *)look, "record", 6))
930 while (*look && ' ' == *look && *(look+1))
933 /* parse possible id */
934 if (*look && 0 == strncmp((const char *)look, "id=", 3))
938 while (*look && ' ' != *look)
941 strncpy((char *)id, (const char *)bval, eval - bval);
942 id[eval - bval] = '\0';
946 while (*look && ' ' == *look && *(look+1))
949 /* parse possible rank */
950 if (*look && 0 == strncmp((const char *)look, "rank=", 5))
954 while (*look && ' ' != *look)
957 strncpy((char *)rank, (const char *)bval, eval - bval);
958 rank[eval - bval] = '\0';
962 while (*look && ' ' == *look && *(look+1))
965 if (look && '\0' != *look)
967 xmlChar *node_path = xmlGetNodePath(node);
968 yaz_log(YLOG_WARN,"%s dom filter: "
969 "%s content '%s', can not parse '%s'",
970 tinfo->fname, node_path, pi_p, look);
974 set_record_info(tinfo, extctr, id, rank, 0);
977 /* parsing index instruction */
978 else if (0 == strncmp((const char *)look, "index", 5))
983 while (*look && ' ' == *look && *(look+1))
986 /* export index instructions to outside */
991 xmlChar *node_path = xmlGetNodePath(node);
992 yaz_log(YLOG_WARN,"%s dom filter: "
993 "%s content '%s', can not parse '%s'",
994 tinfo->fname, node_path, pi_p, look);
/* Walk the tree below node recursively.  The node itself is first offered
   to process_xml_element_zebra_node().  Its children are then visited in
   document order: processing instructions go to process_xml_pi_node(),
   which may leave an index instruction in index_p, and element children
   are indexed with that instruction through index_value_of() before the
   function recurses into them.
   The remaining parameters, the condition guarding index_value_of() and
   any reset of index_p are not visible in this excerpt. */
1000 /* DOM filter style indexing */
1001 static void process_xml_element_node(struct filter_info *tinfo,
1002 struct recExtractCtrl *extctr,
1006 /* remember indexing instruction from PI to next element node */
1007 xmlChar *index_p = 0;
1009 /* check if we are an element node in the special zebra namespace
1010 and either set record data or index value-of node content*/
1011 process_xml_element_zebra_node(tinfo, extctr, recword, node);
1013 /* loop through kid nodes */
1014 for (node = node->children; node; node = node->next)
1016 /* check and set PI record and index instructions */
1017 if (node->type == XML_PI_NODE)
1019 process_xml_pi_node(tinfo, extctr, node, &index_p);
1021 else if (node->type == XML_ELEMENT_NODE)
1023 /* if there was a PI index instruction before this element */
1026 index_value_of(tinfo, extctr, recword, node, index_p);
1029 process_xml_element_node(tinfo, extctr, recword,node);
/* Index a whole document.  A single recword is initialised through
   extctr->init and reused for all terms.  When flagShowRecords is set the
   document is serialised with xmlDocDumpMemory() and written to stdout.
   The tree walk then starts at the document node itself, cast to
   xmlNodePtr.
   The last parameter, the local declarations and the release of buf_out
   are not visible in this excerpt. */
1037 /* DOM filter style indexing */
1038 static void extract_dom_doc_node(struct filter_info *tinfo,
1039 struct recExtractCtrl *extctr,
1045 /* only need to do the initialization once, reuse recword for all terms */
1047 (*extctr->init)(extctr, &recword);
1049 if (extctr->flagShowRecords)
1051 xmlDocDumpMemory(doc, &buf_out, &len_out);
1052 fwrite(buf_out, len_out, 1, stdout);
1056 process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
/* Run one parsed record through the configured pipeline:
     1. input conversion  - the convert chain of the input is applied to doc;
     2. store conversion  - a copy of doc is run through the store chain,
        serialised (with xsltSaveResultToString() or xmlDocDumpMemory();
        the condition choosing between them is not visible here) and handed
        to p->setStoreData, optionally echoed to stdout;
     3. extract conversion - the extract chain is applied to doc;
     4. indexing - the result is indexed by extract_dom_doc_node().
   The XSLT parameter schema is set to zebra_dom_ns in params, using
   odr_record as allocator.  The visible return value is
   RECCTRL_EXTRACT_OK.
   The last parameter, the guards on tinfo->store and tinfo->extract and the
   release of doc and buf_out are not visible in this excerpt. */
1062 static int convert_extract_doc(struct filter_info *tinfo,
1063 struct filter_input *input,
1064 struct recExtractCtrl *p,
1070 const char *params[10];
1071 xsltStylesheetPtr last_xsp = 0;
1072 xmlDocPtr store_doc = 0;
1075 set_param_str(params, "schema", zebra_dom_ns, tinfo->odr_record);
1077 /* input conversion */
1078 perform_convert(tinfo, input->convert, params, &doc, 0);
1082 /* store conversion */
1083 store_doc = xmlCopyDoc(doc, 1);
1084 perform_convert(tinfo, tinfo->store->convert,
1085 params, &store_doc, &last_xsp);
1089 xsltSaveResultToString(&buf_out, &len_out,
1090 store_doc ? store_doc : doc, last_xsp);
1092 xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1093 if (p->flagShowRecords)
1094 fwrite(buf_out, len_out, 1, stdout);
1095 (*p->setStoreData)(p, buf_out, len_out);
1099 xmlFreeDoc(store_doc);
1101 /* extract conversion */
1102 perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0);
1104 /* finally, do the indexing */
1107 extract_dom_doc_node(tinfo, p, doc);
1108 /* extract_doc_alvis(tinfo, p, doc); */
1112 return RECCTRL_EXTRACT_OK;
/* Extract the next record from an XML stream that is split at a fixed
   element depth.  On the first record of a stream any reader left over is
   freed and a new one is created with xmlReaderForIO() on top of
   ioread_ex()/ioclose_ex().  The reader is advanced node by node; when an
   element at depth split_level is reached it is expanded, copied into a
   fresh document and passed to convert_extract_doc(), whose result is
   returned.  The reader is freed and RECCTRL_EXTRACT_ERROR_GENERIC returned
   on the visible failure path; when reading ends the reader is freed and
   RECCTRL_EXTRACT_EOF returned.
   The loop header and several conditions are not visible in this
   excerpt. */
1115 static int extract_xml_split(struct filter_info *tinfo,
1116 struct filter_input *input,
1117 struct recExtractCtrl *p)
1121 if (p->first_record)
1123 if (input->u.xmlreader.reader)
1124 xmlFreeTextReader(input->u.xmlreader.reader);
1125 input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1126 p /* I/O handler */,
1132 if (!input->u.xmlreader.reader)
1133 return RECCTRL_EXTRACT_ERROR_GENERIC;
1135 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1138 int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1139 int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
1140 if (type == XML_READER_TYPE_ELEMENT &&
1141 input->u.xmlreader.split_level == depth)
1144 = xmlTextReaderExpand(input->u.xmlreader.reader);
1147 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1148 xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1150 xmlDocSetRootElement(doc, ptr2);
1152 return convert_extract_doc(tinfo, input, p, doc);
1156 xmlFreeTextReader(input->u.xmlreader.reader);
1157 input->u.xmlreader.reader = 0;
1158 return RECCTRL_EXTRACT_ERROR_GENERIC;
1161 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1163 xmlFreeTextReader(input->u.xmlreader.reader);
1164 input->u.xmlreader.reader = 0;
1165 return RECCTRL_EXTRACT_EOF;
/* Extract a stream that holds exactly one XML record.  On the first call
   for a stream the whole input is parsed with xmlReadIO() (XInclude
   processing and entity substitution are requested) and the document is
   passed to convert_extract_doc().  RECCTRL_EXTRACT_ERROR_GENERIC is
   returned on the visible failure path and RECCTRL_EXTRACT_EOF on later
   calls. */
1168 static int extract_xml_full(struct filter_info *tinfo,
1169 struct filter_input *input,
1170 struct recExtractCtrl *p)
1172 if (p->first_record) /* only one record per stream */
1174 xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex,
1175 p /* I/O handler */,
1178 XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1181 return RECCTRL_EXTRACT_ERROR_GENERIC;
1183 return convert_extract_doc(tinfo, input, p, doc);
1186 return RECCTRL_EXTRACT_EOF;
/* Extract one ISO 2709 (MARC) record from the stream.
   The first five bytes are read as the record length.  While the first
   byte is not a digit it is reported as a bad byte and one more byte is
   read into the last position of the five byte window.  A length below 25,
   a short read of the record body and a parse failure in
   yaz_marc_read_iso2709() each log a warning and return
   RECCTRL_EXTRACT_ERROR_GENERIC; running out of input while reading the
   length returns RECCTRL_EXTRACT_EOF.  A parsed record is written as XML
   with yaz_marc_write_xml(), wrapped in a new document and passed to
   convert_extract_doc().
   NOTE(review): the declaration of buf is not visible here; the read of
   record_length - 5 bytes assumes buf can hold the largest length that five
   digits can express -- confirm. */
1189 static int extract_iso2709(struct filter_info *tinfo,
1190 struct filter_input *input,
1191 struct recExtractCtrl *p)
1197 if (p->stream->readf(p->stream, buf, 5) != 5)
1198 return RECCTRL_EXTRACT_EOF;
1199 while (*buf < '0' || *buf > '9')
1203 yaz_log(YLOG_WARN, "%s dom filter: "
1204 "MARC: Skipping bad byte %d (0x%02X)",
1205 tinfo->fname, *buf & 0xff, *buf & 0xff);
1206 for (i = 0; i<4; i++)
1209 if (p->stream->readf(p->stream, buf+4, 1) != 1)
1210 return RECCTRL_EXTRACT_EOF;
1212 record_length = atoi_n (buf, 5);
1213 if (record_length < 25)
1215 yaz_log (YLOG_WARN, "%s dom filter: "
1216 "MARC record length < 25, is %d",
1217 tinfo->fname, record_length);
1218 return RECCTRL_EXTRACT_ERROR_GENERIC;
1220 read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1221 if (read_bytes < record_length-5)
1223 yaz_log (YLOG_WARN, "%s dom filter: "
1224 "Couldn't read whole MARC record",
1226 return RECCTRL_EXTRACT_ERROR_GENERIC;
1228 r = yaz_marc_read_iso2709(input->u.marc.handle, buf, record_length);
1229 if (r < record_length)
1231 yaz_log (YLOG_WARN, "%s dom filter: "
1232 "Parsing of MARC record failed r=%d length=%d",
1233 tinfo->fname, r, record_length);
1234 return RECCTRL_EXTRACT_ERROR_GENERIC;
1240 yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0);
1241 rdoc = xmlNewDoc((const xmlChar*) "1.0");
1242 xmlDocSetRootElement(rdoc, root_ptr);
1243 return convert_extract_doc(tinfo, input, p, rdoc);
1245 return RECCTRL_EXTRACT_OK;
/* Extraction entry point of the filter.  The head of tinfo->input_list
   selects the input handling: an XML reader input is extracted as one full
   document when split_level is 0 and split at that depth otherwise; a MARC
   input is read as ISO 2709.  odr_record is reset before each record.
   RECCTRL_EXTRACT_ERROR_GENERIC is returned on the remaining visible
   paths. */
1248 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1250 struct filter_info *tinfo = clientData;
1251 struct filter_input *input = tinfo->input_list;
1254 return RECCTRL_EXTRACT_ERROR_GENERIC;
1256 odr_reset(tinfo->odr_record);
1259 case DOM_INPUT_XMLREADER:
1260 if (input->u.xmlreader.split_level == 0)
1261 return extract_xml_full(tinfo, input, p);
1263 return extract_xml_split(tinfo, input, p);
1265 case DOM_INPUT_MARC:
1266 return extract_iso2709(tinfo, input, p);
1268 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* Read callback handed to libxml2 during retrieval.  context is the
   recRetrieveCtrl of the current request; up to len bytes are read into
   buffer through the readf function of its stream, and the result of readf
   is returned unchanged. */
1271 static int ioread_ret(void *context, char *buffer, int len)
1273 struct recRetrieveCtrl *p = context;
1274 return p->stream->readf(p->stream, buffer, len);
/* Close callback handed to xmlReadIO() together with ioread_ret(); the body
   is not visible in this excerpt. */
1277 static int ioclose_ret(void *context)
/* Retrieval entry point of the filter.
   The element set name is taken from the simple or the complex record
   composition of the request and resolved with lookup_retrieve(); the
   diagnostic for an invalid element set name appears on the visible failure
   path.  XSLT parameters (id, filename, rank, schema, score, size) are
   built with set_param_int()/set_param_str() using the ODR of the request.
   The stored record is parsed with xmlReadIO(), run through the convert
   chain of the retrieve entry and serialised with xsltSaveResultToString()
   or xmlDocDumpMemory() (the condition choosing between them is not visible
   here).  The result is copied into memory from p->odr and returned as
   VAL_TEXT_XML or VAL_SUTRS; other syntaxes set
   YAZ_BIB1_RECORD_SYNTAX_UNSUPP, and parse or conversion failures set
   YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS.
   NOTE(review): the XML branch tests p->input_format while the SUTRS
   branch tests p->output_format -- confirm which field is meant.
   The release of buf_out and doc and the return statements are not visible
   in this excerpt. */
1282 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
1284 /* const char *esn = zebra_dom_ns; */
1285 const char *esn = 0;
1286 const char *params[32];
1287 struct filter_info *tinfo = clientData;
1289 struct filter_retrieve *retrieve;
1290 xsltStylesheetPtr last_xsp = 0;
1294 if (p->comp->which == Z_RecordComp_simple
1295 && p->comp->u.simple->which == Z_ElementSetNames_generic)
1297 esn = p->comp->u.simple->u.generic;
1299 else if (p->comp->which == Z_RecordComp_complex
1300 && p->comp->u.complex->generic->elementSpec
1301 && p->comp->u.complex->generic->elementSpec->which ==
1302 Z_ElementSpec_elementSetName)
1304 esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1307 retrieve = lookup_retrieve(tinfo, esn);
1311 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1316 set_param_int(params, "id", p->localno, p->odr);
1318 set_param_str(params, "filename", p->fname, p->odr);
1319 if (p->staticrank >= 0)
1320 set_param_int(params, "rank", p->staticrank, p->odr);
1323 set_param_str(params, "schema", esn, p->odr);
1326 set_param_str(params, "schema", retrieve->name, p->odr);
1327 else if (retrieve->identifier)
1328 set_param_str(params, "schema", retrieve->identifier, p->odr);
1330 set_param_str(params, "schema", "", p->odr);
1333 set_param_int(params, "score", p->score, p->odr);
1334 set_param_int(params, "size", p->recordSize, p->odr);
1336 doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1339 XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1342 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1346 /* retrieve conversion */
1347 perform_convert(tinfo, retrieve->convert, params, &doc, &last_xsp);
1350 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1352 else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML)
1358 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1360 xmlDocDumpMemory(doc, &buf_out, &len_out);
1362 p->output_format = VAL_TEXT_XML;
1363 p->rec_len = len_out;
1364 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1365 memcpy(p->rec_buf, buf_out, p->rec_len);
1368 else if (p->output_format == VAL_SUTRS)
1374 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1376 xmlDocDumpMemory(doc, &buf_out, &len_out);
1378 p->output_format = VAL_SUTRS;
1379 p->rec_len = len_out;
1380 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1381 memcpy(p->rec_buf, buf_out, p->rec_len);
1387 p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
/* Record type descriptor of this filter; the members of the initializer
   are not visible in this excerpt. */
1393 static struct recType filter_type = {
1404 #ifdef IDZEBRA_STATIC_DOM
1417 * indent-tabs-mode: nil
1419 * vim: shiftwidth=4 tabstop=8 expandtab