1 /* $Id: xslt.c,v 1.8 2005-06-07 11:36:38 adam Exp $
2 Copyright (C) 1995-2005
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
27 #include <yaz/diagbib1.h>
28 #include <libxml/xmlversion.h>
29 #include <libxml/parser.h>
30 #include <libxml/tree.h>
31 #include <libxml/xmlreader.h>
32 #include <libxslt/transform.h>
34 #include <idzebra/util.h>
35 #include <idzebra/recctrl.h>
/* One entry in the singly linked list of schemas built by create_schemas().
   The string members are filled from XML attribute content via
   attr_content(), which stores pointers into the parsed document rather
   than copies.
   NOTE(review): this listing has lost some source lines; a `name` member is
   used elsewhere (schema->name) but its declaration is not visible here. */
37 struct filter_schema {
/* Content of the "identifier" attribute, or 0 if absent. */
39 const char *identifier;
/* Content of the "stylesheet" attribute; passed to xsltParseStylesheetFile(). */
40 const char *stylesheet;
/* Next schema in the list (new entries are pushed at the head). */
41 struct filter_schema *next;
/* Content of the "default" attribute; lookup_schema() only tests it for
   being non-null. */
42 const char *default_schema;
/* Parsed stylesheet, or 0; released with xsltFreeStylesheet(). */
43 xsltStylesheetPtr stylesheet_xsp;
/* NOTE(review): the opening of this struct and some members (doc, fname,
   odr are used through `tinfo->` elsewhere) are missing from the listing.
   The members below belong to `struct filter_info`. */
/* Split depth as text; converted with atoi() in extract_split().  0 means
   no level was configured. */
49 const char *split_level;
/* Content of the "path" attribute of a <split> element; only tested for
   being non-null in filter_extract(). */
50 const char *split_path;
/* Head of the schema list built by create_schemas(). */
52 struct filter_schema *schemas;
/* Streaming reader used by extract_split(); freed in filter_destroy(). */
53 xmlTextReaderPtr reader;
/* Namespace of the indexing elements that index_node() looks for; also the
   key extract_doc() uses to look up the indexing schema. */
56 #define ZEBRA_INDEX_NS "http://indexdata.dk/zebra/indexing/1"
/* Initial element set name in filter_retrieve(). */
57 #define ZEBRA_SCHEMA_IDENTITY_NS "http://indexdata.dk/zebra/identity/1"
/* String form of ZEBRA_INDEX_NS, compared against node namespace hrefs. */
58 static const char *zebra_index_ns = ZEBRA_INDEX_NS;
/* Adds a parameter named `name` with value `value` to the `params` array.
   filter_retrieve() uses it for the "snippet" parameter.
   NOTE(review): the body is missing from this listing, so how the value is
   stored (and whether `odr` is used for allocation) cannot be confirmed. */
60 static void set_param_xml(const char **params, const char *name,
61 const char *value, ODR odr)
/* Adds a string parameter: `value` is wrapped in single quotes in a buffer
   allocated from `odr`.
   NOTE(review): the lines that place name/quoted into `params` are missing
   from this listing.  The value is copied verbatim, so a value containing a
   single quote is not escaped -- confirm this is acceptable for callers. */
70 static void set_param_str(const char **params, const char *name,
71 const char *value, ODR odr)
/* 3 extra bytes: two quote characters plus the terminating NUL. */
73 char *quoted = odr_malloc(odr, 3 + strlen(value));
74 sprintf(quoted, "'%s'", value);
/* Adds an integer parameter: `value` is formatted with ZINT_FORMAT between
   single quotes in a 30-byte buffer allocated from `odr`.
   NOTE(review): the rest of the parameter list and the lines that store the
   result into `params` are missing from this listing. */
82 static void set_param_int(const char **params, const char *name,
85 char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */
88 sprintf(quoted, "'" ZINT_FORMAT "'", value);
/* Allocates a filter_info with xmalloc(), clears the split settings and
   creates an encoding ODR stream for it.
   NOTE(review): initialisation of the remaining members and the return
   statement are missing from this listing; filter_init_xslt1() treats the
   result as the struct filter_info pointer. */
95 static void *filter_init_xslt(Res res, RecType recType)
97 struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
/* No splitting by default: filter_extract() then uses extract_full(). */
100 tinfo->split_level = 0;
101 tinfo->split_path = 0;
102 tinfo->odr = odr_createmem(ODR_ENCODE);
/* Same as filter_init_xslt(), but presets split_level to "1" so that
   filter_extract() takes the extract_split() path.
   NOTE(review): the return statement is missing from this listing. */
108 static void *filter_init_xslt1(Res res, RecType recType)
110 struct filter_info *tinfo = (struct filter_info *)
111 filter_init_xslt(res, recType);
112 tinfo->split_level = "1";
/* If `attr` is named `name` and its first child is a text node, stores a
   pointer to that text in *dst_content.  The pointer refers to memory owned
   by the XML document; nothing is copied.
   NOTE(review): the return statements are missing from this listing, so the
   meaning of the int result cannot be confirmed. */
116 static int attr_content(struct _xmlAttr *attr, const char *name,
117 const char **dst_content)
119 if (!strcmp(attr->name, name) && attr->children &&
120 attr->children->type == XML_TEXT_NODE)
122 *dst_content = attr->children->content;
/* Walks tinfo->schemas, freeing each parsed stylesheet, then frees the
   configuration document.
   NOTE(review): the loop header, the freeing of each schema node and the
   resetting of tinfo members are missing from this listing. */
128 static void destroy_schemas(struct filter_info *tinfo)
130 struct filter_schema *schema = tinfo->schemas;
/* Save the successor before the current node is disposed of. */
133 struct filter_schema *schema_next = schema->next;
134 if (schema->stylesheet_xsp)
135 xsltFreeStylesheet(schema->stylesheet_xsp);
137 schema = schema_next;
/* The schema strings point into this document (see attr_content), so it is
   released only after the list has been walked. */
142 xmlFreeDoc(tinfo->doc);
/* Parses the XML configuration file `fname` and fills tinfo->schemas and the
   split settings from the children of its "schemaInfo" root element.
   NOTE(review): braces, declarations and all return statements are missing
   from this listing, so the error paths and result values are not visible. */
146 static ZEBRA_RES create_schemas(struct filter_info *tinfo, const char *fname)
/* Keep a private copy of the name; filter_config() compares against it. */
149 tinfo->fname = xstrdup(fname);
150 tinfo->doc = xmlParseFile(tinfo->fname);
/* NOTE(review): no null check on tinfo->doc is visible before this call;
   confirm one exists on the missing lines. */
153 ptr = xmlDocGetRootElement(tinfo->doc);
/* The root must be an element named "schemaInfo". */
154 if (!ptr || ptr->type != XML_ELEMENT_NODE ||
155 strcmp(ptr->name, "schemaInfo"))
157 for (ptr = ptr->children; ptr; ptr = ptr->next)
/* Only element children are of interest (text, comments etc. are tested
   for here; the statement taken for them is not visible). */
159 if (ptr->type != XML_ELEMENT_NODE)
161 if (!strcmp(ptr->name, "schema"))
163 struct _xmlAttr *attr;
164 struct filter_schema *schema = xmalloc(sizeof(*schema));
166 schema->identifier = 0;
167 schema->stylesheet = 0;
168 schema->default_schema = 0;
/* Push the new schema at the head of the list. */
169 schema->next = tinfo->schemas;
170 schema->stylesheet_xsp = 0;
171 tinfo->schemas = schema;
/* Each attribute is offered to every setter; attr_content() only stores
   when the attribute name matches. */
172 for (attr = ptr->properties; attr; attr = attr->next)
174 attr_content(attr, "identifier", &schema->identifier);
175 attr_content(attr, "name", &schema->name);
176 attr_content(attr, "stylesheet", &schema->stylesheet);
177 attr_content(attr, "default", &schema->default_schema);
/* Compile the stylesheet now; stylesheet_xsp stays 0 when no stylesheet
   attribute was given. */
179 if (schema->stylesheet)
180 schema->stylesheet_xsp =
181 xsltParseStylesheetFile(
182 (const xmlChar*) schema->stylesheet);
184 else if (!strcmp(ptr->name, "split"))
186 struct _xmlAttr *attr;
187 for (attr = ptr->properties; attr; attr = attr->next)
189 attr_content(attr, "level", &tinfo->split_level);
190 attr_content(attr, "path", &tinfo->split_path);
/* Presumably the branch for elements that are neither "schema" nor "split"
   (the `else` line itself is not visible in this listing). */
195 yaz_log(YLOG_WARN, "Bad element %s in %s", ptr->name, fname);
/* Searches tinfo->schemas for an entry matching the string `est`, comparing
   it with each schema's identifier and name.
   NOTE(review): the second parameter declaration, the return statements and
   the loop that the default_schema test belongs to are missing from this
   listing; presumably a schema marked as default is used as a fallback --
   confirm against the full source. */
202 static struct filter_schema *lookup_schema(struct filter_info *tinfo,
205 struct filter_schema *schema;
206 for (schema = tinfo->schemas; schema; schema = schema->next)
210 if (schema->identifier && !strcmp(schema->identifier, est))
212 if (schema->name && !strcmp(schema->name, est))
215 if (schema->default_schema)
/* Configures the filter from the configuration file named by `args`,
   rebuilding the schema list with destroy_schemas() + create_schemas().
   NOTE(review): the conditions guarding the default file name and the
   statement taken when the file name is unchanged are missing from this
   listing. */
221 static void filter_config(void *clientData, Res res, const char *args)
223 struct filter_info *tinfo = clientData;
/* Default configuration file name (the condition under which it applies
   is not visible). */
225 args = "xsltfilter.xml";
/* Tests whether the requested file is the one already loaded. */
226 if (tinfo->fname && !strcmp(args, tinfo->fname))
228 destroy_schemas(tinfo);
229 create_schemas(tinfo, args);
/* Releases everything owned by the filter instance: the schema list and
   configuration document, the text reader and the ODR stream.
   NOTE(review): any guard around xmlFreeTextReader() and the freeing of
   tinfo itself are missing from this listing. */
232 static void filter_destroy(void *clientData)
234 struct filter_info *tinfo = clientData;
235 destroy_schemas(tinfo);
237 xmlFreeTextReader(tinfo->reader);
238 odr_destroy(tinfo->odr);
/* libxml2 read callback for extraction: `context` is the recExtractCtrl,
   and the read is forwarded to its readf handler on p->fh.  Returns
   whatever readf returns. */
242 static int ioread_ex(void *context, char *buffer, int len)
244 struct recExtractCtrl *p = context;
245 return (*p->readf)(p->fh, buffer, len);
/* libxml2 close callback paired with ioread_ex().
   NOTE(review): the body is missing from this listing. */
248 static int ioclose_ex(void *context)
/* Walks `ptr` and its following siblings, recursing into children first,
   and passes the content of text nodes to ctrl->tokenAdd through `recWord`.
   NOTE(review): the statement executed for non-text nodes is missing from
   this listing; presumably they are skipped. */
253 static void index_field(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
254 xmlNodePtr ptr, RecWord *recWord)
256 for(; ptr; ptr = ptr->next)
258 index_field(tinfo, ctrl, ptr->children, recWord);
259 if (ptr->type != XML_TEXT_NODE)
/* The term buffer points directly at the node content; no copy is made. */
261 recWord->term_buf = ptr->content;
262 recWord->term_len = strlen(ptr->content);
263 (*ctrl->tokenAdd)(recWord);
/* Walks `ptr` and its following siblings, recursing into children first.
   For "index" elements in the zebra_index_ns namespace it reads the "field"
   attribute, sets it as recWord->attrStr and indexes the element's text via
   index_field().
   NOTE(review): the declaration of field_str, the statement taken for
   non-matching nodes and any guard before the indexing call are missing
   from this listing. */
267 static void index_node(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
268 xmlNodePtr ptr, RecWord *recWord)
270 for(; ptr; ptr = ptr->next)
272 index_node(tinfo, ctrl, ptr->children, recWord);
/* Only namespaced elements whose namespace href equals zebra_index_ns are
   processed further. */
273 if (ptr->type != XML_ELEMENT_NODE || !ptr->ns ||
274 strcmp(ptr->ns->href, zebra_index_ns))
276 if (!strcmp(ptr->name, "index"))
279 const char *xpath_str = 0;
280 struct _xmlAttr *attr;
281 for (attr = ptr->properties; attr; attr = attr->next)
283 if (!strcmp(attr->name, "field")
284 && attr->children && attr->children->type == XML_TEXT_NODE)
285 field_str = attr->children->content;
/* xpath_str is captured here; no use of it is visible in this listing. */
286 if (!strcmp(attr->name, "xpath")
287 && attr->children && attr->children->type == XML_TEXT_NODE)
288 xpath_str = attr->children->content;
292 recWord->attrStr = field_str;
293 index_field(tinfo, ctrl, ptr->children, recWord);
/* Indexes and stores one XML document.  When the indexing schema has a
   compiled stylesheet, it is applied and the result is fed to index_node();
   the untransformed document is then serialised and handed to
   p->setStoreData.  Returns RECCTRL_EXTRACT_OK.
   NOTE(review): the last parameter, local declarations, the arguments to
   xsltApplyStylesheet() and all cleanup calls are missing from this
   listing. */
299 static int extract_doc(struct filter_info *tinfo, struct recExtractCtrl *p,
303 const char *params[10];
/* The indexing stylesheet is the schema keyed by the index namespace. */
307 struct filter_schema *schema = lookup_schema(tinfo, ZEBRA_INDEX_NS);
310 set_param_str(params, "schema", ZEBRA_INDEX_NS, tinfo->odr);
312 (*p->init)(p, &recWord);
313 recWord.reg_type = 'w';
315 if (schema && schema->stylesheet_xsp)
318 xsltApplyStylesheet(schema->stylesheet_xsp,
/* With flagShowRecords set, the transformed document is echoed to stdout. */
320 if (p->flagShowRecords)
322 xmlDocDumpMemory(resDoc, &buf_out, &len_out);
323 fwrite(buf_out, len_out, 1, stdout);
326 index_node(tinfo, p, xmlDocGetRootElement(resDoc), &recWord);
/* Store the original document (not the transformed one). */
329 xmlDocDumpMemory(doc, &buf_out, &len_out);
330 if (p->flagShowRecords)
331 fwrite(buf_out, len_out, 1, stdout);
332 (*p->setStoreData)(p, buf_out, len_out);
336 return RECCTRL_EXTRACT_OK;
/* Extracts the next record from a stream that holds several records.  An
   xmlTextReader is advanced until a node at the configured split depth is
   reached; that node is copied into a fresh document and passed to
   extract_doc().  Returns RECCTRL_EXTRACT_EOF when the reader is exhausted
   and RECCTRL_EXTRACT_ERROR_GENERIC on failure.
   NOTE(review): local declarations, the conditions around reader creation,
   the loop header and part of the split condition are missing from this
   listing. */
339 static int extract_split(struct filter_info *tinfo, struct recExtractCtrl *p)
/* Replace any previous reader with one reading through ioread_ex. */
346 xmlFreeTextReader(tinfo->reader);
347 tinfo->reader = xmlReaderForIO(ioread_ex, ioclose_ex,
354 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* NOTE(review): atoi() gives no error indication for non-numeric text. */
356 if (tinfo->split_level)
357 split_depth = atoi(tinfo->split_level);
358 ret = xmlTextReaderRead(tinfo->reader);
360 int type = xmlTextReaderNodeType(tinfo->reader);
361 int depth = xmlTextReaderDepth(tinfo->reader);
/* Split on any node when the depth is 0, otherwise on elements at exactly
   the configured depth. */
362 if (split_depth == 0 ||
364 type == XML_READER_TYPE_ELEMENT && split_depth == depth))
/* Expand the current node's subtree and copy it recursively (second
   argument 1) into a document of its own. */
366 xmlNodePtr ptr = xmlTextReaderExpand(tinfo->reader);
367 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
368 xmlDocPtr doc = xmlNewDoc("1.0");
370 xmlDocSetRootElement(doc, ptr2);
372 return extract_doc(tinfo, p, doc);
374 ret = xmlTextReaderRead(tinfo->reader);
/* Reader exhausted: release it and report end of input. */
376 xmlFreeTextReader(tinfo->reader);
378 return RECCTRL_EXTRACT_EOF;
/* Treats the whole input stream as a single record: on the first call for a
   stream the document is parsed with xmlReadIO() and passed to
   extract_doc(); otherwise RECCTRL_EXTRACT_EOF is returned.  A parse failure
   yields RECCTRL_EXTRACT_ERROR_GENERIC.
   NOTE(review): the remaining xmlReadIO() arguments and the failure test
   are missing from this listing. */
381 static int extract_full(struct filter_info *tinfo, struct recExtractCtrl *p)
383 if (p->first_record) /* only one record per stream */
385 xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, p /* I/O handler */,
391 return RECCTRL_EXTRACT_ERROR_GENERIC;
393 return extract_doc(tinfo, p, doc);
396 return RECCTRL_EXTRACT_EOF;
/* Extraction entry point: resets the filter's ODR memory, then dispatches to
   extract_full() when no split level or path is configured and to
   extract_split() otherwise. */
399 static int filter_extract(void *clientData, struct recExtractCtrl *p)
401 struct filter_info *tinfo = clientData;
/* Drop parameter strings allocated from tinfo->odr by earlier calls. */
403 odr_reset(tinfo->odr);
405 if (tinfo->split_level == 0 && tinfo->split_path == 0)
406 return extract_full(tinfo, p);
409 return extract_split(tinfo, p);
/* libxml2 read callback for retrieval: `context` is the recRetrieveCtrl,
   and the read is forwarded to its readf handler on p->fh.  Returns
   whatever readf returns. */
413 static int ioread_ret(void *context, char *buffer, int len)
415 struct recRetrieveCtrl *p = context;
416 return (*p->readf)(p->fh, buffer, len);
/* libxml2 close callback paired with ioread_ret().
   NOTE(review): the body is missing from this listing. */
419 static int ioclose_ret(void *context)
/* Builds a textual snippet for the record from the snippet window computed
   over p->doc_snippet and p->hit_snippet, and copies it into memory owned by
   p->odr.  Both a quoted plain-text form and a <snippet> XML form appear in
   the code.
   NOTE(review): the conditions choosing between the two forms, several
   printf arguments and the return statement are missing from this
   listing. */
425 static const char *snippet_doc(struct recRetrieveCtrl *p)
427 const char *xml_doc_str;
429 WRBUF wrbuf = wrbuf_alloc();
/* Window size 10 is passed to zebra_snippets_window(). */
430 zebra_snippets *res =
431 zebra_snippets_window(p->doc_snippet, p->hit_snippet, 10);
432 zebra_snippet_word *w = zebra_snippets_list(res);
435 wrbuf_printf(wrbuf, "\'");
437 wrbuf_printf(wrbuf, "<snippet>\n");
439 for (; w; w = w->next)
/* Tests for a change of w->ord between consecutive words. */
443 else if (ord != w->ord)
/* Plain-text form: matching words are marked with '*'. */
446 wrbuf_printf(wrbuf, "%s%s%s ",
449 w->match ? "*" : "");
/* XML form: one <term> element per word, with the term text XML-escaped
   by wrbuf_xmlputs(). */
451 wrbuf_printf(wrbuf, " <term %s ord='%d' seqno='%d'>",
452 (w->match ? "match='1'" : ""),
454 wrbuf_xmlputs(wrbuf, w->term);
455 wrbuf_printf(wrbuf, "</term>\n");
459 wrbuf_printf(wrbuf, "\'");
461 wrbuf_printf(wrbuf, "</snippet>\n");
/* Copy into ODR memory so the string stays valid after the WRBUF is freed
   below. */
463 xml_doc_str = odr_strdup(p->odr, wrbuf_buf(wrbuf));
465 zebra_snippets_destroy(res);
466 wrbuf_free(wrbuf, 1);
/* Retrieval entry point: selects a schema from the requested element set
   name, parses the stored record, applies the schema's stylesheet with a set
   of parameters, and returns the result as XML or SUTRS in p->rec_buf
   (memory from p->odr).  Failures are reported through p->diagnostic.
   NOTE(review): local declarations, several conditions, the return
   statements and all libxml2 cleanup calls are missing from this listing. */
470 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
472 const char *esn = ZEBRA_SCHEMA_IDENTITY_NS;
473 const char *params[10];
474 struct filter_info *tinfo = clientData;
477 struct filter_schema *schema;
/* Only simple, generic element set names are supported. */
481 if (p->comp->which != Z_RecordComp_simple
482 || p->comp->u.simple->which != Z_ElementSetNames_generic)
484 p->diagnostic = YAZ_BIB1_PRESENT_COMP_SPEC_PARAMETER_UNSUPP;
487 esn = p->comp->u.simple->u.generic;
489 schema = lookup_schema(tinfo, esn);
493 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
/* Parameters handed to the stylesheet; the strings are allocated from
   p->odr. */
498 set_param_str(params, "schema", esn, p->odr);
500 set_param_str(params, "filename", p->fname, p->odr);
502 set_param_int(params, "score", p->score, p->odr);
503 set_param_int(params, "size", p->recordSize, p->odr);
505 set_param_xml(params, "snippet", snippet_doc(p), p->odr);
/* Parse the stored record through the retrieval read callback. */
506 doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
512 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
516 if (!schema->stylesheet_xsp)
520 resDoc = xsltApplyStylesheet(schema->stylesheet_xsp,
526 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
/* XML output when the client asked for no particular syntax or for XML. */
528 else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML)
532 xmlDocDumpMemory(resDoc, &buf_out, &len_out);
534 p->output_format = VAL_TEXT_XML;
535 p->rec_len = len_out;
536 p->rec_buf = odr_malloc(p->odr, p->rec_len);
537 memcpy(p->rec_buf, buf_out, p->rec_len);
/* NOTE(review): this branch tests p->output_format while the branch above
   tests p->input_format; it looks like input_format was intended here --
   confirm before changing. */
541 else if (p->output_format == VAL_SUTRS)
545 xmlDocDumpMemory(resDoc, &buf_out, &len_out);
547 p->output_format = VAL_SUTRS;
548 p->rec_len = len_out;
549 p->rec_buf = odr_malloc(p->odr, p->rec_len);
550 memcpy(p->rec_buf, buf_out, p->rec_len);
/* Any other requested syntax is rejected. */
556 p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
/* Record type descriptor for the XSLT filter.
   NOTE(review): the initializer is missing from this listing; presumably it
   wires filter_init_xslt and the other filter_* handlers -- confirm against
   the full source. */
562 static struct recType filter_type_xslt = {
/* Record type descriptor for the splitting variant of the XSLT filter.
   NOTE(review): the initializer is missing from this listing; presumably it
   uses filter_init_xslt1 as its init handler -- confirm against the full
   source. */
572 static struct recType filter_type_xslt1 = {
/* NOTE(review): the code guarded by these two conditionals is missing from
   this listing; only the opening directives are visible. */
/* Build-time switch IDZEBRA_STATIC_XSLT. */
583 #ifdef IDZEBRA_STATIC_XSLT
/* libxml2 feature macro: set when the xmlTextReader API is available. */
591 #ifdef LIBXML_READER_ENABLED