2 * Copyright (C) 1994-2002, Index Data
5 * $Id: recgrs.c,v 1.58 2002-08-02 10:07:48 adam Exp $
10 #include <sys/types.h>
21 #define GRS_MAX_WORD 512
27 struct grs_handler *next;
31 struct grs_handler *handlers;
/*
 * read_grs_type: find the handler in h whose type name matches the part of
 * `type` before the first dot and let it read one record into *root.
 * The text after the dot is handed to the handler through p->type.
 * NOTE(review): this listing omits several original lines (braces, else
 * branches, return statements); the comments below describe only the
 * statements that are visible.
 */
34 static int read_grs_type (struct grs_handlers *h,
35 struct grs_read_info *p, const char *type,
38 struct grs_handler *gh = h->handlers;
/* cp marks the end of the handler-name part: the first dot in type. */
39 const char *cp = strchr (type, '.');
/* No dot at all, or a dot as the very first character: the handler name
 * is then taken to be the whole string, so cp is moved to its end. */
41 if (cp == NULL || cp == type)
43 cp = strlen(type) + type;
/* Copy the sub-type (text after the dot) into p->type; presumably this is
 * the else branch of the test above.
 * NOTE(review): strcpy is unbounded - confirm p->type can hold the longest
 * sub-type a caller may pass. */
47 strcpy (p->type, cp+1);
48 for (gh = h->handlers; gh; gh = gh->next)
/* Only the first cp-type bytes are compared, i.e. the handler-name part.
 * NOTE(review): this is a prefix comparison, so a handler whose name merely
 * begins with that prefix also matches - confirm this is intended. */
50 if (!memcmp (type, gh->type->type, cp-type))
/* Run the handler init callback and keep its result as the handler client
 * data (the guard deciding when this runs is not visible here). */
55 gh->clientData = (*gh->type->init)();
/* Lend the client data to the reader, call it, then store back whatever
 * the reader left in p->clientData. */
57 p->clientData = gh->clientData;
58 *root = (gh->type->read)(p);
59 gh->clientData = p->clientData;
/*
 * grs_add_handler: allocate a new handler node for record type t and link
 * it at the head of the list kept in h.
 * NOTE(review): the statements that store t and publish the new head are
 * not visible in this listing.
 */
66 static void grs_add_handler (struct grs_handlers *h, RecTypeGrs t)
68 struct grs_handler *gh = (struct grs_handler *) xmalloc (sizeof(*gh));
/* The new node points at the current head, so it becomes the first entry. */
69 gh->next = h->handlers;
/*
 * grs_init: allocate the handler container and register the sgml, regx,
 * tcl, marc and xml record handlers in it.
 * NOTE(review): gaps in the original numbering between the registrations
 * suggest preprocessor guards around some of them; they are not visible
 * here, and neither is the returned value.
 */
76 static void *grs_init(RecType recType)
78 struct grs_handlers *h = (struct grs_handlers *) xmalloc (sizeof(*h));
/* Each call prepends, so lookup order is the reverse of this order. */
81 grs_add_handler (h, recTypeGrs_sgml);
82 grs_add_handler (h, recTypeGrs_regx);
84 grs_add_handler (h, recTypeGrs_tcl);
86 grs_add_handler (h, recTypeGrs_marc);
88 grs_add_handler (h, recTypeGrs_xml);
/*
 * grs_destroy: tear down the handler container passed as clientData.
 * NOTE(review): the loop over the list and the release of the nodes are
 * not visible in this listing; gh_next is presumably used to step past a
 * node before it is released.
 */
93 static void grs_destroy(void *clientData)
95 struct grs_handlers *h = (struct grs_handlers *) clientData;
96 struct grs_handler *gh = h->handlers, *gh_next;
/* Give the handler type a chance to release its own client data. */
101 (*gh->type->destroy)(gh->clientData);
109 1 start element (tag)
111 3 start attr (and attr-exact)
/*
 * index_xpath: produce index words in the VAL_IDXPATH attribute set for
 * node n: its data, its tag path, and its attributes. The words are passed
 * to the p->tokenAdd callback through wrd.
 * level only controls the indentation of the optional debug output.
 * NOTE(review): this listing omits many original lines (declarations of i,
 * flen, nn, xp and comb, the switch on node kind, how `use` is consumed);
 * the comments below describe only the visible statements.
 */
118 static void index_xpath (data1_node *n, struct recExtractCtrl *p,
119 int level, RecWord *wrd, int use)
/* Fixed-size scratch buffer for the path built from the tag names. */
122 char tag_path_full[1024];
/* Data node (presumably): the word is the raw node data. */
130 wrd->string = n->u.data.data;
131 wrd->length = n->u.data.len;
/* NOTE(review): this statement ends in a comma, not a semicolon, so it is
 * joined to the following expression by the comma operator; that
 * expression is not visible here. */
132 wrd->attrSet = VAL_IDXPATH,
/* Debug output: show at most the first 8 bytes of the data. */
134 if (p->flagShowRecords)
136 printf("%*s data=", (level + 1) * 4, "");
137 for (i = 0; i<wrd->length && i < 8; i++)
138 fputc (wrd->string[i], stdout);
/* Walk from n towards the root; every tag name is appended to
 * tag_path_full followed by a slash, so the path is built leaf first. */
147 for (nn = n; nn; nn = nn->parent)
149 if (nn->which == DATA1N_tag)
151 size_t tlen = strlen(nn->u.tag.tag);
/* Guard against overflowing the buffer; two bytes are kept in reserve.
 * The action taken when the guard fires is not visible here. */
152 if (tlen + flen > (sizeof(tag_path_full)-2))
154 memcpy (tag_path_full + flen, nn->u.tag.tag, tlen);
156 tag_path_full[flen++] = '/';
158 else if (nn->which == DATA1N_root)
/* The accumulated path becomes the word. */
162 wrd->string = tag_path_full;
164 wrd->attrSet = VAL_IDXPATH;
/* Debug output: show at most the first 40 bytes of the path. */
166 if (p->flagShowRecords)
168 printf("%*s tag=", (level + 1) * 4, "");
169 for (i = 0; i<wrd->length && i < 40; i++)
170 fputc (wrd->string[i], stdout);
178 (*p->tokenAdd)(wrd); /* index element pag (AKA tag path) */
/* First pass over the attributes of n. */
181 for (xp = n->u.tag.attributes; xp; xp = xp->next)
184 /* attribute (no value) */
187 wrd->string = xp->name;
188 wrd->length = strlen(xp->name);
/* Only combine name and value when both fit in comb with two bytes to
 * spare; this bounds the strcpy and strcat below. */
194 strlen(xp->name) + strlen(xp->value) < sizeof(comb)-2)
196 /* attribute value exact */
197 strcpy (comb, xp->name);
199 strcat (comb, xp->value);
204 wrd->length = strlen(comb);
/* Second pass over the attributes: one path word per attribute. */
210 for (xp = n->u.tag.attributes; xp; xp = xp->next)
212 char attr_tag_path_full[1024];
/* Build the attribute path as an at-sign, the attribute name, a slash and
 * the first flen bytes of the tag path.
 * NOTE(review): sprintf into a 1024-byte buffer with no visible bound on
 * the length of xp->name; also the precision argument of the star form
 * must be an int - confirm the declared type of flen. */
214 sprintf (attr_tag_path_full, "@%s/%.*s",
215 xp->name, flen, tag_path_full);
219 wrd->string = attr_tag_path_full;
220 wrd->length = strlen(attr_tag_path_full);
/* The attribute value itself as a word. */
225 wrd->string = xp->value;
226 wrd->length = strlen(xp->value);
/* The attribute path once more (the settings that distinguish this word
 * from the one above are not visible here). */
232 wrd->string = attr_tag_path_full;
233 wrd->length = strlen(attr_tag_path_full);
/*
 * index_termlist: index node n according to the term lists of the nearest
 * enclosing tag (starting at par) that has an element definition.
 * For every term list entry the word source is chosen (node data, tag name
 * or a named attribute) and wrd is filled with register type, attribute
 * set and use value.
 * level only controls the indentation of the optional debug output.
 * NOTE(review): this listing omits many original lines (early returns, the
 * declarations of i and xattr, the call that emits the word); the comments
 * below describe only the visible statements.
 */
241 static void index_termlist (data1_node *par, data1_node *n,
242 struct recExtractCtrl *p, int level, RecWord *wrd)
244 data1_termlist *tlist = 0;
245 data1_datatype dtype = DATA1K_string;
247 * cycle up towards the root until we find a tag with an att..
248 * this has the effect of indexing locally defined tags with
249 * the attribute of their ancestor in the record.
252 while (!par->u.tag.element)
253 if (!par->parent || !(par=get_parent_tag(p->dh, par->parent)))
/* Nothing to do when no such tag exists or its element has no term lists
 * (the statement taken in that case is not visible here). */
255 if (!par || !(tlist = par->u.tag.element->termlists))
/* Use the data type of the element tag when there is one; otherwise the
 * DATA1K_string default set above stays in effect. */
257 if (par->u.tag.element->tag)
258 dtype = par->u.tag.element->tag->kind;
260 for (; tlist; tlist = tlist->next)
263 /* consider source */
/* Source is the node data: applies to data nodes only. */
266 if (!strcmp (tlist->source, "data") && n->which == DATA1N_data)
268 wrd->string = n->u.data.data;
269 wrd->length = n->u.data.len;
/* Source is the tag name: applies to tag nodes only. */
271 else if (!strcmp (tlist->source, "tag") && n->which == DATA1N_tag)
273 wrd->string = n->u.tag.tag;
274 wrd->length = strlen(n->u.tag.tag);
/* Source is a named attribute: the name inside the parentheses is read
 * into xattr, limited to 511 characters by the conversion width. */
276 else if (sscanf (tlist->source, "attr(%511[^)])", xattr) == 1 &&
277 n->which == DATA1N_tag)
/* NOTE(review): this local p shadows the function parameter p for the
 * rest of its scope. */
279 data1_xattr *p = n->u.tag.attributes;
/* Skip attributes until one with the requested name is found. */
280 while (p && strcmp (p->name, xattr))
284 wrd->string = p->value;
285 wrd->length = strlen(p->value);
/* Debug output describing the term list entry and the first 8 bytes of
 * the word. */
290 if (p->flagShowRecords)
293 printf("%*sIdx: [%s]", (level + 1) * 4, "",
295 printf("%s:%s [%d] %s",
296 tlist->att->parent->name,
297 tlist->att->name, tlist->att->value,
300 for (i = 0; i<wrd->length && i < 8; i++)
301 fputc (wrd->string[i], stdout);
305 fputc ('\n', stdout);
/* Register type is the first character of the structure string; attribute
 * set and use value come from the attribute definition. */
309 wrd->reg_type = *tlist->structure;
310 wrd->attrSet = (int) (tlist->att->parent->reference);
311 wrd->attrUse = tlist->att->locals->local;
/*
 * dumpkeys: walk the sibling list starting at n, index every tag and data
 * node, and recurse into children with level + 1.
 * When p->flagShowRecords is set a description of each node is printed,
 * indented by four columns per level.
 * NOTE(review): this listing omits many original lines (the last parameter,
 * braces, else keywords, return statements); the comments below describe
 * only the visible statements.
 */
318 static int dumpkeys(data1_node *n, struct recExtractCtrl *p, int level,
321 for (; n; n = n->next)
323 if (p->flagShowRecords) /* display element description to user */
325 if (n->which == DATA1N_root)
327 printf("%*s", level * 4, "");
328 printf("Record type: '%s'\n", n->u.root.type);
330 else if (n->which == DATA1N_tag)
334 printf("%*s", level * 4, "");
/* A tag without an element definition is reported as a local tag. */
335 if (!(e = n->u.tag.element))
336 printf("Local tag: '%s'\n", n->u.tag.tag);
339 printf("Elm: '%s' ", e->name);
342 data1_tag *t = e->tag;
344 printf("TagNam: '%s' ", t->names->name);
347 printf("%s[%d],", t->tagset->name, t->tagset->type);
/* The tag value is printed as a number or as a string by kind. */
350 if (t->which == DATA1T_numeric)
351 printf("%d)", t->value.numeric);
353 printf("'%s')", t->value.string);
/* Tag node: index it by its own term lists before visiting children. */
360 if (n->which == DATA1N_tag)
362 index_termlist (n, n, p, level, wrd);
363 /* index start tag */
/* Path indexing is done only when the record root has no abstract syntax. */
364 if (!n->root->u.root.absyn)
365 index_xpath (n, p, level, wrd, 1);
/* Recurse into the children; a negative result is tested here (the
 * statement taken on failure is not visible). */
369 if (dumpkeys(n->child, p, level + 1, wrd) < 0)
/* Data node: index it by the term lists of its parent tag. */
373 if (n->which == DATA1N_data)
375 data1_node *par = get_parent_tag(p->dh, n);
377 if (p->flagShowRecords)
379 printf("%*s", level * 4, "");
/* Long data is abbreviated to its first 24 and last 6 bytes. */
381 if (n->u.data.len > 32)
382 printf("'%.24s ... %.6s'\n", n->u.data.data,
383 n->u.data.data + n->u.data.len-6);
384 else if (n->u.data.len > 0)
385 printf("'%.*s'\n", n->u.data.len, n->u.data.data);
391 index_termlist (par, n, p, level, wrd);
392 if (!n->root->u.root.absyn)
393 index_xpath (n, p, level, wrd, 1016);
/* Tag node again, after its children: second path-index call with use 2
 * (presumably the end-of-element marker - confirm). */
397 if (n->which == DATA1N_tag)
400 if (!n->root->u.root.absyn)
401 index_xpath (n, p, level, wrd, 2);
/* Debug output: separator line after a complete record. */
405 if (p->flagShowRecords && n->which == DATA1N_root)
407 printf("%*s-------------\n\n", level * 4, "");
/*
 * grs_extract_tree: index an already built data1 tree n.
 * The schema of the record is reported through p->schemaAdd, then the tree
 * is indexed with dumpkeys and its result is returned.
 * NOTE(review): the declarations of oe and wrd and any guard around the
 * absyn access are not visible in this listing.
 */
413 int grs_extract_tree(struct recExtractCtrl *p, data1_node *n)
416 int oidtmp[OID_SIZE];
/* Describe the schema as a Z39.50 schema-class OID entry. */
419 oe.proto = PROTO_Z3950;
420 oe.oclass = CLASS_SCHEMA;
/* NOTE(review): dereferences n->u.root.absyn; confirm that a missing
 * abstract syntax is excluded on a line not shown here. */
423 oe.value = n->u.root.absyn->reference;
/* Report the schema only when the entry maps to an OID. */
425 if ((oid_ent_to_oid (&oe, oidtmp)))
426 (*p->schemaAdd)(p, oidtmp);
430 return dumpkeys(n, p, 0, &wrd);
/*
 * grs_extract_sub: read one record through the handler selected by
 * p->subType, convert it to UTF-8 and index it.
 * Returns one of the RECCTRL_EXTRACT_ codes seen below (ERROR, EOF, OK).
 * NOTE(review): this listing omits several original lines (the mem
 * parameter, declarations of n, oe and wrd, conditions and any
 * preprocessor guards); the comments describe only the visible statements.
 */
433 static int grs_extract_sub(struct grs_handlers *h, struct recExtractCtrl *p,
437 struct grs_read_info gri;
439 int oidtmp[OID_SIZE];
/* Hand the stream callbacks and start offset of the caller to the reader. */
442 gri.readf = p->readf;
443 gri.seekf = p->seekf;
444 gri.tellf = p->tellf;
447 gri.offset = p->offset;
/* A non-zero result from read_grs_type is treated as an error. */
451 if (read_grs_type (h, &gri, p->subType, &n))
452 return RECCTRL_EXTRACT_ERROR;
/* End of input (the condition guarding this return is not visible). */
454 return RECCTRL_EXTRACT_EOF;
455 oe.proto = PROTO_Z3950;
456 oe.oclass = CLASS_SCHEMA;
/* NOTE(review): no data1_free_tree call is visible before this return;
 * confirm whether this path is compiled in and whether it leaks n. */
458 if (!n->u.root.absyn)
459 return RECCTRL_EXTRACT_ERROR;
/* Report the schema of the record when it maps to an OID. */
463 oe.value = n->u.root.absyn->reference;
464 if ((oid_ent_to_oid (&oe, oidtmp)))
465 (*p->schemaAdd)(p, oidtmp);
468 /* ensure our data1 tree is UTF-8 */
469 data1_iconv (p->dh, mem, n, "UTF-8", data1_get_encoding(p->dh, n));
/* Dump the tree to stdout (presumably debug-only; guard not visible). */
472 data1_pr_tree (p->dh, n, stdout);
/* Index the tree; the tree is released on both the failure and the
 * success path. */
476 if (dumpkeys(n, p, 0, &wrd) < 0)
478 data1_free_tree(p->dh, n);
479 return RECCTRL_EXTRACT_ERROR;
481 data1_free_tree(p->dh, n);
482 return RECCTRL_EXTRACT_OK;
/*
 * grs_extract: extraction entry point. Creates a fresh NMEM handle, runs
 * grs_extract_sub with the handler container passed as clientData and
 * keeps its result in ret.
 * NOTE(review): the release of mem and the return statement are not
 * visible in this listing.
 */
485 static int grs_extract(void *clientData, struct recExtractCtrl *p)
488 NMEM mem = nmem_create ();
489 struct grs_handlers *h = (struct grs_handlers *) clientData;
491 ret = grs_extract_sub(h, p, mem);
497 * Return: -1: Nothing done. 0: Ok. >0: Bib-1 diagnostic.
/*
 * process_comp: apply the record composition c (element set name or
 * Espec-1) to the tree n.
 * Visible return values: 26 when a simple element set name is not in the
 * generic form, 25 for an unknown element set name or an unknown external
 * spec, otherwise the result of data1_doespec1.
 * NOTE(review): this listing omits many original lines (the switch heads,
 * declarations of espec and p, break statements and the final returns);
 * the comments describe only the visible statements.
 */
499 static int process_comp(data1_handle dh, data1_node *n, Z_RecordComposition *c)
501 data1_esetname *eset;
/* Simple composition: a plain element set name. */
507 case Z_RecordComp_simple:
508 if (c->u.simple->which != Z_ElementSetNames_generic)
509 return 26; /* only generic form supported. Fix this later */
/* Look the name up in the abstract syntax of the record. */
510 if (!(eset = data1_getesetbyname(dh, n->u.root.absyn,
511 c->u.simple->u.generic)))
513 logf(LOG_LOG, "Unknown esetname '%s'", c->u.simple->u.generic);
514 return 25; /* invalid esetname */
516 logf(LOG_DEBUG, "Esetname '%s' in simple compspec",
517 c->u.simple->u.generic);
/* Complex composition: only the generic part is examined. */
520 case Z_RecordComp_complex:
521 if (c->u.complex->generic)
523 /* insert check for schema */
524 if ((p = c->u.complex->generic->elementSpec))
/* Element spec given as an element set name. */
528 case Z_ElementSpec_elementSetName:
530 data1_getesetbyname(dh, n->u.root.absyn,
531 p->u.elementSetName)))
533 logf(LOG_LOG, "Unknown esetname '%s'",
534 p->u.elementSetName);
535 return 25; /* invalid esetname */
537 logf(LOG_DEBUG, "Esetname '%s' in complex compspec",
538 p->u.elementSetName);
/* Element spec given as an external; only Espec-1 is accepted. */
541 case Z_ElementSpec_externalSpec:
542 if (p->u.externalSpec->which == Z_External_espec1)
544 logf(LOG_DEBUG, "Got Espec-1");
545 espec = p->u.externalSpec-> u.espec1;
549 logf(LOG_LOG, "Unknown external espec.");
550 return 25; /* bad. what is proper diagnostic? */
/* An Espec-1 was selected: let data1_doespec1 apply it to the tree. */
561 logf (LOG_DEBUG, "Element: Espec-1 match");
562 return data1_doespec1(dh, n, espec);
/* No element selection applies (the value returned is not visible here). */
566 logf (LOG_DEBUG, "Element: all match");
/*
 * add_idzebra_info: decorate the top element with the idzebra namespace
 * declaration and add idzebra:size, idzebra:score, idzebra:localnumber and
 * idzebra:filename child elements.
 * NOTE(review): the remaining parameters, the trailing arguments of the
 * calls, the terminator of idzebra_ns and any conditions are not visible
 * in this listing.
 */
571 static void add_idzebra_info (struct recRetrieveCtrl *p, data1_node *top,
/* Attribute list passed to data1_tag_add_attr: name at index 0, value at
 * index 1. */
574 const char *idzebra_ns[7];
576 idzebra_ns[0] = "xmlns:idzebra";
577 idzebra_ns[1] = "http://www.indexdata.dk/zebra/";
580 data1_tag_add_attr (p->dh, mem, top, idzebra_ns);
582 data1_mk_tag_data_int (p->dh, top, "idzebra:size", p->recordSize,
585 data1_mk_tag_data_int (p->dh, top, "idzebra:score",
588 data1_mk_tag_data_int (p->dh, top, "idzebra:localnumber", p->localno,
591 data1_mk_tag_data_text(p->dh, top, "idzebra:filename",
/*
 * grs_retrieve: retrieval entry point. Reads the record through the handler
 * selected by p->subType, converts it to UTF-8, adds size, rank and system
 * number elements, applies schema and syntax mapping and the element
 * specification, and finally formats the tree into p->rec_buf in the
 * syntax chosen from p->input_format.
 * NOTE(review): this listing omits many original lines (declarations of
 * mem, dnew, tagname and map, the case labels of the output switch, error
 * returns, braces); comments describe only the visible statements. No
 * comments are added in the schema-mapping to GRS-1 stretch because a
 * comment opener there has no visible closer in this listing.
 */
595 static int grs_retrieve(void *clientData, struct recRetrieveCtrl *p)
597 data1_node *node = 0, *onode = 0, *top;
600 int res, selected = 0;
602 struct grs_read_info gri;
604 struct grs_handlers *h = (struct grs_handlers *) clientData;
605 int requested_schema = VAL_NONE;
606 data1_marctab *marctab;
/* Hand the stream callbacks of the caller to the reader. */
610 gri.readf = p->readf;
611 gri.seekf = p->seekf;
612 gri.tellf = p->tellf;
619 logf (LOG_DEBUG, "grs_retrieve");
/* Read the record (the failure handling is not visible here). */
620 if (read_grs_type (h, &gri, p->subType, &node))
632 /* ensure our data1 tree is UTF-8 */
633 data1_iconv (p->dh, mem, node, "UTF-8", data1_get_encoding(p->dh, node));
636 data1_pr_tree (p->dh, node, stdout);
638 top = data1_get_root_tag (p->dh, node);
/* Add a size element under top holding the record size in decimal; the
 * text is written into the local buffer of the new node. */
640 logf (LOG_DEBUG, "grs_retrieve: size");
641 if ((dnew = data1_mk_tag_data_wd(p->dh, top, "size", mem)))
643 dnew->u.data.what = DATA1I_text;
644 dnew->u.data.data = dnew->lbuf;
645 sprintf(dnew->u.data.data, "%d", p->recordSize);
646 dnew->u.data.len = strlen(dnew->u.data.data);
/* Rank element: the tag name comes from the tagrank resource (default
 * rank); it is skipped when the name is 0 or the score is negative. */
649 tagname = res_get_def(p->res, "tagrank", "rank");
650 if (strcmp(tagname, "0") && p->score >= 0 &&
651 (dnew = data1_mk_tag_data_wd(p->dh, top, tagname, mem)))
653 logf (LOG_DEBUG, "grs_retrieve: %s", tagname);
654 dnew->u.data.what = DATA1I_num;
655 dnew->u.data.data = dnew->lbuf;
656 sprintf(dnew->u.data.data, "%d", p->score);
657 dnew->u.data.len = strlen(dnew->u.data.data);
/* System number element: the tag name comes from the tagsysno resource
 * (default localControlNumber); it is skipped when the name is 0 or the
 * local number is not positive. */
660 tagname = res_get_def(p->res, "tagsysno", "localControlNumber");
661 if (strcmp(tagname, "0") && p->localno > 0 &&
662 (dnew = data1_mk_tag_data_wd(p->dh, top, tagname, mem)))
664 logf (LOG_DEBUG, "grs_retrieve: %s", tagname);
665 dnew->u.data.what = DATA1I_text;
666 dnew->u.data.data = dnew->lbuf;
668 sprintf(dnew->u.data.data, "%d", p->localno);
669 dnew->u.data.len = strlen(dnew->u.data.data);
672 data1_pr_tree (p->dh, node, stdout);
/* A complex composition with a generic schema OID selects the requested
 * schema; otherwise requested_schema stays VAL_NONE.
 * NOTE(review): oe is dereferenced below; a null check may sit on a line
 * not shown here - confirm. */
674 if (p->comp && p->comp->which == Z_RecordComp_complex &&
675 p->comp->u.complex->generic &&
676 p->comp->u.complex->generic->schema)
678 oident *oe = oid_getentbyoid (p->comp->u.complex->generic->schema);
680 requested_schema = oe->value;
683 /* If schema has been specified, map if possible, then check that
684 * we got the right one
686 if (requested_schema != VAL_NONE)
688 logf (LOG_DEBUG, "grs_retrieve: schema mapping");
689 for (map = node->u.root.absyn->maptabs; map; map = map->next)
691 if (map->target_absyn_ref == requested_schema)
694 if (!(node = data1_map_record(p->dh, onode, map, mem)))
703 if (node->u.root.absyn &&
704 requested_schema != node->u.root.absyn->reference)
712 * Does the requested format match a known syntax-mapping? (this reflects
713 * the overlap of schema and formatting which is inherent in the MARC
716 yaz_log (LOG_DEBUG, "grs_retrieve: syntax mapping");
717 if (node->u.root.absyn)
718 for (map = node->u.root.absyn->maptabs; map; map = map->next)
720 if (map->target_absyn_ref == p->input_format)
723 if (!(node = data1_map_record(p->dh, onode, map, mem)))
732 yaz_log (LOG_DEBUG, "grs_retrieve: schemaIdentifier");
733 if (node->u.root.absyn &&
734 node->u.root.absyn->reference != VAL_NONE &&
735 p->input_format == VAL_GRS1)
739 int oidtmp[OID_SIZE];
741 oe.proto = PROTO_Z3950;
742 oe.oclass = CLASS_SCHEMA;
743 oe.value = node->u.root.absyn->reference;
745 if ((oid = oid_ent_to_oid (&oe, oidtmp)))
748 data1_handle dh = p->dh;
752 for (ii = oid; *ii >= 0; ii++)
756 sprintf(p, "%d", *ii);
761 if ((dnew = data1_mk_tag_data_wd(dh, node,
762 "schemaIdentifier", mem)))
764 dnew->u.data.what = DATA1I_oid;
765 dnew->u.data.data = (char *) nmem_malloc(mem, p - tmp);
766 memcpy(dnew->u.data.data, tmp, p - tmp);
767 dnew->u.data.len = p - tmp;
772 logf (LOG_DEBUG, "grs_retrieve: element spec");
773 if (p->comp && (res = process_comp(p->dh, node, p->comp)) > 0)
777 data1_free_tree(p->dh, onode);
778 data1_free_tree(p->dh, node);
782 else if (p->comp && !res)
786 data1_pr_tree (p->dh, node, stdout);
788 logf (LOG_DEBUG, "grs_retrieve: transfer syntax mapping");
789 switch (p->output_format = (p->input_format != VAL_NONE ?
790 p->input_format : VAL_SUTRS))
793 add_idzebra_info (p, top, mem);
796 data1_iconv (p->dh, mem, node, p->encoding, "UTF-8");
798 if (!(p->rec_buf = data1_nodetoidsgml(p->dh, node, selected,
803 char *new_buf = (char*) odr_malloc (p->odr, p->rec_len);
804 memcpy (new_buf, p->rec_buf, p->rec_len);
805 p->rec_buf = new_buf;
810 if (!(p->rec_buf = data1_nodetogr(p->dh, node, selected,
812 p->diagnostic = 238; /* not available in requested syntax */
/* rec_len is set to (size_t)(-1) for this output form; presumably this
 * tells the caller that rec_buf is a structure, not bytes - confirm. */
814 p->rec_len = (size_t) (-1);
/* Explain output; rec_len is again set to (size_t)(-1). */
817 if (!(p->rec_buf = data1_nodetoexplain(p->dh, node, selected,
821 p->rec_len = (size_t) (-1);
/* Summary output; rec_len is again set to (size_t)(-1). */
824 if (!(p->rec_buf = data1_nodetosummary(p->dh, node, selected,
828 p->rec_len = (size_t) (-1);
/* Plain buffer output: convert from UTF-8 to the requested encoding,
 * format, then copy the result into memory obtained from p->odr. */
832 data1_iconv (p->dh, mem, node, p->encoding, "UTF-8");
833 if (!(p->rec_buf = data1_nodetobuf(p->dh, node, selected,
838 char *new_buf = (char*) odr_malloc (p->odr, p->rec_len);
839 memcpy (new_buf, p->rec_buf, p->rec_len);
840 p->rec_buf = new_buf;
/* SOIF output, copied into memory obtained from p->odr in the same way. */
844 if (!(p->rec_buf = data1_nodetosoif(p->dh, node, selected,
849 char *new_buf = (char*) odr_malloc (p->odr, p->rec_len);
850 memcpy (new_buf, p->rec_buf, p->rec_len);
851 p->rec_buf = new_buf;
/* Remaining formats need an abstract syntax on the record root. */
855 if (!node->u.root.absyn)
/* Search the MARC tables of the abstract syntax for one whose reference
 * equals the requested format. */
860 for (marctab = node->u.root.absyn->marc; marctab;
861 marctab = marctab->next)
862 if (marctab->reference == p->input_format)
/* MARC output: convert encoding, format with the table found, then copy
 * the result into memory obtained from p->odr. */
870 data1_iconv (p->dh, mem, node, p->encoding, "UTF-8");
871 if (!(p->rec_buf = data1_nodetomarc(p->dh, marctab, node,
872 selected, &p->rec_len)))
876 char *new_buf = (char*) odr_malloc (p->odr, p->rec_len);
877 memcpy (new_buf, p->rec_buf, p->rec_len);
878 p->rec_buf = new_buf;
/* Release the working tree and the original tree (the guard on onode, if
 * any, is not visible here). */
882 data1_free_tree(p->dh, node);
884 data1_free_tree(p->dh, onode);
/* File-local record type descriptor for the GRS handler family.
 * NOTE(review): its initializer is not visible in this listing. */
889 static struct recType grs_type =
/* Public handle through which the rest of the program reaches the
 * descriptor above. */
898 RecType recTypeGrs = &grs_type;