2 * Copyright (C) 1994-2001, Index Data
6 * Revision 1.35 2001-03-29 21:31:31 adam
7 * Fixed "record begin" for Tcl filter.
9 * Revision 1.34 2000/11/29 14:24:01 adam
10 * Script configure uses yaz pthreads options. Added locking for
11 * zebra_register_{lock,unlock}.
13 * Revision 1.33 1999/11/30 13:48:04 adam
14 * Improved installation. Updated for inclusion of YAZ header files.
16 * Revision 1.32 1999/09/07 07:19:21 adam
17 * Work on character mapping. Implemented replace rules.
19 * Revision 1.31 1999/07/14 13:05:29 adam
20 * Tcl filter works with objects when TCL is version 8 or later; filter
21 * works with strings otherwise (slow).
23 * Revision 1.30 1999/07/14 10:55:28 adam
26 * Revision 1.29 1999/07/12 07:27:54 adam
27 * Improved speed of Tcl processing. Fixed one memory leak.
29 * Revision 1.28 1999/07/06 12:26:04 adam
30 * Fixed filters so that MS-DOS CR is ignored.
32 * Revision 1.27 1999/06/28 13:25:40 quinn
33 * Improved diagnostics for Tcl
35 * Revision 1.26 1999/05/26 07:49:14 adam
38 * Revision 1.25 1999/05/25 12:33:32 adam
39 * Fixed bug in Tcl filter.
41 * Revision 1.24 1999/05/21 11:08:46 adam
42 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
43 * script so that it reads uninstalled Tcl source.
45 * Revision 1.23 1999/05/20 12:57:18 adam
46 * Implemented TCL filter. Updated recctrl system.
48 * Revision 1.22 1998/11/03 16:07:13 adam
51 * Revision 1.21 1998/11/03 15:43:39 adam
52 * Fixed bug introduced by previous commit.
54 * Revision 1.20 1998/11/03 14:51:28 adam
55 * Changed code so that it creates as few data1 nodes as possible.
57 * Revision 1.19 1998/11/03 10:22:39 adam
58 * Fixed memory leak that could occur when large data1 nodes were
59 * concatenated. Data-type data1_nodes may have multiple nodes.
61 * Revision 1.18 1998/10/15 13:11:47 adam
62 * Added support for option -record for "end element". When specified
63 * end element will mark end-of-record when at outer-level.
65 * Revision 1.17 1998/07/01 10:13:51 adam
68 * Revision 1.16 1998/06/30 15:15:09 adam
69 * Tags are trimmed: white space removed before and after the tag.
71 * Revision 1.15 1998/06/30 12:55:45 adam
74 * Revision 1.14 1998/03/05 08:41:00 adam
75 * Implemented rule contexts.
77 * Revision 1.13 1997/12/12 06:33:58 adam
78 * Fixed bug that showed up when multiple filters were used.
79 * Made one routine thread-safe.
81 * Revision 1.12 1997/11/18 10:03:24 adam
82 * Member num_children removed from data1_node.
84 * Revision 1.11 1997/11/06 11:41:01 adam
85 * Implemented "begin variant" for the sgml.regx filter.
87 * Revision 1.10 1997/10/31 12:36:12 adam
88 * Minor change that avoids compiler warning.
90 * Revision 1.9 1997/09/29 09:02:49 adam
91 * Fixed small bug (introduced by previous commit).
93 * Revision 1.8 1997/09/17 12:19:22 adam
94 * Zebra version corresponds to YAZ version 1.4.
95 * Changed Zebra server so that it doesn't depend on global common_resource.
97 * Revision 1.7 1997/07/15 16:33:07 adam
98 * Check for zero length in execData.
100 * Revision 1.6 1997/02/24 10:41:51 adam
101 * Cleanup of code and commented out the "end element-end-record" code.
103 * Revision 1.5 1997/02/19 16:22:33 adam
104 * Fixed "end element" to terminate record in outer-most level.
106 * Revision 1.4 1997/02/12 20:42:58 adam
107 * Changed some log messages.
109 * Revision 1.3 1996/11/08 14:05:33 adam
110 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
112 * Revision 1.2 1996/10/29 14:02:09 adam
113 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
114 * data1_get_tabpath is used.
116 * Revision 1.1 1996/10/11 10:57:30 adam
117 * New module recctrl. Used to manage records (extract/retrieval).
119 * Revision 1.24 1996/06/17 14:25:31 adam
120 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
122 * Revision 1.23 1996/06/04 10:19:00 adam
123 * Minor changes - removed include of ctype.h.
125 * Revision 1.22 1996/06/03 15:23:13 adam
126 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
128 * Revision 1.21 1996/05/14 16:58:38 adam
131 * Revision 1.20 1996/05/01 13:46:36 adam
132 * First work on multiple records in one file.
133 * New option, -offset, to the "unread" command in the filter module.
135 * Revision 1.19 1996/02/12 16:18:20 adam
136 * Yet another bug fix in implementation of unread command.
138 * Revision 1.18 1996/02/12 16:07:54 adam
139 * Bug fix in new unread command.
141 * Revision 1.17 1996/02/12 15:56:11 adam
142 * New code command: unread.
144 * Revision 1.16 1996/01/17 14:57:51 adam
145 * Prototype changed for reader functions in extract/retrieve. File
146 * is identified by 'void *' instead of 'int'.
148 * Revision 1.15 1996/01/08 19:15:47 adam
149 * New input filter that works!
151 * Revision 1.14 1996/01/08 09:10:38 adam
152 * Yet another complete rework on this module.
154 * Revision 1.13 1995/12/15 17:21:50 adam
155 * This version is able to set data.formatted_text in data1-nodes.
157 * Revision 1.12 1995/12/15 16:20:10 adam
158 * The filter files (*.flt) are read from the path given by data1_tabpath.
160 * Revision 1.11 1995/12/15 12:35:16 adam
163 * Revision 1.10 1995/12/15 10:35:36 adam
166 * Revision 1.9 1995/12/14 16:38:48 adam
167 * Completely new attempt to make regular expression parsing.
169 * Revision 1.8 1995/12/13 17:16:59 adam
172 * Revision 1.7 1995/12/13 16:51:58 adam
173 * Modified to set last_child in data1_nodes.
174 * Uses destroy handler to free up data text nodes.
176 * Revision 1.6 1995/12/13 13:45:37 quinn
177 * Changed data1 to use nmem.
179 * Revision 1.5 1995/12/11 09:12:52 adam
180 * The rec_get function returns NULL if record doesn't exist - will
181 * happen in the server if the result set records have been deleted since
182 * the creation of the set (i.e. the search).
183 * The server saves a result temporarily if it is 'volatile', i.e. the
184 * set is register dependent.
186 * Revision 1.4 1995/12/05 16:57:40 adam
187 * More work on regular patterns.
189 * Revision 1.3 1995/12/05 09:37:09 adam
190 * One malloc was renamed to xmalloc.
192 * Revision 1.2 1995/12/04 17:59:24 adam
193 * More work on regular expression conversion.
195 * Revision 1.1 1995/12/04 14:25:30 adam
196 * Started work on regular expression parsed input to structured records.
204 #include <yaz/tpath.h>
205 #include <zebrautl.h>
212 #if MAJOR_VERSION >= 8
213 #define HAVE_TCL_OBJECTS
219 #define F_WIN_EOF 2000000000
223 #define REGX_PATTERN 1
228 #define REGX_CONTEXT 6
/* One step of a rule action list. The members visible here are a pattern
 * DFA, a code fragment and the link to the next action. */
238 struct lexRuleAction {
242 struct DFA *dfa; /* REGX_PATTERN */
245 struct regxCode *code; /* REGX_CODE */
247 struct lexRuleAction *next;
/* Head of the action list of one rule (used elsewhere as info.actionList). */
252 struct lexRuleAction *actionList;
/* A rule: its info record and the link to the next rule of the context. */
256 struct lexRuleInfo info;
257 struct lexRule *next;
/* Context members: rule list, table of rule info indexed by rule number,
 * the begin/end/init action lists and the link to the next context. */
263 struct lexRule *rules;
264 struct lexRuleInfo **fastRule;
268 struct lexRuleAction *beginActionList;
269 struct lexRuleAction *endActionList;
270 struct lexRuleAction *initActionList;
271 struct lexContext *next;
/* Per-level concatenation buffer; its members max and buf are used by the
 * data handling code in this file. */
274 struct lexConcatBuf {
/* Filter specification members: list of contexts, stack of active contexts,
 * Tcl interpreter, file window state, concatenation buffers and the stack
 * of data1 nodes under construction. */
281 struct lexContext *context;
283 struct lexContext **context_stack;
284 int context_stack_size;
285 int context_stack_top;
291 Tcl_Interp *tcl_interp;
294 void (*f_win_ef)(void *, off_t);
296 int f_win_start; /* first byte of buffer is this file offset */
297 int f_win_end; /* last byte of buffer is this offset - 1 */
298 int f_win_size; /* size of buffer */
299 char *f_win_buf; /* buffer itself */
/* Read and seek callbacks; both are called with f_win_fh as first argument. */
300 int (*f_win_rf)(void *, char *, size_t);
301 off_t (*f_win_sf)(void *, off_t);
303 struct lexConcatBuf *concatBuf;
305 data1_node **d1_stack;
/* NOTE(review): the struct that encloses this member is not visible here. */
316 struct lexSpec *spec;
/*
 * f_win_get: return a pointer into the file window buffer for the file
 * offsets start_pos up to end_pos and store the number of usable bytes in
 * *size. The request is served from f_win_buf when it is already buffered;
 * otherwise the buffer is refilled through the seek (f_win_sf) and read
 * (f_win_rf) callbacks.
 * NOTE(review): several lines of this function are not visible here.
 */
319 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
322 int i, r, off = start_pos - spec->f_win_start;
/* Case 1: the whole range is already inside the buffer. */
324 if (off >= 0 && end_pos <= spec->f_win_end)
326 *size = end_pos - start_pos;
327 return spec->f_win_buf + off;
/* Case 2: the start lies outside the buffer; seek and reload from start_pos. */
329 if (off < 0 || start_pos >= spec->f_win_end)
331 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
332 spec->f_win_start = start_pos;
/* The buffer is allocated on first use. */
334 if (!spec->f_win_buf)
335 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
336 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
338 spec->f_win_end = spec->f_win_start + *size;
/* Never report more bytes than were asked for. */
340 if (*size > end_pos - start_pos)
341 *size = end_pos - start_pos;
342 return spec->f_win_buf;
/* Case 3: the start is buffered but the end is not; move the retained bytes
 * to the front of the buffer and read more data behind them. */
344 for (i = 0; i<spec->f_win_end - start_pos; i++)
345 spec->f_win_buf[i] = spec->f_win_buf[i + off];
346 r = (*spec->f_win_rf)(spec->f_win_fh,
348 spec->f_win_size - i);
349 spec->f_win_start = start_pos;
350 spec->f_win_end += r;
352 if (*size > end_pos - start_pos)
353 *size = end_pos - start_pos;
354 return spec->f_win_buf;
/*
 * f_win_advance: fetch the byte at file offset *pos. When the offset is
 * inside the buffered window the byte is returned and *pos is incremented.
 * Otherwise *pos is compared with F_WIN_EOF and one byte is requested with
 * f_win_get.
 * NOTE(review): what happens after these two steps is not visible here.
 */
357 static int f_win_advance (struct lexSpec *spec, int *pos)
362 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
363 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
364 if (*pos == F_WIN_EOF)
366 buf = f_win_get (spec, *pos, *pos+1, &size);
/*
 * regxCodeDel: operates on the regxCode that *pp points to. The visible
 * part drops one reference on its Tcl object.
 * NOTE(review): the remaining cleanup and any conditional compilation
 * around the Tcl call are not visible here.
 */
376 static void regxCodeDel (struct regxCode **pp)
378 struct regxCode *p = *pp;
383 Tcl_DecrRefCount (p->tcl_obj);
/*
 * regxCodeMk: allocate a regxCode, copy len bytes of buf into a newly
 * allocated str of len+1 bytes, and create a Tcl string object from the
 * same bytes, taking one reference on it.
 * NOTE(review): termination of str and the store through pp presumably
 * happen on lines not visible here.
 */
391 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
395 p = (struct regxCode *) xmalloc (sizeof(*p));
396 p->str = (char *) xmalloc (len+1);
397 memcpy (p->str, buf, len);
400 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
402 Tcl_IncrRefCount (p->tcl_obj);
/*
 * lexSpecDFA: return a DFA prepared for the filter pattern syntax. The
 * visible part adjusts the parser character map: the entries for space and
 * tab are deleted and an entry for the slash character is added.
 * NOTE(review): creation and return of dfa are not visible here.
 */
407 static struct DFA *lexSpecDFA (void)
412 dfa_parse_cmap_del (dfa, ' ');
413 dfa_parse_cmap_del (dfa, '\t');
414 dfa_parse_cmap_add (dfa, '/', 0);
/*
 * actionListDel: walk the action list that starts at *rap. For the cases
 * visible here a pattern action has its DFA deleted and a code action has
 * its regxCode deleted.
 * NOTE(review): ra1 presumably holds the next pointer saved before the
 * node is freed; those lines are not visible here.
 */
418 static void actionListDel (struct lexRuleAction **rap)
420 struct lexRuleAction *ra1, *ra;
422 for (ra = *rap; ra; ra = ra1)
428 dfa_delete (&ra->u.pattern.dfa);
431 regxCodeDel (&ra->u.code);
/*
 * lexContextCreate: allocate a lexContext, store a duplicate of name,
 * attach a DFA obtained from lexSpecDFA and start with empty begin, end
 * and init action lists.
 * NOTE(review): initialisation of the other members and the return
 * statement are not visible here.
 */
439 static struct lexContext *lexContextCreate (const char *name)
441 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
443 p->name = xstrdup (name);
446 p->dfa = lexSpecDFA ();
449 p->beginActionList = NULL;
450 p->endActionList = NULL;
451 p->initActionList = NULL;
/*
 * lexContextDestroy: tear down context p. Deletes its DFA, deletes the
 * action list of every rule in p->rules, then deletes the begin, end and
 * init action lists of the context.
 * NOTE(review): freeing of the rule nodes and of p itself is not visible
 * here.
 */
456 static void lexContextDestroy (struct lexContext *p)
458 struct lexRule *rp, *rp1;
460 dfa_delete (&p->dfa);
462 for (rp = p->rules; rp; rp = rp1)
465 actionListDel (&rp->info.actionList);
468 actionListDel (&p->beginActionList);
469 actionListDel (&p->endActionList);
470 actionListDel (&p->initActionList);
/*
 * lexSpecCreate: allocate a lexSpec for the filter called name. Copies the
 * name, allocates a context stack with room for 100 entries, allocates
 * maxLevel concatenation buffers (each starting empty) and a data1 node
 * stack of maxLevel entries.
 * NOTE(review): the assignment of maxLevel, the use of dh and the return
 * statement are not visible here.
 */
475 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
480 p = (struct lexSpec *) xmalloc (sizeof(*p));
481 p->name = (char *) xmalloc (strlen(name)+1);
482 strcpy (p->name, name);
489 p->context_stack_size = 100;
490 p->context_stack = (struct lexContext **)
491 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
495 p->concatBuf = (struct lexConcatBuf *)
496 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
497 for (i = 0; i < p->maxLevel; i++)
499 p->concatBuf[i].max = 0;
500 p->concatBuf[i].buf = 0;
502 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/*
 * lexSpecDestroy: release the resources of the lexSpec referenced by pp.
 * Frees every concatenation buffer and the buffer array, destroys each
 * context in the context list, deletes the Tcl interpreter and frees the
 * file window buffer and the context stack.
 * NOTE(review): guards around these steps and the freeing of the spec
 * itself are not visible here.
 */
507 static void lexSpecDestroy (struct lexSpec **pp)
510 struct lexContext *lt;
518 for (i = 0; i < p->maxLevel; i++)
519 xfree (p->concatBuf[i].buf);
520 xfree (p->concatBuf);
/* The next pointer is saved before the context is destroyed. */
525 struct lexContext *lt_next = lt->next;
526 lexContextDestroy (lt);
531 Tcl_DeleteInterp (p->tcl_interp);
534 xfree (p->f_win_buf);
535 xfree (p->context_stack);
/*
 * readParseToken: scan one token of a filter specification starting at
 * *cpp. Leading space, tab, newline and carriage return are skipped. In
 * the visible branches a keyword is collected into cmd with upper case
 * letters folded to lower case, a bad character is logged and the input is
 * skipped up to the next white space, and the collected keyword is compared
 * with begin, end, body, context and init; anything else is logged as a bad
 * command.
 * NOTE(review): the returned token codes and the use of len are not
 * visible here.
 */
541 static int readParseToken (const char **cpp, int *len)
543 const char *cp = *cpp;
547 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
576 if (*cp >= 'a' && *cp <= 'z')
578 else if (*cp >= 'A' && *cp <= 'Z')
579 cmd[i] = *cp + 'a' - 'A';
/* Only advance while there is still room left in cmd. */
582 if (i < (int) sizeof(cmd)-2)
589 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
591 while (*cp && *cp != ' ' && *cp != '\t' &&
592 *cp != '\n' && *cp != '\r')
598 if (!strcmp (cmd, "begin"))
600 else if (!strcmp (cmd, "end"))
602 else if (!strcmp (cmd, "body"))
604 else if (!strcmp (cmd, "context"))
606 else if (!strcmp (cmd, "init"))
610 logf (LOG_WARN, "bad command %s", cmd);
/*
 * actionListMk: parse the action text s into a list of lexRuleAction nodes
 * stored through ap. Tokens are read with readParseToken until it returns
 * zero. In the visible branches a code token becomes a node holding a
 * regxCode, a pattern token becomes a node holding a DFA that is parsed
 * from s and then compiled with dfa_mkstate, and the BEGIN and INIT
 * keywords are rejected with a warning.
 * NOTE(review): the case labels, the advance of ap and the return value
 * are not visible here.
 */
616 static int actionListMk (struct lexSpec *spec, const char *s,
617 struct lexRuleAction **ap)
623 while ((tok = readParseToken (&s, &len)))
631 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
633 regxCodeMk (&(*ap)->u.code, s, len);
637 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
639 (*ap)->u.pattern.body = bodyMark;
641 (*ap)->u.pattern.dfa = lexSpecDFA ();
643 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
648 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
651 dfa_mkstate ((*ap)->u.pattern.dfa);
655 logf (LOG_WARN, "cannot use BEGIN here");
658 logf (LOG_WARN, "cannot use INIT here");
661 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/*
 * readOneSpec: parse one logical line s of a filter specification into
 * spec. A CONTEXT line creates a new context and links it at the head of
 * spec->context. Begin, end and init lines replace the matching action
 * list of the current context. Any other line is treated as a rule: its
 * pattern is parsed into the DFA of the context, a lexRule numbered with
 * the running ruleNo is pushed at the head of the rule list and its
 * actions are parsed.
 * NOTE(review): the branch conditions and return values are not visible
 * here.
 */
671 int readOneSpec (struct lexSpec *spec, const char *s)
675 struct lexContext *lc;
677 tok = readParseToken (&s, &len);
678 if (tok == REGX_CONTEXT)
680 char context_name[32];
681 tok = readParseToken (&s, &len);
682 if (tok != REGX_CODE)
684 logf (LOG_WARN, "missing name after CONTEXT keyword");
/* NOTE(review): no check of len against the 32 byte context_name buffer is
 * visible before this copy; confirm that one exists on the missing lines. */
689 memcpy (context_name, s, len);
690 context_name[len] = '\0';
691 lc = lexContextCreate (context_name);
692 lc->next = spec->context;
/* Default context named main; presumably created when none exists yet. */
697 spec->context = lexContextCreate ("main");
702 actionListDel (&spec->context->beginActionList);
703 actionListMk (spec, s, &spec->context->beginActionList);
706 actionListDel (&spec->context->endActionList);
707 actionListMk (spec, s, &spec->context->endActionList);
710 actionListDel (&spec->context->initActionList);
711 actionListMk (spec, s, &spec->context->initActionList);
715 logf (LOG_LOG, "rule %d %s", spec->context->ruleNo, s);
717 r = dfa_parse (spec->context->dfa, &s);
720 logf (LOG_WARN, "regular expression error. r=%d", r);
725 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
729 rp = (struct lexRule *) xmalloc (sizeof(*rp));
730 rp->info.no = spec->context->ruleNo++;
731 rp->next = spec->context->rules;
732 spec->context->rules = rp;
733 actionListMk (spec, s, &rp->info.actionList);
/*
 * readFileSpec: locate and read the filter specification file for spec.
 * The file name is the spec name with the suffix .tflt (tried when a Tcl
 * interpreter is present) or .flt, opened through the data1 table path.
 * Logical lines are collected in a WRBUF and handed to readOneSpec. After
 * reading, every context gets a fastRule table that maps a rule number to
 * its rule info, and its DFA is compiled with dfa_mkstate.
 * NOTE(review): the declaration of fname, the line reading loop and the
 * return value are not visible here.
 */
738 int readFileSpec (struct lexSpec *spec)
740 struct lexContext *lc;
741 int c, i, errors = 0;
747 if (spec->tcl_interp)
/* NOTE(review): sprintf into fname is unbounded; the size of fname is not
 * visible here, so confirm it can hold the name plus suffix. */
749 sprintf (fname, "%s.tflt", spec->name);
750 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
755 sprintf (fname, "%s.flt", spec->name);
756 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
760 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
763 logf (LOG_LOG, "reading regx filter %s", fname);
765 if (spec->tcl_interp)
766 logf (LOG_LOG, "Tcl enabled");
768 lineBuf = wrbuf_alloc();
773 wrbuf_rewind (lineBuf);
/* Lines starting with a hash mark or white space are skipped to end of line. */
774 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
776 while (c != '\n' && c != EOF)
789 wrbuf_putc(lineBuf, c);
797 if (c != ' ' && c != '\t')
/* Terminate the collected text and parse it as one specification. */
802 wrbuf_putc(lineBuf, '\0');
803 readOneSpec (spec, wrbuf_buf(lineBuf));
804 spec->lineNo += addLine;
808 wrbuf_free(lineBuf, 1);
813 debug_dfa_followpos = 1;
/* Build the rule number to rule info table of each context. */
816 for (lc = spec->context; lc; lc = lc->next)
819 lc->fastRule = (struct lexRuleInfo **)
820 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
821 for (i = 0; i < lc->ruleNo; i++)
822 lc->fastRule[i] = NULL;
823 for (rp = lc->rules; rp; rp = rp->next)
824 lc->fastRule[rp->info.no] = &rp->info;
825 dfa_mkstate (lc->dfa);
834 static struct lexSpec *curLexSpec = NULL;
/*
 * execData: add elen bytes of text from ebuf to the record being built.
 * When the node at the current level is already a data node the text is
 * appended to it; otherwise a new text data node is created below the
 * parent at d1_level - 1 and linked in. The bytes are accumulated in the
 * concatenation buffer of the current level, which grows when needed, and
 * the length of the node is increased by elen.
 * NOTE(review): the branch structure and the early exits are not visible
 * here.
 */
837 static void execData (struct lexSpec *spec,
838 const char *ebuf, int elen, int formatted_text)
840 struct data1_node *res, *parent;
843 if (elen == 0) /* shouldn't happen, but it does! */
847 logf (LOG_LOG, "data (%d bytes) %.15s ... %.*s", elen,
848 ebuf, 15, ebuf + elen-15);
850 logf (LOG_LOG, "data (%d bytes) %.*s", elen, elen, ebuf);
852 logf (LOG_LOG, "data (%d bytes)", elen);
855 if (spec->d1_level <= 1)
858 parent = spec->d1_stack[spec->d1_level -1];
/* Reuse the current node when it is a data node; org_len is its length. */
861 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
862 org_len = res->u.data.len;
867 res = data1_mk_node (spec->dh, spec->m);
868 res->parent = parent;
869 res->which = DATA1N_data;
870 res->u.data.what = DATA1I_text;
872 res->u.data.formatted_text = formatted_text;
874 if (elen > DATA1_LOCALDATA)
875 res->u.data.data = nmem_malloc (spec->m, elen);
877 res->u.data.data = res->lbuf;
878 memcpy (res->u.data.data, ebuf, elen);
880 res->u.data.data = 0;
882 res->root = parent->root;
884 parent->last_child = res;
885 if (spec->d1_stack[spec->d1_level])
886 spec->d1_stack[spec->d1_level]->next = res;
889 spec->d1_stack[spec->d1_level] = res;
/* Grow the concatenation buffer of this level, with 256 bytes of slack. */
891 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
893 char *old_buf, *new_buf;
895 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
896 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
897 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
899 memcpy (new_buf, old_buf, org_len);
902 spec->concatBuf[spec->d1_level].buf = new_buf;
904 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
905 res->u.data.len += elen;
/* execDataP: forwards its arguments unchanged to execData. */
908 static void execDataP (struct lexSpec *spec,
909 const char *ebuf, int elen, int formatted_text)
911 execData (spec, ebuf, elen, formatted_text);
/*
 * tagDataRelease: finalise the text data node at the current level, if
 * there is one. The node must not have its data pointer set yet and must
 * have a positive length. Storage is taken from the node local buffer when
 * the text fits in DATA1_LOCALDATA bytes and from nmem otherwise, and the
 * text is copied from the concatenation buffer of the current level.
 */
914 static void tagDataRelease (struct lexSpec *spec)
918 if ((res = spec->d1_stack[spec->d1_level]) &&
919 res->which == DATA1N_data &&
920 res->u.data.what == DATA1I_text)
922 assert (!res->u.data.data);
923 assert (res->u.data.len > 0);
924 if (res->u.data.len > DATA1_LOCALDATA)
925 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
927 res->u.data.data = res->lbuf;
928 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
/*
 * variantBegin: open a variant for the given class, type and value
 * strings, each passed with an explicit length. Class and type are copied
 * into local buffers and truncated to DATA1_MAX_SYMBOL - 1 characters. If
 * the parent is not a variant node, an empty variant node is pushed first.
 * A variant node carrying the looked-up type and the value (truncated to
 * DATA1_LOCALDATA - 1 characters) is then linked under the parent and
 * pushed on the node stack.
 * NOTE(review): several lines, including the lookup result handling, are
 * not visible here.
 */
933 static void variantBegin (struct lexSpec *spec,
934 const char *class_str, int class_len,
935 const char *type_str, int type_len,
936 const char *value_str, int value_len)
/* NOTE(review): this initializer reads d1_stack[d1_level - 1] before the
 * d1_level == 0 check below, so it indexes element -1 when no record has
 * been started. */
938 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
939 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
944 if (spec->d1_level == 0)
946 logf (LOG_WARN, "in variant begin. No record type defined");
949 if (class_len >= DATA1_MAX_SYMBOL)
950 class_len = DATA1_MAX_SYMBOL-1;
951 memcpy (tclass, class_str, class_len);
952 tclass[class_len] = '\0';
954 if (type_len >= DATA1_MAX_SYMBOL)
955 type_len = DATA1_MAX_SYMBOL-1;
956 memcpy (ttype, type_str, type_len);
957 ttype[type_len] = '\0';
960 logf (LOG_LOG, "variant begin %s %s (%d)", tclass, ttype,
965 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* Parent is not a variant: push an empty variant node first. */
969 if (parent->which != DATA1N_variant)
971 res = data1_mk_node (spec->dh, spec->m);
972 res->parent = parent;
973 res->which = DATA1N_variant;
974 res->u.variant.type = 0;
975 res->u.variant.value = 0;
976 res->root = parent->root;
978 parent->last_child = res;
979 if (spec->d1_stack[spec->d1_level])
981 tagDataRelease (spec);
982 spec->d1_stack[spec->d1_level]->next = res;
986 spec->d1_stack[spec->d1_level] = res;
987 spec->d1_stack[++(spec->d1_level)] = NULL;
/* Search the enclosing variant nodes for one with the same type. */
989 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
990 if (spec->d1_stack[i]->u.variant.type == tp)
997 logf (LOG_LOG, "variant node (%d)", spec->d1_level);
999 parent = spec->d1_stack[spec->d1_level-1];
1000 res = data1_mk_node (spec->dh, spec->m);
1001 res->parent = parent;
1002 res->which = DATA1N_variant;
1003 res->root = parent->root;
1004 res->u.variant.type = tp;
/* The value is stored in the node local buffer, truncated if needed. */
1006 if (value_len >= DATA1_LOCALDATA)
1007 value_len =DATA1_LOCALDATA-1;
1008 memcpy (res->lbuf, value_str, value_len);
1009 res->lbuf[value_len] = '\0';
1011 res->u.variant.value = res->lbuf;
1013 parent->last_child = res;
1014 if (spec->d1_stack[spec->d1_level])
1016 tagDataRelease (spec);
1017 spec->d1_stack[spec->d1_level]->next = res;
1020 parent->child = res;
1021 spec->d1_stack[spec->d1_level] = res;
1022 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagStrip: trim white space from both ends of the tag described by *tag
 * and *len. The first loop scans backwards over trailing white space and
 * the second scans forwards over leading white space.
 * NOTE(review): the updates of *tag and *len are not visible here. The
 * characters are passed to isspace as plain char; confirm whether bytes
 * above 127 can occur.
 */
1025 static void tagStrip (const char **tag, int *len)
1029 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
1032 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/*
 * tagBegin: open an element named by tag (len bytes, not terminated).
 * Warns when no record has been started. Otherwise the name is trimmed,
 * a tag node is created under the parent at d1_level - 1, the name is
 * copied into the node local buffer or, when it is DATA1_LOCALDATA bytes or
 * longer, into nmem storage, the element definition is looked up by tag
 * name, and the node is linked in and pushed on the node stack.
 * NOTE(review): several lines, including how e and partag are used, are
 * not visible here.
 */
1038 static void tagBegin (struct lexSpec *spec,
1039 const char *tag, int len)
1041 struct data1_node *parent;
1042 data1_element *elem = NULL;
1045 data1_element *e = NULL;
1048 if (spec->d1_level == 0)
1050 logf (LOG_WARN, "in element begin. No record type defined");
1053 tagStrip (&tag, &len);
1055 parent = spec->d1_stack[spec->d1_level -1];
1056 partag = get_parent_tag(spec->dh, parent);
1058 res = data1_mk_node_type (spec->dh, spec->m, DATA1N_tag);
1059 res->parent = parent;
1061 if (len >= DATA1_LOCALDATA)
1062 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
1064 res->u.tag.tag = res->lbuf;
1066 memcpy (res->u.tag.tag, tag, len);
1067 res->u.tag.tag[len] = '\0';
1070 logf (LOG_LOG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1072 if (parent->which == DATA1N_variant)
1075 if (!(e = partag->u.tag.element))
1078 elem = data1_getelementbytagname (spec->dh,
1079 spec->d1_stack[0]->u.root.absyn,
1081 res->u.tag.element = elem;
1082 res->root = parent->root;
1084 parent->last_child = res;
/* A previous sibling exists: finalise its pending text and chain to it. */
1085 if (spec->d1_stack[spec->d1_level])
1087 tagDataRelease (spec);
1088 spec->d1_stack[spec->d1_level]->next = res;
1091 parent->child = res;
1092 spec->d1_stack[spec->d1_level] = res;
1093 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagEnd: close elements while d1_level is above min_level. The tag name
 * is trimmed first. Each iteration finalises pending text data and, in the
 * visible comparison, checks whether the node at the current level is a
 * tag node whose name has the same length and bytes as tag.
 * NOTE(review): the level decrement and the loop exit are not visible
 * here. Callers in this file also pass a NULL tag with len 0.
 */
1096 static void tagEnd (struct lexSpec *spec, int min_level,
1097 const char *tag, int len)
1099 tagStrip (&tag, &len);
1100 while (spec->d1_level > min_level)
1102 tagDataRelease (spec);
1104 if (spec->d1_level == 0)
1106 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1108 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1110 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1114 logf (LOG_LOG, "end tag (%d)", spec->d1_level);
/*
 * tryMatch: run the given DFA over the input starting at offset *pptr.
 * Characters are fetched with f_win_advance and the state follows the
 * transition whose character range contains the current character. When
 * a match is reported, *mptr receives the start of the match and *pptr the
 * offset just past its end.
 * NOTE(review): the loop structure, the end of file handling and the
 * return values are not visible here.
 */
1119 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1122 struct DFA_state *state = dfa->states[0];
1125 unsigned char c_prev = 0;
1126 int ptr = *pptr; /* current pointer */
1127 int start_ptr = *pptr; /* first char of match */
1128 int last_ptr = 0; /* last char of match */
1129 int last_rule = 0; /* rule number of current match */
1134 c = f_win_advance (spec, &ptr);
1135 if (ptr == F_WIN_EOF)
1152 *mptr = start_ptr; /* match starts here */
1153 *pptr = last_ptr; /* match end here (+1) */
1156 state = dfa->states[0];
1161 else if (c >= t->ch[0] && c <= t->ch[1])
1163 state = dfa->states[t->to];
1168 last_rule = state->rule_no;
1173 last_rule = state->rule_nno;
/*
 * execTok: read the next token of an action code string at *src and report
 * it through *tokBuf and *tokLen. Leading spaces and tabs are skipped. The
 * visible branches handle a dollar sign followed by a decimal number,
 * which refers to a matched argument whose text is fetched with f_win_get,
 * a double quoted string, a newline or semicolon, and plain words ending at
 * white space.
 * NOTE(review): the return codes and the update of *src are not visible
 * here. Callers in this file compare the result with 3.
 */
1185 static int execTok (struct lexSpec *spec, const char **src,
1186 const char **tokBuf, int *tokLen)
1188 const char *s = *src;
1190 while (*s == ' ' || *s == '\t')
1194 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1198 while (*s >= '0' && *s <= '9')
1199 n = n*10 + (*s++ -'0');
1200 if (spec->arg_no == 0)
1207 if (n >= spec->arg_no)
1209 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1213 else if (*s == '\"')
1216 while (*s && *s != '\"')
1218 *tokLen = s - *tokBuf;
1223 else if (*s == '\n' || *s == ';')
1231 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1234 *tokLen = s - *tokBuf;
1241 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1244 *tokLen = s - *tokBuf;
/*
 * regxStrz: copy len bytes from src into the caller supplied buffer str.
 * NOTE(review): any length limit, the terminator and the return value are
 * on lines not visible here. Callers use the result as a C string.
 */
1250 static char *regxStrz (const char *src, int len, char *str)
1254 memcpy (str, src, len);
/*
 * cmd_tcl_begin: Tcl command callback; clientData is the lexSpec. The
 * sub-command in argv[1] selects the action: record (creates a root node
 * for the named abstract syntax and pushes it), element (calls tagBegin),
 * variant (calls variantBegin with class, type and value) or context
 * (pushes the named context on the context stack, warning when unknown).
 * NOTE(review): argv[1] is compared before argc is tested in the visible
 * conditions; confirm that argc is validated on the lines not shown.
 */
1260 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1261 int argc, char **argv)
1263 struct lexSpec *spec = (struct lexSpec *) clientData;
1266 if (!strcmp(argv[1], "record") && argc == 3)
1268 char *absynName = argv[2];
1272 logf (LOG_LOG, "begin record %s", absynName);
1274 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1275 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1280 res = data1_mk_node (spec->dh, spec->m);
1281 res->which = DATA1N_root;
1283 data1_insert_string(spec->dh, res, spec->m, absynName);
1284 res->u.root.absyn = absyn;
1287 spec->d1_stack[spec->d1_level] = res;
1288 spec->d1_stack[++(spec->d1_level)] = NULL;
1291 else if (!strcmp(argv[1], "element") && argc == 3)
1293 tagBegin (spec, argv[2], strlen(argv[2]));
1295 else if (!strcmp (argv[1], "variant") && argc == 5)
1297 variantBegin (spec, argv[2], strlen(argv[2]),
1298 argv[3], strlen(argv[3]),
1299 argv[4], strlen(argv[4]));
1301 else if (!strcmp (argv[1], "context") && argc == 3)
1303 struct lexContext *lc = spec->context;
1305 logf (LOG_LOG, "begin context %s",argv[2]);
/* Linear search of the context list by name. */
1307 while (lc && strcmp (argv[2], lc->name))
/* NOTE(review): no check against context_stack_size is visible before this
 * push. */
1311 spec->context_stack[++(spec->context_stack_top)] = lc;
1314 logf (LOG_WARN, "unknown context %s", argv[2]);
/*
 * cmd_tcl_end: Tcl command callback; clientData is the lexSpec. The
 * sub-command in argv[1] selects the action: record (unwinds the node
 * stack, finalising pending text, and sets stop_flag), element (calls
 * tagEnd, with an optional -record flag, and sets stop_flag in the branch
 * that tests for level 0) or context (pops the context stack unless it is
 * already at the bottom).
 * NOTE(review): argument count checks and the derivation of min_level and
 * element are not visible here.
 */
1321 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1322 int argc, char **argv)
1324 struct lexSpec *spec = (struct lexSpec *) clientData;
1328 if (!strcmp (argv[1], "record"))
1330 while (spec->d1_level)
1332 tagDataRelease (spec);
1336 logf (LOG_LOG, "end record");
1338 spec->stop_flag = 1;
1340 else if (!strcmp (argv[1], "element"))
1344 if (argc >= 3 && !strcmp(argv[2], "-record"))
1353 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1354 if (spec->d1_level == 0)
1357 logf (LOG_LOG, "end element end records");
1359 spec->stop_flag = 1;
1362 else if (!strcmp (argv[1], "context"))
1365 logf (LOG_LOG, "end context");
1367 if (spec->context_stack_top)
1368 (spec->context_stack_top)--;
/*
 * cmd_tcl_data: Tcl command callback; clientData is the lexSpec. Parses
 * the options -text and -element (the latter followed by an element name),
 * then adds the argument at argv[argi] as data with execData. The visible
 * code also calls tagBegin with the element name before the data and
 * tagEnd down to level 1 after it.
 * NOTE(review): the option loop and the conditions around tagBegin and
 * tagEnd are not visible here.
 */
1375 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1376 int argc, char **argv)
1380 const char *element = 0;
1381 struct lexSpec *spec = (struct lexSpec *) clientData;
1385 if (!strcmp("-text", argv[argi]))
1390 else if (!strcmp("-element", argv[argi]))
1394 element = argv[argi++];
1400 tagBegin (spec, element, strlen(element));
1404 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1408 tagEnd (spec, 1, NULL, 0);
/*
 * cmd_tcl_unread: Tcl command callback; clientData is the lexSpec. Moves
 * the read position spec->ptr back to the start of matched argument no,
 * plus an optional offset given with -offset. An index at or beyond arg_no
 * is clamped to the last argument.
 * NOTE(review): both numbers are converted with atoi, which reports no
 * errors, and no check for a negative index is visible here.
 */
1412 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1413 int argc, char **argv)
1415 struct lexSpec *spec = (struct lexSpec *) clientData;
1422 if (!strcmp("-offset", argv[argi]))
1427 offset = atoi(argv[argi]);
1436 no = atoi(argv[argi]);
1437 if (no >= spec->arg_no)
1438 no = spec->arg_no - 1;
1439 spec->ptr = spec->arg_start[no] + offset;
/*
 * execTcl: run the Tcl code of an action. Each matched argument i is first
 * exported as a Tcl variable whose name is the decimal index i and whose
 * value is the matched text. The code is then evaluated, as a Tcl object
 * or as a string depending on the preprocessor branch, and a failure is
 * logged together with the Tcl errorInfo variable.
 */
1443 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1447 for (i = 0; i < spec->arg_no; i++)
1449 char var_name[10], *var_buf;
1452 sprintf (var_name, "%d", i);
1453 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* The byte after the match is replaced by a terminator for the duration of
 * the Tcl_SetVar call and restored afterwards. */
1457 ch = var_buf[var_len];
1458 var_buf[var_len] = '\0';
1459 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1460 var_buf[var_len] = ch;
/* NOTE(review): HAVE_TCL_OBJECTS is defined without a value near the top of
 * the file, and an empty macro is not a valid expression for the directive
 * below; confirm whether an ifdef test was intended. */
1463 #if HAVE_TCL_OBJECTS
1464 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1466 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1470 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1471 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1472 spec->tcl_interp->errorLine,
1473 spec->tcl_interp->result,
1474 err ? err : "[NO ERRORINFO]");
/*
 * execCode: interpret the built-in action language stored in code->str.
 * Tokens are read with execTok and dispatched on the command word:
 *   begin record | element | variant | context
 *   end record | element | context
 *   data [-text] [-element name] item ...
 *   unread [-offset n] index
 *   context name
 * Unknown commands and stray tokens are logged and skipped.
 * NOTE(review): the loop structure and many branch conditions are on lines
 * not visible here.
 */
1480 static void execCode (struct lexSpec *spec, struct regxCode *code)
1482 const char *s = code->str;
1484 const char *cmd_str;
1486 r = execTok (spec, &s, &cmd_str, &cmd_len);
1493 r = execTok (spec, &s, &cmd_str, &cmd_len);
1496 p = regxStrz (cmd_str, cmd_len, ptmp);
/* ---- begin ... ---- */
1497 if (!strcmp (p, "begin"))
1499 r = execTok (spec, &s, &cmd_str, &cmd_len);
1502 logf (LOG_WARN, "missing keyword after 'begin'");
1505 p = regxStrz (cmd_str, cmd_len, ptmp);
1506 if (!strcmp (p, "record"))
1508 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* A record is only started when the node stack is empty. */
1511 if (spec->d1_level == 0)
/* NOTE(review): the root type below points at this static buffer, so the
 * stored name is overwritten the next time a record begins. No check of
 * cmd_len against the 64 byte buffer is visible before the copy. */
1513 static char absynName[64];
1518 memcpy (absynName, cmd_str, cmd_len);
1519 absynName[cmd_len] = '\0';
1522 logf (LOG_LOG, "begin record %s", absynName);
1524 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1525 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1530 res = data1_mk_node (spec->dh, spec->m);
1531 res->which = DATA1N_root;
1532 res->u.root.type = absynName;
1533 res->u.root.absyn = absyn;
1536 spec->d1_stack[spec->d1_level] = res;
1537 spec->d1_stack[++(spec->d1_level)] = NULL;
1540 r = execTok (spec, &s, &cmd_str, &cmd_len);
1542 else if (!strcmp (p, "element"))
1544 r = execTok (spec, &s, &cmd_str, &cmd_len);
1547 tagBegin (spec, cmd_str, cmd_len);
1548 r = execTok (spec, &s, &cmd_str, &cmd_len);
1550 else if (!strcmp (p, "variant"))
1553 const char *class_str = NULL;
1555 const char *type_str = NULL;
1557 const char *value_str = NULL;
/* Three tokens are read in turn: class, type and value. */
1558 r = execTok (spec, &s, &cmd_str, &cmd_len);
1561 class_str = cmd_str;
1562 class_len = cmd_len;
1563 r = execTok (spec, &s, &cmd_str, &cmd_len);
1569 r = execTok (spec, &s, &cmd_str, &cmd_len);
1572 value_str = cmd_str;
1573 value_len = cmd_len;
1575 variantBegin (spec, class_str, class_len,
1576 type_str, type_len, value_str, value_len);
1579 r = execTok (spec, &s, &cmd_str, &cmd_len);
1581 else if (!strcmp (p, "context"))
1585 struct lexContext *lc = spec->context;
1586 r = execTok (spec, &s, &cmd_str, &cmd_len);
1587 p = regxStrz (cmd_str, cmd_len, ptmp);
1589 logf (LOG_LOG, "begin context %s", p);
1591 while (lc && strcmp (p, lc->name))
/* begin context pushes the context found on the context stack. */
1594 spec->context_stack[++(spec->context_stack_top)] = lc;
1596 logf (LOG_WARN, "unknown context %s", p);
1599 r = execTok (spec, &s, &cmd_str, &cmd_len);
1603 logf (LOG_WARN, "bad keyword '%s' after begin", p);
/* ---- end ... ---- */
1606 else if (!strcmp (p, "end"))
1608 r = execTok (spec, &s, &cmd_str, &cmd_len);
1611 logf (LOG_WARN, "missing keyword after 'end'");
1614 p = regxStrz (cmd_str, cmd_len, ptmp);
1615 if (!strcmp (p, "record"))
1617 while (spec->d1_level)
1619 tagDataRelease (spec);
1622 r = execTok (spec, &s, &cmd_str, &cmd_len);
1624 logf (LOG_LOG, "end record");
1626 spec->stop_flag = 1;
1628 else if (!strcmp (p, "element"))
1631 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1633 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1638 tagEnd (spec, min_level, cmd_str, cmd_len);
1639 r = execTok (spec, &s, &cmd_str, &cmd_len);
1642 tagEnd (spec, min_level, NULL, 0);
1643 if (spec->d1_level == 0)
1646 logf (LOG_LOG, "end element end records");
1648 spec->stop_flag = 1;
1652 else if (!strcmp (p, "context"))
1655 logf (LOG_LOG, "end context");
/* The bottom entry of the context stack is never popped. */
1657 if (spec->context_stack_top)
1658 (spec->context_stack_top)--;
1659 r = execTok (spec, &s, &cmd_str, &cmd_len);
1662 logf (LOG_WARN, "bad keyword '%s' after end", p);
/* ---- data ---- */
1664 else if (!strcmp (p, "data"))
1668 const char *element_str = NULL;
1670 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1672 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1674 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1676 r = execTok (spec, &s, &element_str, &element_len);
1681 logf (LOG_WARN, "bad data option: %.*s",
1686 logf (LOG_WARN, "missing data item after data");
1690 tagBegin (spec, element_str, element_len);
1693 execData (spec, cmd_str, cmd_len,textFlag);
1694 r = execTok (spec, &s, &cmd_str, &cmd_len);
1697 tagEnd (spec, 1, NULL, 0);
/* ---- unread ---- */
1699 else if (!strcmp (p, "unread"))
1702 r = execTok (spec, &s, &cmd_str, &cmd_len);
1703 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1705 r = execTok (spec, &s, &cmd_str, &cmd_len);
1708 logf (LOG_WARN, "missing number after -offset");
1711 p = regxStrz (cmd_str, cmd_len, ptmp);
1713 r = execTok (spec, &s, &cmd_str, &cmd_len);
1719 logf (LOG_WARN, "missing index after unread command");
/* The index must be exactly one decimal digit. */
1722 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1724 logf (LOG_WARN, "bad index after unread command");
1729 no = *cmd_str - '0';
1730 if (no >= spec->arg_no)
1731 no = spec->arg_no - 1;
1732 spec->ptr = spec->arg_start[no] + offset;
1734 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* ---- context: replace the top of the context stack ---- */
1736 else if (!strcmp (p, "context"))
1740 struct lexContext *lc = spec->context;
1741 r = execTok (spec, &s, &cmd_str, &cmd_len);
1742 p = regxStrz (cmd_str, cmd_len, ptmp);
1744 while (lc && strcmp (p, lc->name))
1747 spec->context_stack[spec->context_stack_top] = lc;
1749 logf (LOG_WARN, "unknown context %s", p);
1752 r = execTok (spec, &s, &cmd_str, &cmd_len);
1756 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1757 r = execTok (spec, &s, &cmd_str, &cmd_len);
1762 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1764 r = execTok (spec, &s, &cmd_str, &cmd_len);
/*
 * execAction: run the action list ap for a rule whose match starts at
 * start_ptr; *pptr is the current input offset. Local arg_start and
 * arg_end arrays record the file offsets of each matched argument and are
 * published through spec before the actions run. Pattern actions call
 * tryMatch to match further input, and code actions are executed through
 * execTcl when a Tcl interpreter is present or through execCode.
 * NOTE(review): the switch over the action kinds, the stepping of arg_no
 * and the return values are not visible here.
 */
1771 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1772 int start_ptr, int *pptr)
1781 arg_start[0] = start_ptr;
1783 spec->arg_start = arg_start;
1784 spec->arg_end = arg_end;
/* Pattern marked as body: the text before the match and the match itself
 * are recorded as separate arguments. */
1791 if (ap->u.pattern.body)
1793 arg_start[arg_no] = *pptr;
1794 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1796 arg_end[arg_no] = F_WIN_EOF;
1798 arg_start[arg_no] = F_WIN_EOF;
1799 arg_end[arg_no] = F_WIN_EOF;
1804 arg_end[arg_no] = sptr;
1806 arg_start[arg_no] = sptr;
1807 arg_end[arg_no] = *pptr;
/* Plain pattern: the result is also checked against the current position. */
1812 arg_start[arg_no] = *pptr;
1813 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1815 if (sptr != arg_start[arg_no])
1817 arg_end[arg_no] = *pptr;
1822 spec->arg_no = arg_no;
1825 if (spec->tcl_interp)
1826 execTcl(spec, ap->u.code);
1828 execCode (spec, ap->u.code);
1830 execCode (spec, ap->u.code);
1833 if (spec->stop_flag)
1837 arg_start[arg_no] = *pptr;
1838 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: run the action list of rule number ruleNo from `context`.
 * The list is looked up through context->fastRule[ruleNo] and handed to
 * execAction, whose result is returned.
 * NOTE(review): the remaining arguments of the execAction call are on a line
 * that is not part of this listing.
 */
1847 static int execRule (struct lexSpec *spec, struct lexContext *context,
1848 int ruleNo, int start_ptr, int *pptr)
/* Trace message (any guard around it is not shown). */
1851 logf (LOG_LOG, "exec rule %d", ruleNo);
1853 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: scan input starting at *ptr with the DFA of the context on top of
 * spec's context stack, running the matching rule's actions via execRule and
 * passing text that matched no rule to execDataP.
 *
 * NOTE(review): this listing omits lines of the function (loop header,
 * braces, several conditions and all return statements). The comments
 * describe only what the visible statements do.
 */
1857 data1_node *lexNode (struct lexSpec *spec, int *ptr)
/* Active context is the top of the context stack; scanning starts in DFA state 0. */
1859 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1860 struct DFA_state *state = context->dfa->states[0];
/* The previous-character variable is initialised to a newline. */
1863 unsigned char c_prev = '\n';
1865 int last_rule = 0; /* rule number of current match */
1866 int last_ptr = *ptr; /* last char of match */
1867 int start_ptr = *ptr; /* first char of match */
1868 int skip_ptr = *ptr; /* first char of run */
/* Fetch the next character and advance *ptr. */
1872 c = f_win_advance (spec, ptr);
1873 if (*ptr == F_WIN_EOF)
1875 /* end of file met */
1878 /* there was a match */
1879 if (skip_ptr < start_ptr)
1881 /* deal with chars that didn't match */
/* Hand the unmatched run [skip_ptr, start_ptr) to execDataP. */
1884 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1885 execDataP (spec, buf, size, 0);
1887 /* restore pointer */
/* Run the matched rule; a zero result is tested here (consequence not shown). */
1890 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1892 /* restore skip pointer */
/* No pending match at end of input: flush whatever lies between skip_ptr and *ptr. */
1896 else if (skip_ptr < *ptr)
1898 /* deal with chars that didn't match */
1901 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1902 execDataP (spec, buf, size, 0);
1904 if (*ptr == F_WIN_EOF)
1911 { /* no transition for character c ... */
1914 if (skip_ptr < start_ptr)
1916 /* deal with chars that didn't match */
1919 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1920 execDataP (spec, buf, size, 0);
1922 /* restore pointer */
1924 if (!execRule (spec, context, last_rule, start_ptr, ptr))
/* Invoke the f_win_ef callback, if set, with the current position (not at EOF). */
1926 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1929 logf (LOG_LOG, "regx: endf ptr=%d", *ptr);
1931 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* Re-read the active context from the stack after the rule has run. */
1935 context = spec->context_stack[spec->context_stack_top];
/* Begin a new match attempt at the current position. */
1938 last_ptr = start_ptr = *ptr;
/* Advance start_ptr by one character and remember that character as c_prev */
/* (the two calls sit in branches whose conditions are not shown). */
1942 c_prev = f_win_advance (spec, &start_ptr);
1947 c_prev = f_win_advance (spec, &start_ptr);
/* Return to the initial DFA state of the (possibly changed) context. */
1950 state = context->dfa->states[0];
/* Character c lies inside the range [ch[0], ch[1]] of transition t. */
1953 else if (c >= t->ch[0] && c <= t->ch[1])
1954 { /* transition ... */
1955 state = context->dfa->states[t->to];
/* Remember the rule attached to the new state: rule_no in one branch, */
/* rule_nno (when non-zero) in the alternative; the first condition is not shown. */
1960 last_rule = state->rule_no;
1963 else if (state->rule_nno)
1965 last_rule = state->rule_nno;
/*
 * lexRoot: top-level driver for reading one record with `spec`.
 * Selects the context named context_name, runs that context's init and
 * begin action lists, scans with lexNode, releases pending tag data while
 * d1_level is non-zero, runs the end action list and returns
 * spec->d1_stack[0].
 *
 * NOTE(review): the declaration of `ptr`, the use made of `offset`, the
 * context-search loop header and several guards are not part of this
 * listing - confirm against the full source.
 */
1977 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1978 const char *context_name)
/* Start the search at the head of the spec's context list. */
1980 struct lexContext *lt = spec->context;
/* Reset per-run state. */
1983 spec->stop_flag = 0;
1985 spec->context_stack_top = 0;
/* Compare each candidate context's name with the requested one. */
1988 if (!strcmp (lt->name, context_name))
1994 logf (LOG_WARN, "cannot find context %s", context_name);
/* Install the selected context at the top of the context stack. */
1997 spec->context_stack[spec->context_stack_top] = lt;
/* Clear the node slot at the current data1 nesting level. */
1998 spec->d1_stack[spec->d1_level] = NULL;
2003 execAction (spec, lt->initActionList, ptr, &ptr);
2006 execAction (spec, lt->beginActionList, ptr, &ptr);
2007 lexNode (spec, &ptr);
/* Loop while d1_level is non-zero; only the tagDataRelease call of the body is visible. */
2008 while (spec->d1_level)
2010 tagDataRelease (spec);
2013 execAction (spec, lt->endActionList, ptr, &ptr);
2014 return spec->d1_stack[0];
/*
 * grs_destroy: tear down the filter state passed as clientData.
 * clientData is treated as a struct lexSpecs and its `spec` member is
 * handed to lexSpecDestroy.
 * NOTE(review): any release of the lexSpecs object itself is on lines that
 * are not part of this listing.
 */
2017 void grs_destroy(void *clientData)
2019 struct lexSpecs *specs = (struct lexSpecs *) clientData;
2022 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocate a struct lexSpecs with xmalloc.
 * NOTE(review): initialisation of the object and the return statement are
 * not part of this listing; presumably the new object is returned as the
 * opaque pointer later received as clientData - confirm.
 */
2027 void *grs_init(void)
2029 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/*
 * grs_read_regx: read a record described by `p` using the regx filter.
 * The lexSpec kept in p->clientData is reused when its name equals p->type;
 * otherwise it is destroyed, recreated and loaded with readFileSpec. The
 * spec is then pointed at the caller's I/O callbacks and memory handle and
 * lexRoot is run in context "main" from p->offset.
 *
 * NOTE(review): guards and early returns between the visible statements
 * (for example the test on `res`) are not part of this listing.
 */
2034 data1_node *grs_read_regx (struct grs_read_info *p)
2037 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2038 struct lexSpec **curLexSpec = &specs->spec;
2041 logf (LOG_LOG, "grs_read_regx");
/* Rebuild the spec when none exists yet or its name differs from p->type. */
2043 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2046 lexSpecDestroy (curLexSpec);
2047 *curLexSpec = lexSpecCreate (p->type, p->dh);
2048 res = readFileSpec (*curLexSpec);
/* The spec is destroyed again here; presumably only when readFileSpec reported failure - confirm. */
2051 lexSpecDestroy (curLexSpec);
/* Refresh the data handle for this call. */
2055 (*curLexSpec)->dh = p->dh;
/* Reset the input window bounds and bind it to the caller's read/seek/end callbacks and file handle. */
2058 (*curLexSpec)->f_win_start = 0;
2059 (*curLexSpec)->f_win_end = 0;
2060 (*curLexSpec)->f_win_rf = p->readf;
2061 (*curLexSpec)->f_win_sf = p->seekf;
2062 (*curLexSpec)->f_win_fh = p->fh;
2063 (*curLexSpec)->f_win_ef = p->endf;
/* Fixed window size; the unit is not visible from this listing. */
2064 (*curLexSpec)->f_win_size = 500000;
2066 (*curLexSpec)->m = p->mem;
2067 return lexRoot (*curLexSpec, p->offset, "main");
/* Table behind recTypeGrs_regx; its initializer entries are not part of this listing. */
2070 static struct recTypeGrs regx_type = {
/* Exported handle: the address of regx_type. The text had been corrupted to */
/* a registered-trademark sign where "&reg" was decoded as an HTML entity. */
2077 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: read a record described by `p` using the Tcl filter.
 * Works like the regx reader, except that a freshly created spec also gets
 * a Tcl interpreter with the commands "begin", "end", "data" and "unread"
 * registered, each receiving the spec as client data.
 *
 * NOTE(review): guards and early returns between the visible statements,
 * and the tail of the "unread" registration call, are not part of this
 * listing.
 */
2080 data1_node *grs_read_tcl (struct grs_read_info *p)
2083 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2084 struct lexSpec **curLexSpec = &specs->spec;
2087 logf (LOG_LOG, "grs_read_tcl");
/* Rebuild the spec when none exists yet or its name differs from p->type. */
2089 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2091 Tcl_Interp *tcl_interp;
2093 lexSpecDestroy (curLexSpec);
2094 *curLexSpec = lexSpecCreate (p->type, p->dh);
/* Create the interpreter and keep it on the spec. */
2095 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
/* Register the filter commands with the interpreter. */
2096 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2097 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2098 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
2099 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2101 res = readFileSpec (*curLexSpec);
/* The spec is destroyed again here; presumably only when readFileSpec reported failure - confirm. */
2104 lexSpecDestroy (curLexSpec);
/* Refresh the data handle for this call. */
2108 (*curLexSpec)->dh = p->dh;
/* Reset the input window bounds and bind it to the caller's read/seek/end callbacks and file handle. */
2111 (*curLexSpec)->f_win_start = 0;
2112 (*curLexSpec)->f_win_end = 0;
2113 (*curLexSpec)->f_win_rf = p->readf;
2114 (*curLexSpec)->f_win_sf = p->seekf;
2115 (*curLexSpec)->f_win_fh = p->fh;
2116 (*curLexSpec)->f_win_ef = p->endf;
/* Fixed window size; the unit is not visible from this listing. */
2117 (*curLexSpec)->f_win_size = 500000;
2119 (*curLexSpec)->m = p->mem;
2120 return lexRoot (*curLexSpec, p->offset, "main");
/* Table behind recTypeGrs_tcl; its initializer entries are not part of this listing. */
2123 static struct recTypeGrs tcl_type = {
/* Exported handle: the address of tcl_type. */
2130 RecTypeGrs recTypeGrs_tcl = &tcl_type;