2 * Copyright (C) 1994, Index Data I/S
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.1 1995-01-24 16:02:52 adam
8 * New private header file in dfa module (dfap.h).
9 * Module no longer uses yacc to parse regular expressions.
31 struct Tnode *p[2]; /* CAT,OR,STAR,PLUS (left,right) */
32 short ch[2]; /* if ch[0] >= 0 then this Tnode represents */
33 /* a character in range [ch[0]..ch[1]] */
34 /* otherwise ch[0] represents */
35 /* accepting no (-acceptno) */
37 unsigned pos : 15; /* CAT/OR/STAR/EPSILON or non-neg. position */
38 unsigned nullable : 1;
39 Set firstpos; /* first positions */
40 Set lastpos; /* last positions */
43 struct Tblock { /* Tnode bucket (block) */
44 struct Tblock *next; /* pointer to next bucket */
45 struct Tnode *tarray; /* array of nodes in bucket */
50 int debug_dfa_trav = 0;
51 int debug_dfa_tran = 0;
52 int debug_dfa_followpos = 0;
56 static struct DFA_parse *parse_info = NULL;
58 static int inside_string;
59 static const unsigned char *expr_ptr;
60 static unsigned short *ctrl_chars;
61 static struct Tnode **posar;
64 static Set *followpos;
66 static struct Tnode *mk_Tnode (void);
67 static struct Tnode *mk_Tnode_cset (BSet charset);
68 static void term_Tnode (void);
74 mk_dfa_tran (struct DFA_states *dfas),
75 add_follow (Set lastpos, Set firstpos),
76 dfa_trav (struct Tnode *n),
77 init_followpos (void),
78 mk_dfa_tran (struct DFA_states *dfas),
79 pr_tran (struct DFA_states *dfas),
80 pr_verbose (struct DFA_states *dfas),
90 *str_char (unsigned c);
106 static int lookahead;
107 static unsigned look_ch;
108 static BSet look_chars;
110 static struct Tnode *expr_1 (void),
115 static struct Tnode *expr_1 (void)
117 struct Tnode *t1, *t2, *tn;
120 while (lookahead == L_ALT)
134 static struct Tnode *expr_2 (void)
136 struct Tnode *t1, *t2, *tn;
138 while (lookahead == L_WILD ||
139 lookahead == L_ANYZ ||
140 lookahead == L_ANY ||
142 lookahead == L_CHAR ||
143 lookahead == L_CHARS)
156 static struct Tnode *expr_3 (void)
158 struct Tnode *t1, *tn;
161 if (lookahead == L_CLOS0)
169 else if (lookahead == L_CLOS1)
177 else if (lookahead == L_QUEST)
183 tn->u.p[1] = mk_Tnode();
184 tn->u.p[1]->pos = EPSILON;
190 static struct Tnode *expr_4 (void)
198 if (lookahead == L_RP)
203 t1->pos = ++(parse_info->position);
204 t1->u.ch[1] = t1->u.ch[0] = look_ch;
208 t1 = mk_Tnode_cset (look_chars);
212 t1 = mk_Tnode_cset (parse_info->anyset);
218 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
219 t1->u.p[1] = mk_Tnode();
220 t1->u.p[1]->pos = EPSILON;
226 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
236 static int parse (dfap, s, cc)
237 struct DFA_parse *dfap;
239 const unsigned short *cc;
242 struct Tnode *t1, *t2;
244 for (i=0; cc[i]; i +=2)
246 ctrl_chars = imalloc ((i+2) * sizeof(*ctrl_chars));
247 for (i=0; (ctrl_chars[i] = cc[i]); i ++)
251 ctrl_chars[i] = L_LP;
254 ctrl_chars[i] = L_RP;
257 ctrl_chars[i] = L_ANY;
260 ctrl_chars[i] = L_ALT;
263 ctrl_chars[i] = L_QUEST;
266 ctrl_chars[i] = L_CLOS1;
269 ctrl_chars[i] = L_CLOS0;
272 ctrl_chars[i] = L_END;
275 ctrl_chars[i] = L_START;
278 ctrl_chars[i] = L_ANY;
281 ctrl_chars[i] = L_ANYZ;
284 ctrl_chars[i] = L_WILD;
290 expr_ptr = (unsigned char *) *s;
296 t2->pos = ++parse_info->position;
297 t2->u.ch[0] = -(++parse_info->rule);
299 parse_info->top = mk_Tnode();
300 parse_info->top->pos = CAT;
301 parse_info->top->u.p[0] = t1;
302 parse_info->top->u.p[1] = t2;
304 *s = (char *) expr_ptr;
309 static int nextchar (int *esc)
312 if (*expr_ptr == '\0' || isspace(*expr_ptr))
314 else if (*expr_ptr != '\\')
341 static int read_charset (void)
343 int i, ch0, ch1, esc0, esc1, cc = 0;
344 look_chars = mk_BSet (&parse_info->charset);
345 res_BSet (parse_info->charset, look_chars);
347 ch0 = nextchar (&esc0);
348 if (!esc0 && ch0 == '^')
351 ch0 = nextchar (&esc0);
355 if (!esc0 && ch0 == ']')
357 add_BSet (parse_info->charset, look_chars, ch0);
358 ch1 = nextchar (&esc1);
359 if (!esc1 && ch1 == '-')
361 if ((ch1 = nextchar (&esc1)) == 0)
363 if (!esc1 && ch1 == ']')
365 add_BSet (parse_info->charset, look_chars, '-');
368 for (i=ch0; ++i<=ch1;)
369 add_BSet (parse_info->charset, look_chars, i);
370 ch0 = nextchar (&esc0);
379 com_BSet (parse_info->charset, look_chars);
383 unsigned short dfa_thompson_chars[] =
397 unsigned short dfa_ccl_chars[] =
405 static int lex_sub(void)
408 const unsigned short *cc;
409 while ((look_ch = nextchar (&esc)) != 0)
414 inside_string = !inside_string;
416 else if (esc || inside_string)
418 else if (look_ch == '[')
419 return read_charset();
420 else if (look_ch == ' ')
424 for (cc = ctrl_chars; *cc; cc += 2)
432 static void lex (void)
434 lookahead = lex_sub ();
437 static const char *str_char (unsigned c)
454 sprintf (s+1, "x%02x", c);
476 static void out_char (int c)
478 printf ("%s", str_char (c));
481 static void term_Tnode (void)
483 struct Tblock *t, *t1;
484 for (t = parse_info->start; (t1 = t);)
492 static struct Tnode *mk_Tnode (void)
496 if (parse_info->use_Tnode == parse_info->max_Tnode)
498 tnew = (struct Tblock *) imalloc (sizeof(struct Tblock));
499 tnew->tarray = (struct Tnode *) imalloc (TADD*sizeof(struct Tnode));
502 if (parse_info->use_Tnode == 0)
503 parse_info->start = tnew;
505 parse_info->end->next = tnew;
506 parse_info->end = tnew;
508 parse_info->max_Tnode += TADD;
510 return parse_info->end->tarray+(parse_info->use_Tnode++ % TADD);
513 struct Tnode *mk_Tnode_cset (BSet charset)
515 struct Tnode *tn1, *tn0 = mk_Tnode();
516 int ch1, ch0 = trav_BSet (parse_info->charset, charset, 0);
522 tn0->pos = ++parse_info->position;
523 ch1 = travi_BSet (parse_info->charset, charset, ch0+1) ;
525 tn0->u.ch[1] = parse_info->charset->size;
528 tn0->u.ch[1] = ch1-1;
529 while ((ch0=trav_BSet (parse_info->charset, charset, ch1)) != -1)
535 tn1 = tn0->u.p[1] = mk_Tnode();
537 tn1->pos = ++(parse_info->position);
538 if ((ch1 = travi_BSet (parse_info->charset, charset,
541 tn1->u.ch[1] = parse_info->charset->size;
544 tn1->u.ch[1] = ch1-1;
551 static void del_followpos (void)
556 static void init_pos (void)
558 posar = (struct Tnode **) imalloc (sizeof(struct Tnode*)
559 * (1+parse_info->position));
562 static void del_pos (void)
567 static void add_follow (Set lastpos, Set firstpos)
571 followpos[lastpos->value] =
572 union_Set (poset, followpos[lastpos->value], firstpos);
573 lastpos = lastpos->next;
577 static void dfa_trav (struct Tnode *n)
582 dfa_trav (n->u.p[0]);
583 dfa_trav (n->u.p[1]);
584 n->nullable = n->u.p[0]->nullable & n->u.p[1]->nullable;
585 n->firstpos = mk_Set (poset);
586 n->firstpos = union_Set (poset, n->firstpos, n->u.p[0]->firstpos);
587 if (n->u.p[0]->nullable)
588 n->firstpos = union_Set (poset, n->firstpos, n->u.p[1]->firstpos);
589 n->lastpos = mk_Set (poset);
590 n->lastpos = union_Set (poset, n->lastpos, n->u.p[1]->lastpos);
591 if (n->u.p[1]->nullable)
592 n->lastpos = union_Set (poset, n->lastpos, n->u.p[0]->lastpos);
594 add_follow (n->u.p[0]->lastpos, n->u.p[1]->firstpos);
596 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
597 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
598 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
599 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
604 dfa_trav (n->u.p[0]);
605 dfa_trav (n->u.p[1]);
606 n->nullable = n->u.p[0]->nullable | n->u.p[1]->nullable;
608 n->firstpos = merge_Set (poset, n->u.p[0]->firstpos,
609 n->u.p[1]->firstpos);
610 n->lastpos = merge_Set (poset, n->u.p[0]->lastpos,
612 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
613 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
614 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
615 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
620 dfa_trav (n->u.p[0]);
621 n->nullable = n->u.p[0]->nullable;
622 n->firstpos = n->u.p[0]->firstpos;
623 n->lastpos = n->u.p[0]->lastpos;
624 add_follow (n->lastpos, n->firstpos);
629 dfa_trav (n->u.p[0]);
631 n->firstpos = n->u.p[0]->firstpos;
632 n->lastpos = n->u.p[0]->lastpos;
633 add_follow (n->lastpos, n->firstpos);
639 n->lastpos = mk_Set (poset);
640 n->firstpos = mk_Set (poset);
647 n->firstpos = mk_Set (poset);
648 n->firstpos = add_Set (poset, n->firstpos, n->pos);
649 n->lastpos = mk_Set (poset);
650 n->lastpos = add_Set (poset, n->lastpos, n->pos);
653 printf ("#%d", -n->u.ch[0]);
654 else if (n->u.ch[1] > n->u.ch[0])
657 out_char (n->u.ch[0]);
658 if (n->u.ch[1] > n->u.ch[0]+1)
660 out_char (n->u.ch[1]);
664 out_char (n->u.ch[0]);
668 printf ("\n nullable : %c\n", n->nullable ? '1' : '0');
669 printf (" firstpos :");
670 pr_Set (poset, n->firstpos);
671 printf (" lastpos :");
672 pr_Set (poset, n->lastpos);
676 static void init_followpos (void)
680 followpos = fa = (Set *) imalloc (sizeof(Set) * (1+parse_info->position));
681 for (i = parse_info->position+1; --i >= 0; fa++)
682 *fa = mk_Set (poset);
685 static void mk_dfa_tran (struct DFA_states *dfas)
692 struct DFA_state *dfa_from, *dfa_to;
695 max_char = parse_info->charset->size;
696 pos = (short *) imalloc (sizeof(*pos) * (parse_info->position+1));
698 tran_set = cp_Set (poset, parse_info->root->firstpos);
699 i = add_DFA_state (dfas, &tran_set, &dfa_from);
701 dfa_from->rule_no = 0;
702 while ((dfa_from = get_DFA_state (dfas)))
706 for (tran_set = dfa_from->set; tran_set; tran_set = tran_set->next)
707 if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char)
708 *pos_i++ = tran_set->value;
709 else if (c < 0 && (i == 0 || c > i))
712 dfa_from->rule_no = -i;
714 for (char_1 = 0; char_1 <= max_char; char_1++)
717 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
718 if (posar[i]->u.ch[1] >= char_1)
719 if ((c=posar[i]->u.ch[0]) < char_0)
726 if (char_0 > max_char)
728 tran_set = mk_Set (poset);
729 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
730 if ((c=posar[i]->u.ch[1]) >= char_0)
731 if (posar[i]->u.ch[0] <= char_0)
735 tran_set = union_Set (poset, tran_set, followpos[i]);
737 else if (c <= char_1)
741 add_DFA_state (dfas, &tran_set, &dfa_to);
742 add_DFA_tran (dfas, dfa_from, char_0, char_1, dfa_to->no);
747 sort_DFA_states (dfas);
750 static void pr_tran (struct DFA_states *dfas)
753 struct DFA_tran *tran;
754 int prev_no, i, c, no;
756 for (no=0; no < dfas->no; ++no)
758 s = dfas->sortarray[no];
759 assert (s->no == no);
760 printf ("DFA state %d", no);
762 printf ("#%d", s->rule_no);
764 pr_Set (poset, s->set);
766 for (i=s->tran_no, tran=s->trans; --i >= 0; tran++)
768 if (prev_no != tran->to)
772 printf (" goto %d on [", tran->to);
775 for (c = tran->ch[0]; c <= tran->ch[1]; c++)
776 printf ("%s", str_char(c));
784 static void pr_verbose (struct DFA_states *dfas)
788 printf ("%d/%d tree nodes used, %d bytes each\n",
789 parse_info->use_Tnode, parse_info->max_Tnode, sizeof (struct Tnode));
790 k = inf_BSetHandle (parse_info->charset, &i, &j);
791 printf ("%ld/%ld character sets, %d bytes each\n",
792 i/k, j/k, k*sizeof(BSetWord));
793 k = inf_SetType (poset, &i, &j);
794 printf ("%ld/%ld poset items, %d bytes each\n", i, j, k);
795 printf ("%d DFA states\n", dfas->no);
798 static void pr_followpos (void)
801 printf ("\nfollowsets:\n");
802 for (i=1; i <= parse_info->position; i++)
805 pr_Set (poset, followpos[i]);
808 if (posar[i]->u.ch[0] < 0)
809 printf ("#%d", -posar[i]->u.ch[0]);
810 else if (posar[i]->u.ch[1] > posar[i]->u.ch[0])
813 out_char (posar[i]->u.ch[0]);
814 if (posar[i]->u.ch[1] > posar[i]->u.ch[0]+1)
816 out_char (posar[i]->u.ch[1]);
820 out_char (posar[i]->u.ch[0]);
826 static struct DFA_parse *dfa_parse_init (void)
828 parse_info = (struct DFA_parse *) imalloc (sizeof (struct DFA_parse));
830 parse_info->charset = mk_BSetHandle (255, 20);
831 parse_info->position = 0;
832 parse_info->rule = 0;
833 parse_info->root = NULL;
835 parse_info->anyset = mk_BSet (&parse_info->charset);
836 res_BSet (parse_info->charset, parse_info->anyset);
837 add_BSet (parse_info->charset, parse_info->anyset, '\n');
838 com_BSet (parse_info->charset, parse_info->anyset);
839 parse_info->use_Tnode = parse_info->max_Tnode = 0;
843 static void rm_dfa_parse (struct DFA_parse **dfap)
848 rm_BSetHandle (&parse_info->charset);
853 static struct DFA_states *mk_dfas (struct DFA_parse *dfap, int poset_chunk)
855 struct DFA_states *dfas;
857 assert (poset_chunk > 10);
861 poset = mk_SetType (poset_chunk);
864 dfa_trav (parse_info->root);
866 if (debug_dfa_followpos)
868 init_DFA_states (&dfas, poset, 401);
880 struct DFA *dfa_init (void)
884 dfa = imalloc (sizeof(*dfa));
885 dfa->parse_info = dfa_parse_init ();
886 dfa->state_info = NULL;
891 int dfa_parse (struct DFA *dfa, char **pattern)
896 i = parse (dfa->parse_info, pattern, dfa_thompson_chars);
899 if ( !dfa->parse_info->root )
900 dfa->parse_info->root = dfa->parse_info->top;
907 n->u.p[0] = dfa->parse_info->root;
908 n->u.p[1] = dfa->parse_info->top;
909 dfa->parse_info->root = n;
914 void dfa_mkstate (struct DFA *dfa)
918 dfa->state_info = mk_dfas (dfa->parse_info, 200);
919 dfa->no_states = dfa->state_info->no;
920 dfa->states = dfa->state_info->sortarray;
921 rm_dfa_parse (&dfa->parse_info);
924 void dfa_delete (struct DFA **dfap)
928 if ((*dfap)->parse_info)
929 rm_dfa_parse (&(*dfap)->parse_info);
930 rm_DFA_states (&(*dfap)->state_info);