ccl2rpn: Conversion to regexp-1 terms (trunc=102).
author Adam Dickmeiss <adam@indexdata.dk>
Thu, 25 Aug 2011 12:06:40 +0000 (14:06 +0200)
committer Adam Dickmeiss <adam@indexdata.dk>
Thu, 25 Aug 2011 12:08:29 +0000 (14:08 +0200)
For mode t=x, the CCL parser will map both # and ? to their regular
expression equivalents (. and .*).

doc/tools.xml
include/yaz/ccl.h
src/cclfind.c
src/cclqfile.c
src/ccltoken.c
test/test_ccl.c

index 7416b1d..cc3e3c5 100644 (file)
            set to both left&amp;right.
           </entry>
          </row>
+
+         <row><entry><literal>t=x</literal></entry><entry>
+           Allows masking anywhere in a term, thus fully supporting
+           # (mask one character) and ? (zero or more of any character).
+           If masking is used, truncation is set to 102 (regexp-1 in term)
+           and the term is converted accordingly to a regular expression.
+          </entry>
+         </row>
+
         </tbody>
        </tgroup>
        </table>
index fa84877..d3e3032 100644 (file)
@@ -352,6 +352,7 @@ int ccl_stop_words_info(ccl_stop_words_t csw, int idx,
 #define CCL_BIB1_TRU_CAN_RIGHT (-2)
 #define CCL_BIB1_TRU_CAN_BOTH  (-3)
 #define CCL_BIB1_TRU_CAN_NONE  (-4)
+#define CCL_BIB1_TRU_CAN_REGEX (-5)
 
 
 
index d5518df..f242169 100644 (file)
@@ -17,6 +17,7 @@
 
 #include <stdlib.h>
 #include <string.h>
+#include <assert.h>
 
 #include "cclp.h"
 
@@ -258,6 +259,7 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp,
         int len = 0;
         int left_trunc = 0;
         int right_trunc = 0;
+        int regex_trunc = 0;
         size_t max = 200;
         if (and_list || or_list || !multi)
             max = 1;
@@ -356,26 +358,23 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp,
                 ccl_add_attr_numeric(p, attset, CCL_BIB1_STR, 1);
         }
 
+        if (qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_REGEX,
+                          &attset))
+        {
+            regex_trunc = 1; /* regex trunc (102) allowed */
+        }
+
         /* make the RPN token */
-        p->u.t.term = (char *)xmalloc(len);
+        p->u.t.term = (char *)xmalloc(len * 2 + 2);
         ccl_assert(p->u.t.term);
         p->u.t.term[0] = '\0';
         for (i = 0; i<no; i++)
         {
             const char *src_str = cclp->look_token->name;
             size_t src_len = cclp->look_token->len;
+            int j;
+            int quote_mode = 0;
 
-            if (i == 0 && src_len > 0 && *src_str == '?')
-            {
-                src_len--;
-                src_str++;
-                left_trunc = 1;
-            }
-            if (i == no - 1 && src_len > 0 && src_str[src_len-1] == '?')
-            {
-                src_len--;
-                right_trunc = 1;
-            }
             if (p->u.t.term[0] && cclp->look_token->ws_prefix_len)
             {
                 size_t len = strlen(p->u.t.term);
@@ -383,7 +382,61 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp,
                        cclp->look_token->ws_prefix_len);
                 p->u.t.term[len + cclp->look_token->ws_prefix_len] = '\0';
             }
-            strxcat(p->u.t.term, src_str, src_len);
+            for (j = 0; j < src_len; j++)
+            {
+                if (j > 0 && src_str[j-1] == '\\')
+                {
+                    if (regex_trunc && strchr("()[]?*.", src_str[j]))
+                    {
+                        regex_trunc = 2;
+                        strcat(p->u.t.term, "\\\\");
+                    }
+                    strxcat(p->u.t.term, src_str + j, 1);
+                }
+                else if (src_str[j] == '"')
+                    quote_mode = !quote_mode;
+                else if (!quote_mode && src_str[j] == '?')
+                {
+                    if (regex_trunc)
+                    {
+                        strcat(p->u.t.term, ".*");
+                        regex_trunc = 2; /* regex trunc is really needed */
+                    }
+                    else if (i == 0 && j == 0)
+                        left_trunc = 1;
+                    else if (i == no - 1 && j == src_len - 1)
+                        right_trunc = 1;
+                    else
+                    {
+                        cclp->error_code = CCL_ERR_TRUNC_NOT_BOTH;
+                        ccl_rpn_delete(p);
+                        return NULL;
+                    }
+                }
+                else if (!quote_mode && src_str[j] == '#')
+                {
+                    if (regex_trunc)
+                    {
+                        strcat(p->u.t.term, ".");
+                        regex_trunc = 2; /* regex trunc is really needed */
+                    }
+                    else
+                    {
+                        cclp->error_code = CCL_ERR_TRUNC_NOT_BOTH;
+                        ccl_rpn_delete(p);
+                        return NULL;
+                    }
+                }
+                else if (src_str[j] != '\\')
+                {
+                    if (regex_trunc && strchr("()[]?*.", src_str[j]))
+                    {
+                        regex_trunc = 2;
+                        strcat(p->u.t.term, "\\\\");
+                    }
+                    strxcat(p->u.t.term, src_str + j, 1);                    
+                }
+            }
             ADVANCE;
         }
 
@@ -440,6 +493,10 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp,
             }
             ccl_add_attr_numeric(p, attset, CCL_BIB1_TRU, 2);
         }
+        else if (regex_trunc == 2)
+        {
+            ccl_add_attr_numeric(p, attset, CCL_BIB1_TRU, 102);
+        }
         else
         {
             if (qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_NONE,
index c49df6c..16e658f 100644 (file)
@@ -158,6 +158,8 @@ int ccl_qual_field2(CCL_bibset bibset, const char *cp, const char *qual_name,
                         value = CCL_BIB1_TRU_CAN_BOTH;
                     else if (!ccl_stricmp (value_str, "n"))
                         value = CCL_BIB1_TRU_CAN_NONE;
+                    else if (!ccl_stricmp (value_str, "x"))
+                        value = CCL_BIB1_TRU_CAN_REGEX;
                     break;                
                 case 'c':
                 case 'C':
index 6c74226..5211fe8 100644 (file)
@@ -126,53 +126,60 @@ struct ccl_token *ccl_parser_tokenize(CCL_parser cclp, const char *command)
         default:
             --cp;
             --last->len;
-            if (*cp == '"')
+            
+            last->kind = CCL_TOK_TERM;
+            last->name = (const char *) cp;
+            while (*cp && !strchr("(),%!><= \t\n\r", *cp))
             {
-                cp++;
-                last->kind = CCL_TOK_TERM;
-                last->name = (const char *) cp;
-                while (*cp && *cp != '"')
+                if (*cp == '\\' && cp[1])
                 {
                     cp++;
                     ++ last->len;
                 }
-                if (*cp)
-                    cp++;
-            }
-            else
-            {
-                last->kind = CCL_TOK_TERM;
-                last->name = (const char *) cp;
-                while (*cp && !strchr("(),%!><= \t\n\r", *cp))
+                else if (*cp == '"')
                 {
-                    ++ last->len;
-                    cp++;
-                }
-                aliases = ccl_qual_search_special(cclp->bibset, "and");
-                if (!aliases)
-                    aliases = cclp->ccl_token_and;
-                if (token_cmp(cclp, aliases, last))
-                    last->kind = CCL_TOK_AND;
-                
-                aliases = ccl_qual_search_special(cclp->bibset, "or");
-                if (!aliases)
-                    aliases = cclp->ccl_token_or;
-                if (token_cmp(cclp, aliases, last))
-                    last->kind = CCL_TOK_OR;
-                
-                aliases = ccl_qual_search_special(cclp->bibset, "not");
-                if (!aliases)
-                    aliases = cclp->ccl_token_not;
-                if (token_cmp(cclp, aliases, last))
-                    last->kind = CCL_TOK_NOT;
-                
-                aliases = ccl_qual_search_special(cclp->bibset, "set");
-                if (!aliases)
-                    aliases = cclp->ccl_token_set;
-                
-                if (token_cmp(cclp, aliases, last))
-                    last->kind = CCL_TOK_SET;
+                    while (*cp)
+                    {
+                        cp++;
+                        ++ last->len;
+                        if (*cp == '\\' && cp[1])
+                        {
+                            cp++;
+                            ++ last->len;
+                        }
+                        else if (*cp == '"')
+                            break;
+                    }
+                } 
+                if (!*cp)
+                    break;
+                cp++;
+                ++ last->len;
             }
+            aliases = ccl_qual_search_special(cclp->bibset, "and");
+            if (!aliases)
+                aliases = cclp->ccl_token_and;
+            if (token_cmp(cclp, aliases, last))
+                last->kind = CCL_TOK_AND;
+            
+            aliases = ccl_qual_search_special(cclp->bibset, "or");
+            if (!aliases)
+                aliases = cclp->ccl_token_or;
+            if (token_cmp(cclp, aliases, last))
+                last->kind = CCL_TOK_OR;
+            
+            aliases = ccl_qual_search_special(cclp->bibset, "not");
+            if (!aliases)
+                aliases = cclp->ccl_token_not;
+            if (token_cmp(cclp, aliases, last))
+                last->kind = CCL_TOK_NOT;
+            
+            aliases = ccl_qual_search_special(cclp->bibset, "set");
+            if (!aliases)
+                aliases = cclp->ccl_token_set;
+            
+            if (token_cmp(cclp, aliases, last))
+                last->kind = CCL_TOK_SET;
         }
     }
     return first;
index ae413ea..72370e3 100644 (file)
@@ -79,7 +79,7 @@ void tst1(int pass)
     case 0:
         ccl_qual_fitem(bibset, "u=4    s=pw t=l,r", "ti");
         ccl_qual_fitem(bibset, "1=1016 s=al,pw t=r",    "term");
-        ccl_qual_fitem(bibset, "1=/my/title",         "dc.title");
+        ccl_qual_fitem(bibset, "1=/my/title t=x",       "dc.title");
         ccl_qual_fitem(bibset, "r=r",         "date");
         ccl_qual_fitem(bibset, "r=o",         "x");
         ccl_qual_fitem(bibset, "dc.title", "title");
@@ -92,7 +92,7 @@ void tst1(int pass)
         strcpy(tstline, "term 1=1016 s=al,pw t=r  # default term");
         ccl_qual_line(bibset, tstline);
 
-        strcpy(tstline, "dc.title 1=/my/title");
+        strcpy(tstline, "dc.title 1=/my/title t=x");
         ccl_qual_line(bibset, tstline);
 
         strcpy(tstline, "date r=r # ordered relation");
@@ -111,7 +111,7 @@ void tst1(int pass)
         ccl_qual_buf(bibset, "ti u=4    s=pw t=l,r\n"
                      "term 1=1016 s=al,pw t=r\r\n"
                      "\n"
-                     "dc.title 1=/my/title\n"
+                     "dc.title 1=/my/title t=x\n"
                      "date r=r\n" 
                      "x r=o\n"
                      "title dc.title\n"
@@ -139,6 +139,7 @@ void tst1(int pass)
                 " </qual>\n"
                 " <qual name=\"dc.title\">\n"
                 "   <attr type=\"1\" value=\"/my/title\"/>\n"
+                "   <attr type=\"t\" value=\"x\"/>\n"
                 " </qual>\n"
                 " <qual name=\"date\">\n"
                 "   <attr type=\"r\" value=\"r\"/>\n"
@@ -250,14 +251,33 @@ void tst1(int pass)
     YAZ_CHECK(tst_ccl_query(bibset, "title=a", 
                             "@attr 1=/my/title a "));
 
+    YAZ_CHECK(tst_ccl_query(bibset, "title=a?b#\"c?\"", 
+                            "@attr 5=102 @attr 1=/my/title a.*b.c\\\\? "));
+
+    YAZ_CHECK(tst_ccl_query(bibset, "title=\\(", 
+                            "@attr 5=102 @attr 1=/my/title \\\\( "));
+
+    YAZ_CHECK(tst_ccl_query(bibset, "title=.", 
+                            "@attr 5=102 @attr 1=/my/title \\\\. "));
+
+    YAZ_CHECK(tst_ccl_query(bibset, "title=\\.", 
+                            "@attr 5=102 @attr 1=/my/title \\\\. "));
+
+    YAZ_CHECK(tst_ccl_query(bibset, "title=\".\"", 
+                            "@attr 5=102 @attr 1=/my/title \\\\. "));
+
     YAZ_CHECK(tst_ccl_query(bibset, "comb=a", 
                             "@or @attr 4=2 @attr 1=1016 a "
                             "@attr 1=/my/title a "));
 
     YAZ_CHECK(tst_ccl_query(bibset, "a? b?", 
-                            /* correct */
                             "@and @attr 5=1 @attr 4=2 @attr 1=1016 a "
                             "@attr 5=1 @attr 4=2 @attr 1=1016 b "));
+
+    YAZ_CHECK(tst_ccl_query(bibset, "\"a\"? \"b?\"", 
+                            "@and @attr 5=1 @attr 4=2 @attr 1=1016 a "
+                            "@attr 4=2 @attr 1=1016 b? "));
+
     ccl_qual_rm(&bibset);
 }