From c508282c3e52e145f998d0bb85c0ea6b36fe956c Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Mon, 29 Aug 2011 15:57:05 +0200 Subject: [PATCH] CQL/CCL/PQF conversion fixes. CCL support for Z39.58 trunc Lots of fixes for the conversions from CQL to CCL; CCL to RPN; RPN to CQL. yaz_encode_pqf_term's term parameter properly escaped into PQF term. CCL truncation configuration t=z enables Z39.58 truncation which is in fact CCL truncation, but there's more to it than one might think anyway. The RPN to CQL conversion handles Z39.58 truncation as well (5=104). --- doc/tools.xml | 9 ++++ include/yaz/ccl.h | 2 +- src/cclfind.c | 37 ++++++++++++-- src/cclqfile.c | 2 + src/querytowrbuf.c | 13 +++-- src/rpn2cql.c | 36 ++++++++------ test/test_ccl.c | 138 ++++++++++++++++++++++++++++++++++----------------- test/test_rpn2cql.c | 49 ++++++++++++++++++ 8 files changed, 218 insertions(+), 68 deletions(-) diff --git a/doc/tools.xml b/doc/tools.xml index cc3e3c5..80a9756 100644 --- a/doc/tools.xml +++ b/doc/tools.xml @@ -740,6 +740,15 @@ + t=z + Allows masking anywhere in a term, thus fully supporting + # (mask one character) and ? (zero or more of any). + If masking is used, truncation is set to 104 (Z39.58 in term) + and the term is converted accordingly to Z39.58 masking term - + actually the same truncation as CCL itself. 
+ + + diff --git a/include/yaz/ccl.h b/include/yaz/ccl.h index d3e3032..f65674a 100644 --- a/include/yaz/ccl.h +++ b/include/yaz/ccl.h @@ -353,7 +353,7 @@ int ccl_stop_words_info(ccl_stop_words_t csw, int idx, #define CCL_BIB1_TRU_CAN_BOTH (-3) #define CCL_BIB1_TRU_CAN_NONE (-4) #define CCL_BIB1_TRU_CAN_REGEX (-5) - +#define CCL_BIB1_TRU_CAN_Z3958 (-6) YAZ_END_CDECL diff --git a/src/cclfind.c b/src/cclfind.c index 6fd3144..7932ecb 100644 --- a/src/cclfind.c +++ b/src/cclfind.c @@ -213,7 +213,8 @@ void ccl_add_attr_string(struct ccl_rpn_node *p, const char *set, } -#define REGEX_CHARS "^[]{}()|.*+?!\"$" +#define REGEX_CHARS "^[]{}()|.*+?!$" +#define CCL_CHARS "#?\\" /** * search_term: Parse CCL search term. * cclp: CCL Parser @@ -261,6 +262,7 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, int left_trunc = 0; int right_trunc = 0; int regex_trunc = 0; + int z3958_trunc = 0; size_t max = 200; if (and_list || or_list || !multi) max = 1; @@ -364,6 +366,11 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, { regex_trunc = 1; /* regex trunc (102) allowed */ } + else if (qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_Z3958, + &attset)) + { + z3958_trunc = 1; /* Z39.58 trunc (CCL) trunc allowed */ + } /* make the RPN token */ p->u.t.term = (char *)xmalloc(len * 2 + 2); @@ -390,10 +397,13 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, if (regex_trunc && strchr(REGEX_CHARS "\\", src_str[j])) { regex_trunc = 2; - strcat(p->u.t.term, "\\\\"); + strcat(p->u.t.term, "\\"); } - if (src_str[j] == '\\') + else if (z3958_trunc && strchr(CCL_CHARS "\\", src_str[j])) + { + z3958_trunc = 2; strcat(p->u.t.term, "\\"); + } strxcat(p->u.t.term, src_str + j, 1); } else if (src_str[j] == '"') @@ -405,6 +415,11 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, strcat(p->u.t.term, ".*"); regex_trunc = 2; /* regex trunc is really needed */ } + else if (z3958_trunc) + { + strcat(p->u.t.term, "?"); + z3958_trunc = 2; + } else if (i == 0 && j 
== 0) left_trunc = 1; else if (i == no - 1 && j == src_len - 1) @@ -423,6 +438,11 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, strcat(p->u.t.term, "."); regex_trunc = 2; /* regex trunc is really needed */ } + else if (z3958_trunc) + { + strcat(p->u.t.term, "#"); + z3958_trunc = 2; + } else { cclp->error_code = CCL_ERR_TRUNC_NOT_BOTH; @@ -435,7 +455,12 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, if (regex_trunc && strchr(REGEX_CHARS, src_str[j])) { regex_trunc = 2; - strcat(p->u.t.term, "\\\\"); + strcat(p->u.t.term, "\\"); + } + else if (z3958_trunc && strchr(CCL_CHARS, src_str[j])) + { + z3958_trunc = 2; + strcat(p->u.t.term, "\\"); } strxcat(p->u.t.term, src_str + j, 1); } @@ -500,6 +525,10 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp, { ccl_add_attr_numeric(p, attset, CCL_BIB1_TRU, 102); } + else if (z3958_trunc == 2) + { + ccl_add_attr_numeric(p, attset, CCL_BIB1_TRU, 104); + } else { if (qual_val_type(qa, CCL_BIB1_TRU, CCL_BIB1_TRU_CAN_NONE, diff --git a/src/cclqfile.c b/src/cclqfile.c index 16e658f..d2ca1c9 100644 --- a/src/cclqfile.c +++ b/src/cclqfile.c @@ -160,6 +160,8 @@ int ccl_qual_field2(CCL_bibset bibset, const char *cp, const char *qual_name, value = CCL_BIB1_TRU_CAN_NONE; else if (!ccl_stricmp (value_str, "x")) value = CCL_BIB1_TRU_CAN_REGEX; + else if (!ccl_stricmp (value_str, "z")) + value = CCL_BIB1_TRU_CAN_Z3958; break; case 'c': case 'C': diff --git a/src/querytowrbuf.c b/src/querytowrbuf.c index d52e37f..8ca78a2 100644 --- a/src/querytowrbuf.c +++ b/src/querytowrbuf.c @@ -22,14 +22,21 @@ void yaz_encode_pqf_term(WRBUF b, const char *term, int len) for (i = 0; i < len; i++) if (strchr(" \"{", term[i])) break; - if (i == len && i) - wrbuf_write(b, term, len); + if (len > 0 && i == len) + { + for (i = 0; iterm; const char *sterm = 0; size_t lterm = 0; + Odr_int trunc = lookup_truncation(apt->attributes); + size_t i; + int must_quote = 0; wrbuf_rewind(w); ret = rpn2cql_attr(ct, apt->attributes, 
w); - switch(term->which) + switch (term->which) { case Z_Term_general: lterm = term->u.general->len; @@ -210,25 +213,17 @@ static int rpn2cql_simple(cql_transform_t ct, lterm = strlen(sterm); break; default: - ret = -1; cql_transform_set_error(ct, YAZ_BIB1_TERM_TYPE_UNSUPP, 0); + return -1; } - if (term) + if (trunc <= 3 || trunc == 100 || trunc == 102 || trunc == 104) { - size_t i; - int must_quote = 0; - Odr_int trunc = lookup_truncation(apt->attributes); - - if (trunc > 3 && trunc != 100 && trunc != 102) - { - cql_transform_set_error( - ct, YAZ_BIB1_UNSUPP_TRUNCATION_ATTRIBUTE, 0); - ret = -1; - } for (i = 0 ; i < lterm; i++) if (strchr(" ()=>u.complex->roperator; + Z_ProximityOperator *prox; int r; if (nested) @@ -301,7 +307,7 @@ static int rpn2cql_structure(cql_transform_t ct, break; case Z_Operator_prox: { pr(" prox", client_data); - Z_ProximityOperator *prox = op->u.prox; + prox = op->u.prox; /* No way to express Odr_bool *exclusion -- ignore it */ if (prox->distance) { char buf[21]; /* Enough for any 64-bit int */ diff --git a/test/test_ccl.c b/test/test_ccl.c index 76d76ba..bb54243 100644 --- a/test/test_ccl.c +++ b/test/test_ccl.c @@ -79,7 +79,9 @@ void tst1(int pass) case 0: ccl_qual_fitem(bibset, "u=4 s=pw t=l,r", "ti"); ccl_qual_fitem(bibset, "1=1016 s=al,pw t=r", "term"); - ccl_qual_fitem(bibset, "1=/my/title t=x", "dc.title"); + ccl_qual_fitem(bibset, "t=x", "reg"); + ccl_qual_fitem(bibset, "t=z", "z"); + ccl_qual_fitem(bibset, "1=/my/title", "dc.title"); ccl_qual_fitem(bibset, "r=r", "date"); ccl_qual_fitem(bibset, "r=o", "x"); ccl_qual_fitem(bibset, "dc.title", "title"); @@ -92,7 +94,13 @@ void tst1(int pass) strcpy(tstline, "term 1=1016 s=al,pw t=r # default term"); ccl_qual_line(bibset, tstline); - strcpy(tstline, "dc.title 1=/my/title t=x"); + strcpy(tstline, "reg t=x"); + ccl_qual_line(bibset, tstline); + + strcpy(tstline, "z t=z"); + ccl_qual_line(bibset, tstline); + + strcpy(tstline, "dc.title 1=/my/title"); ccl_qual_line(bibset, tstline); 
strcpy(tstline, "date r=r # ordered relation"); @@ -111,7 +119,9 @@ void tst1(int pass) ccl_qual_buf(bibset, "ti u=4 s=pw t=l,r\n" "term 1=1016 s=al,pw t=r\r\n" "\n" - "dc.title 1=/my/title t=x\n" + "reg t=x\r\n" + "z t=z\r\n" + "dc.title 1=/my/title\n" "date r=r\n" "x r=o\n" "title dc.title\n" @@ -137,9 +147,14 @@ void tst1(int pass) " \n" " \n" " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" " \n" " \n" - " \n" " \n" " \n" " \n" @@ -222,24 +237,41 @@ void tst1(int pass) "@attr 4=2 @attr 1=1016 a " "@attr 4=2 @attr 1=1016 b ")); - YAZ_CHECK(tst_ccl_query(bibset, "date=1980", "@attr 2=3 1980 ")); - YAZ_CHECK(tst_ccl_query(bibset, "date=234-1990", "@and @attr 2=4 234 @attr 2=2 1990 ")); - YAZ_CHECK(tst_ccl_query(bibset, "date=234- 1990", "@and @attr 2=4 234 @attr 2=2 1990 ")); - YAZ_CHECK(tst_ccl_query(bibset, "date=234 -1990", "@and @attr 2=4 234 @attr 2=2 1990 ")); - YAZ_CHECK(tst_ccl_query(bibset, "date=234 - 1990", "@and @attr 2=4 234 @attr 2=2 1990 ")); - YAZ_CHECK(tst_ccl_query(bibset, "date=-1980", "@attr 2=2 1980 ")); - YAZ_CHECK(tst_ccl_query(bibset, "date=- 1980", "@attr 2=2 1980 ")); - YAZ_CHECK(tst_ccl_query(bibset, "x=-1980", "@attr 2=3 -1980 ")); - YAZ_CHECK(tst_ccl_query(bibset, "x=- 1980", "@attr 2=2 1980 ")); - YAZ_CHECK(tst_ccl_query(bibset, "x= -1980", "@attr 2=3 -1980 ")); - YAZ_CHECK(tst_ccl_query(bibset, "x=234-1990", "@attr 2=3 234-1990 ")); - YAZ_CHECK(tst_ccl_query(bibset, "x=234 - 1990", "@and @attr 2=4 234 @attr 2=2 1990 ")); - YAZ_CHECK(tst_ccl_query(bibset, "ti=a,b", "@attr 4=1 @attr 1=4 a,b ")); - YAZ_CHECK(tst_ccl_query(bibset, "ti=a, b", "@attr 4=1 @attr 1=4 \"a, b\" ")); - YAZ_CHECK(tst_ccl_query(bibset, "ti=a-b", "@attr 4=2 @attr 1=4 a-b ")); - YAZ_CHECK(tst_ccl_query(bibset, "ti=a - b", "@attr 4=1 @attr 1=4 \"a - b\" ")); - - YAZ_CHECK(tst_ccl_query(bibset, "a?", "@attr 5=1 @attr 4=2 @attr 1=1016 a ")); + YAZ_CHECK(tst_ccl_query(bibset, "date=1980", + "@attr 2=3 1980 ")); + YAZ_CHECK(tst_ccl_query(bibset, "date=234-1990", + 
"@and @attr 2=4 234 @attr 2=2 1990 ")); + YAZ_CHECK(tst_ccl_query(bibset, "date=234- 1990", + "@and @attr 2=4 234 @attr 2=2 1990 ")); + YAZ_CHECK(tst_ccl_query(bibset, "date=234 -1990", + "@and @attr 2=4 234 @attr 2=2 1990 ")); + YAZ_CHECK(tst_ccl_query(bibset, "date=234 - 1990", + "@and @attr 2=4 234 @attr 2=2 1990 ")); + YAZ_CHECK(tst_ccl_query(bibset, "date=-1980", + "@attr 2=2 1980 ")); + YAZ_CHECK(tst_ccl_query(bibset, "date=- 1980", + "@attr 2=2 1980 ")); + YAZ_CHECK(tst_ccl_query(bibset, "x=-1980", + "@attr 2=3 -1980 ")); + YAZ_CHECK(tst_ccl_query(bibset, "x=- 1980", + "@attr 2=2 1980 ")); + YAZ_CHECK(tst_ccl_query(bibset, "x= -1980", + "@attr 2=3 -1980 ")); + YAZ_CHECK(tst_ccl_query(bibset, "x=234-1990", + "@attr 2=3 234-1990 ")); + YAZ_CHECK(tst_ccl_query(bibset, "x=234 - 1990", + "@and @attr 2=4 234 @attr 2=2 1990 ")); + YAZ_CHECK(tst_ccl_query(bibset, "ti=a,b", + "@attr 4=1 @attr 1=4 a,b ")); + YAZ_CHECK(tst_ccl_query(bibset, "ti=a, b", + "@attr 4=1 @attr 1=4 \"a, b\" ")); + YAZ_CHECK(tst_ccl_query(bibset, "ti=a-b", + "@attr 4=2 @attr 1=4 a-b ")); + YAZ_CHECK(tst_ccl_query(bibset, "ti=a - b", + "@attr 4=1 @attr 1=4 \"a - b\" ")); + + YAZ_CHECK(tst_ccl_query(bibset, "a?", + "@attr 5=1 @attr 4=2 @attr 1=1016 a ")); YAZ_CHECK(tst_ccl_query(bibset, "a b", "@and @attr 4=2 @attr 1=1016 a " "@attr 4=2 @attr 1=1016 b ")); @@ -251,29 +283,45 @@ void tst1(int pass) YAZ_CHECK(tst_ccl_query(bibset, "title=a", "@attr 1=/my/title a ")); - YAZ_CHECK(tst_ccl_query(bibset, "title=a?b#\"c?\"", - "@attr 5=102 @attr 1=/my/title a.*b.c\\\\? ")); - - YAZ_CHECK(tst_ccl_query(bibset, "title=\\(", - "@attr 5=102 @attr 1=/my/title \\\\( ")); - - YAZ_CHECK(tst_ccl_query(bibset, "title=.", - "@attr 5=102 @attr 1=/my/title \\\\. ")); - - YAZ_CHECK(tst_ccl_query(bibset, "title=.", - "@attr 5=102 @attr 1=/my/title \\\\. ")); - - YAZ_CHECK(tst_ccl_query(bibset, "title=\".\"", - "@attr 5=102 @attr 1=/my/title \\\\. 
")); - - YAZ_CHECK(tst_ccl_query(bibset, "title=?\\?", - "@attr 5=102 @attr 1=/my/title .*\\\\? ")); - - YAZ_CHECK(tst_ccl_query(bibset, "title=\"?\\?\"", - "@attr 5=102 @attr 1=/my/title \\\\?\\\\? ")); - - YAZ_CHECK(tst_ccl_query(bibset, "title=\\\\", - "@attr 5=102 @attr 1=/my/title \\\\\\\\ ")); + YAZ_CHECK(tst_ccl_query(bibset, "reg=a?b#\"c?\"", + "@attr 5=102 a.*b.c\\\\? ")); + YAZ_CHECK(tst_ccl_query(bibset, "z=a?b#\"c?\"", + "@attr 5=104 a?b#c\\\\? ")); + + YAZ_CHECK(tst_ccl_query(bibset, "reg=\\(", + "@attr 5=102 \\\\( ")); + YAZ_CHECK(tst_ccl_query(bibset, "z=\\(", + "( ")); + + YAZ_CHECK(tst_ccl_query(bibset, "reg=\\\"", + "\"\\\"\" ")); + YAZ_CHECK(tst_ccl_query(bibset, "z=\\\"", + "\"\\\"\" ")); + + YAZ_CHECK(tst_ccl_query(bibset, "reg=.", + "@attr 5=102 \\\\. ")); + YAZ_CHECK(tst_ccl_query(bibset, "z=.", + ". ")); + + YAZ_CHECK(tst_ccl_query(bibset, "reg=\".\"", + "@attr 5=102 \\\\. ")); + YAZ_CHECK(tst_ccl_query(bibset, "z=\".\"", + ". ")); + + YAZ_CHECK(tst_ccl_query(bibset, "reg=?\\?", + "@attr 5=102 .*\\\\? ")); + YAZ_CHECK(tst_ccl_query(bibset, "z=?\\?", + "@attr 5=104 ?\\\\? ")); + + YAZ_CHECK(tst_ccl_query(bibset, "reg=\"?\\?\"", + "@attr 5=102 \\\\?\\\\? ")); + YAZ_CHECK(tst_ccl_query(bibset, "z=\"?\\?\"", + "@attr 5=104 \\\\?\\\\? ")); + + YAZ_CHECK(tst_ccl_query(bibset, "reg=\\\\", + "@attr 5=102 \\\\\\\\ ")); + YAZ_CHECK(tst_ccl_query(bibset, "z=\\\\", + "@attr 5=104 \\\\\\\\ ")); YAZ_CHECK(tst_ccl_query(bibset, "\\\\", "@attr 4=2 @attr 1=1016 \\\\ ")); diff --git a/test/test_rpn2cql.c b/test/test_rpn2cql.c index bdb4972..fd0e5a6 100644 --- a/test/test_rpn2cql.c +++ b/test/test_rpn2cql.c @@ -40,6 +40,11 @@ static int compare(cql_transform_t ct, const char *pqf, const char *cql) { ret = 1; } + else + { + yaz_log(YLOG_WARN, " expected: %s", cql ? 
cql : "null"); + yaz_log(YLOG_WARN, " got: %s", wrbuf_cstr(w)); + } } } wrbuf_destroy(w); @@ -90,10 +95,54 @@ static void tst2(void) YAZ_CHECK(compare(ct, "@attr 1=30 @attr 2=5 1980", "dc.date>1980")); YAZ_CHECK(compare(ct, "@attr 1=30 @attr 2=2 1980", "dc.date<=1980")); YAZ_CHECK(compare(ct, "@attr 1=30 @attr 2=4 1980", "dc.date>=1980")); + /* Truncation */ YAZ_CHECK(compare(ct, "@attr 5=1 water", "water*")); YAZ_CHECK(compare(ct, "@attr 5=2 water", "*water")); YAZ_CHECK(compare(ct, "@attr 5=3 water", "*water*")); + YAZ_CHECK(compare(ct, "@attr 5=100 water", "water")); + YAZ_CHECK(compare(ct, "@attr 5=102 water", "water")); + YAZ_CHECK(compare(ct, "@attr 5=104 water", "water")); + + YAZ_CHECK(compare(ct, "@attr 5=102 wat.*er", "wat*er")); + YAZ_CHECK(compare(ct, "@attr 5=104 wat?er", "wat*er")); + + YAZ_CHECK(compare(ct, "@attr 5=102 wat.er", "wat?er")); + YAZ_CHECK(compare(ct, "@attr 5=104 wat#er", "wat?er")); + YAZ_CHECK(compare(ct, "@attr 5=102 wat?er", "wat\\?er")); + YAZ_CHECK(compare(ct, "@attr 5=104 wat*er", "wat\\*er")); + YAZ_CHECK(compare(ct, "@attr 5=102 wat#er", "wat#er")); + + /* \. 
is 'eaten' by PQF parser */ + YAZ_CHECK(compare(ct, "@attr 5=102 wat\\.er", "wat?er")); + + /* Escape sequences */ + /* note: escape sequences that survive after PQF parse below */ + YAZ_CHECK(compare(ct, "@attr 5=102 wat\\\\?er", "wat\\?er")); + YAZ_CHECK(compare(ct, "@attr 5=104 wat\\\\?er", "wat\\?er")); + + YAZ_CHECK(compare(ct, "@attr 5=102 wat\\\\*er", "wat\\*er")); + YAZ_CHECK(compare(ct, "@attr 5=104 wat\\\\*er", "wat\\*er")); + + YAZ_CHECK(compare(ct, "wat\\\\#er", "wat#er")); + YAZ_CHECK(compare(ct, "@attr 5=100 wat\\\\#er", "wat#er")); + YAZ_CHECK(compare(ct, "@attr 5=102 wat\\\\#er", "wat#er")); + YAZ_CHECK(compare(ct, "@attr 5=104 wat\\\\#er", "wat#er")); + YAZ_CHECK(compare(ct, "@attr 5=102 wat\\\\.er", "wat.er")); + YAZ_CHECK(compare(ct, "@attr 5=104 wat\\\\.er", "wat.er")); + + /* Quoting */ + YAZ_CHECK(compare(ct, "@attr 5=100 \"\"", "\"\"")); + YAZ_CHECK(compare(ct, "@attr 5=1 \"\"", "\"*\"")); + YAZ_CHECK(compare(ct, "@attr 5=2 \"\"", "\"*\"")); + YAZ_CHECK(compare(ct, "@attr 5=3 \"\"", "\"**\"")); + YAZ_CHECK(compare(ct, "@attr 5=102 \"\"", "\"\"")); + YAZ_CHECK(compare(ct, "@attr 5=104 \"\"", "\"\"")); + + YAZ_CHECK(compare(ct, "@attr 5=1 \"water basket\"", "\"water basket*\"")); + YAZ_CHECK(compare(ct, "@attr 5=2 \"water basket\"", "\"*water basket\"")); + YAZ_CHECK(compare(ct, "@attr 5=3 \"water basket\"", "\"*water basket*\"")); + /* Other */ YAZ_CHECK(compare(ct, "@attr 2=103 @attr 1=_ALLRECORDS 1", "cql.allRecords=1")); YAZ_CHECK(compare(ct, "@attr 1=500 abc", 0)); -- 1.7.10.4