Moved stop word support code to separate ccl_stop_words.c and
author Adam Dickmeiss <adam@indexdata.dk>
Tue, 1 May 2007 12:22:10 +0000 (12:22 +0000)
committer Adam Dickmeiss <adam@indexdata.dk>
Tue, 1 May 2007 12:22:10 +0000 (12:22 +0000)
encapsulated the private info in opaque ptr ccl_stop_words_t.

include/yaz/ccl.h
src/Makefile.am
src/ccl_stop_words.c [new file with mode: 0644]
src/cclfind.c
src/cclp.h
src/cclqual.c
util/cclsh.c

index 043d09f..e6c5af2 100644 (file)
@@ -49,7 +49,7 @@
 /*
  * CCL - header file
  *
- * $Id: ccl.h,v 1.28 2007-04-30 19:55:39 adam Exp $
+ * $Id: ccl.h,v 1.29 2007-05-01 12:22:10 adam Exp $
  *
  * Old Europagate Log:
  *
@@ -150,6 +150,7 @@ struct ccl_rpn_node {
         /** \brief Attributes + Term */
         struct {
             char *term;
+            char *qual;
             struct ccl_rpn_attr *attr_list;
         } t;
         /** Result set */
@@ -287,6 +288,31 @@ YAZ_EXPORT
 void ccl_add_attr_string(struct ccl_rpn_node *p, const char *set,
                          int type, char *value);
 
+YAZ_EXPORT
+int ccl_search_stop(CCL_bibset bibset, const char *qname,
+                    const char *src_str, size_t src_len);
+
+
+/** \brief stop words handle (pimpl) */
+typedef struct ccl_stop_words *ccl_stop_words_t;
+
+/** \brief creates stop words handle */
+YAZ_EXPORT
+ccl_stop_words_t ccl_stop_words_create(void);
+
+/** \brief destroys stop words handle */
+YAZ_EXPORT
+void ccl_stop_words_destroy(ccl_stop_words_t csw);
+
+/** \brief removes stop words from RPN tree */
+YAZ_EXPORT
+int ccl_stop_words_tree(ccl_stop_words_t csw,
+                        CCL_bibset bibset, struct ccl_rpn_node **t);
+
+/** \brief returns information about removed "stop" words */
+YAZ_EXPORT
+int ccl_stop_words_info(ccl_stop_words_t csw, int idx,
+                        const char **qualname, const char **term);
 
 #ifndef ccl_assert
 #define ccl_assert(x) ;
index 2857942..576559a 100644 (file)
@@ -1,6 +1,6 @@
 ## This file is part of the YAZ toolkit.
 ## Copyright (C) 1995-2007, Index Data, All rights reserved.
-## $Id: Makefile.am,v 1.66 2007-04-30 08:29:07 adam Exp $
+## $Id: Makefile.am,v 1.67 2007-05-01 12:22:11 adam Exp $
 
 YAZ_VERSION_INFO=3:0:0
 
@@ -86,7 +86,7 @@ libyaz_la_SOURCES=version.c options.c log.c \
   zoom-c.c zoom-socket.c zoom-opt.c zoom-p.h \
   grs1disp.c zgdu.c soap.c srw.c srwutil.c \
   opacdisp.c cclfind.c ccltoken.c cclerrms.c cclqual.c cclptree.c cclp.h \
-  cclqfile.c cclstr.c cclxmlconfig.c \
+  cclqfile.c cclstr.c cclxmlconfig.c ccl_stop_words.c \
   cql.y cqlstdio.c cqltransform.c cqlutil.c xcqlutil.c cqlstring.c \
   cqlstrer.c querytowrbuf.c \
   tcpdchk.c \
diff --git a/src/ccl_stop_words.c b/src/ccl_stop_words.c
new file mode 100644 (file)
index 0000000..6a36e92
--- /dev/null
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 1995, the EUROPAGATE consortium (see below).
+ *
+ * The EUROPAGATE consortium members are:
+ *
+ *    University College Dublin
+ *    Danmarks Teknologiske Videnscenter
+ *    An Chomhairle Leabharlanna
+ *    Consejo Superior de Investigaciones Cientificas
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and
+ * its documentation, in whole or in part, for any purpose, is hereby granted,
+ * provided that:
+ *
+ * 1. This copyright and permission notice appear in all copies of the
+ * software and its documentation. Notices of copyright or attribution
+ * which appear at the beginning of any file must remain unchanged.
+ *
+ * 2. The names of EUROPAGATE or the project partners may not be used to
+ * endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * 3. Users of this software (implementors and gateway operators) agree to
+ * inform the EUROPAGATE consortium of their use of the software. This
+ * information will be used to evaluate the EUROPAGATE project and the
+ * software, and to plan further developments. The consortium may use
+ * the information in later publications.
+ * 
+ * 4. Users of this software agree to make their best efforts, when
+ * documenting their use of the software, to acknowledge the EUROPAGATE
+ * consortium, and the role played by the software in their work.
+ *
+ * THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED, OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL THE EUROPAGATE CONSORTIUM OR ITS MEMBERS BE LIABLE
+ * FOR ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF
+ * ANY KIND, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
+ * OR PROFITS, WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND
+ * ON ANY THEORY OF LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE
+ * USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+/** 
+ * \file ccl_stop_words.c
+ * \brief Removes stop words from terms in RPN tree
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <yaz/ccl.h>
+#include <yaz/nmem.h>
+
+struct ccl_stop_info {
+    char *qualname;
+    char *term;
+    struct ccl_stop_info *next;
+};
+
+struct ccl_stop_words {
+    char *blank_chars;
+    NMEM nmem; /* memory for removed items */
+    struct ccl_stop_info *removed_items;
+};
+    
+static void append_removed_item(ccl_stop_words_t csw,
+                                const char *qname,
+                                const char *t, size_t len)
+{
+    struct ccl_stop_info *csi = nmem_malloc(csw->nmem, sizeof(*csi));
+    struct ccl_stop_info **csip = &csw->removed_items;
+    if (qname)
+        csi->qualname = nmem_strdup(csw->nmem, qname);
+    else
+        csi->qualname = 0;
+
+    csi->term = nmem_malloc(csw->nmem, len+1);
+    memcpy(csi->term, t, len);
+    csi->term[len] = '\0';
+    csi->next = 0;
+
+    while (*csip)
+        csip = &(*csip)->next;
+    
+    *csip = csi;
+}
+
+ccl_stop_words_t ccl_stop_words_create(void)
+{
+    NMEM nmem = nmem_create();
+    ccl_stop_words_t csw = xmalloc(sizeof(*csw));
+    csw->nmem = nmem;
+    csw->removed_items = 0;
+    csw->blank_chars = xstrdup(" \r\n\t");
+    return csw;
+}
+
+void ccl_stop_words_destroy(ccl_stop_words_t csw)
+{
+    if (csw)
+    {
+        nmem_destroy(csw->nmem);
+        xfree(csw->blank_chars);
+        xfree(csw);
+    }
+}
+
+struct ccl_rpn_node *ccl_remove_stop_r(ccl_stop_words_t csw,
+                                       CCL_bibset bibset,
+                                       struct ccl_rpn_node *p)
+{
+    struct ccl_rpn_node *left, *right;
+    switch (p->kind)
+    {
+    case CCL_RPN_AND:
+    case CCL_RPN_OR:
+    case CCL_RPN_NOT:
+    case CCL_RPN_PROX:
+        left = ccl_remove_stop_r(csw, bibset, p->u.p[0]);
+        right = ccl_remove_stop_r(csw, bibset, p->u.p[1]);
+        if (!left || !right)
+        {
+            /* we must delete our binary node and return child (if any) */
+            p->u.p[0] = 0;
+            p->u.p[1] = 0;
+            ccl_rpn_delete(p);
+            if (left)
+                return left;
+            else
+                return right;
+        }
+        break;
+    case CCL_RPN_SET:
+        break;
+    case CCL_RPN_TERM:
+        if (p->u.t.term)
+        {
+            int found = 1;
+            while (found)
+            {
+                char *cp = p->u.t.term;
+                found = 0;
+                while (1)
+                {
+                    while (*cp && strchr(csw->blank_chars, *cp))
+                        cp++;
+                    if (!*cp)
+                        break;
+                    else
+                    {
+                        char *cp0 = cp;
+                        while (*cp && !strchr(csw->blank_chars, *cp))
+                            cp++;
+                        if (cp != cp0)
+                        {
+                            size_t len = cp - cp0;
+                            if (ccl_search_stop(bibset, p->u.t.qual,
+                                                cp0, len))
+                            {
+                                append_removed_item(csw, p->u.t.qual,
+                                                    cp0, len);
+                                while (*cp && strchr(csw->blank_chars, *cp))
+                                    cp++;
+                                memmove(cp0, cp, strlen(cp)+1);
+                                found = 1;
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        /* chop trailing blanks .. and see if the term becomes empty */
+        if (p->u.t.term && csw->removed_items)
+        {
+            char *cp = p->u.t.term + strlen(p->u.t.term);
+            while (1)
+            {
+                if (cp == p->u.t.term)
+                {
+                    /* term is empty / blank */
+                    ccl_rpn_delete(p);
+                    return 0;
+                }
+                if (!strchr(csw->blank_chars, cp[-1]))
+                    break;
+                /* chop right */
+                cp[-1] = 0;
+                --cp;
+            }
+        }
+        break;
+    }
+    return p;
+}
+
+int ccl_stop_words_tree(ccl_stop_words_t csw,
+                        CCL_bibset bibset, struct ccl_rpn_node **t)
+{
+    struct ccl_rpn_node *r;
+    
+    /* discard removed items recorded by a previous call */
+    nmem_reset(csw->nmem);
+    csw->removed_items = 0;
+    
+    r = ccl_remove_stop_r(csw, bibset, *t);
+    *t = r;
+    if (csw->removed_items)
+        return 1;
+    return 0;
+}
+
+int ccl_stop_words_info(ccl_stop_words_t csw, int idx,
+                        const char **qualname, const char **term)
+{
+    struct ccl_stop_info *csi = csw->removed_items;
+    int i = 0;
+    while (csi && i < idx)
+    {
+        csi = csi->next;
+        i++;
+    }
+    if (csi)
+    {
+        *qualname = csi->qualname;
+        *term = csi->term;
+        return 1;
+    }
+    return 0;
+}
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
+
index 69f059c..17d0470 100644 (file)
@@ -56,7 +56,7 @@
 /* CCL find (to rpn conversion)
  * Europagate, 1995
  *
- * $Id: cclfind.c,v 1.13 2007-04-30 19:55:40 adam Exp $
+ * $Id: cclfind.c,v 1.14 2007-05-01 12:22:11 adam Exp $
  *
  * Old Europagate log:
  *
@@ -199,6 +199,7 @@ struct ccl_rpn_node *ccl_rpn_node_create(enum ccl_rpn_kind kind)
     case CCL_RPN_TERM:
         p->u.t.attr_list = 0;
         p->u.t.term = 0;
+        p->u.t.qual = 0;
         break;
     default:
         break;
@@ -225,6 +226,7 @@ void ccl_rpn_delete(struct ccl_rpn_node *rpn)
         break;
     case CCL_RPN_TERM:
         xfree(rpn->u.t.term);
+        xfree(rpn->u.t.qual);
         for (attr = rpn->u.t.attr_list; attr; attr = attr1)
         {
             attr1 = attr->next;
@@ -392,6 +394,12 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp,
         p = ccl_rpn_node_create(CCL_RPN_TERM);
         p->u.t.attr_list = NULL;
         p->u.t.term = NULL;
+        if (qa && qa[0])
+        {
+            const char *n = ccl_qual_get_name(qa[0]);
+            if (n)
+                p->u.t.qual = xstrdup(n);
+        }
 
         /* go through all attributes and add them to the attribute list */
         for (i=0; qa && qa[i]; i++)
@@ -471,30 +479,17 @@ static struct ccl_rpn_node *search_term_x(CCL_parser cclp,
             }
             if (i == no-1 && right_trunc)
                 src_len--;
-            if (!ccl_qual_match_stop(cclp->bibset, qa, src_str, src_len))
+            if (p->u.t.term[0] && cclp->look_token->ws_prefix_len)
             {
-#if 0
-                fprintf(stderr, "[%s %.*s]",
-                        ccl_qual_get_name(qa[0]), src_len, src_str);
-#endif
-                if (p->u.t.term[0] && cclp->look_token->ws_prefix_len)
-                {
-                    size_t len = strlen(p->u.t.term);
-                    memcpy(p->u.t.term + len, cclp->look_token->ws_prefix_buf,
-                           cclp->look_token->ws_prefix_len);
-                    p->u.t.term[len + cclp->look_token->ws_prefix_len] = '\0';
-                }
-                strxcat(p->u.t.term, src_str, src_len);
+                size_t len = strlen(p->u.t.term);
+                memcpy(p->u.t.term + len, cclp->look_token->ws_prefix_buf,
+                       cclp->look_token->ws_prefix_len);
+                p->u.t.term[len + cclp->look_token->ws_prefix_len] = '\0';
             }
+            strxcat(p->u.t.term, src_str, src_len);
             ADVANCE;
         }
 
-        if (p->u.t.term[0] == 0)
-        {
-            ccl_rpn_delete(p);
-            continue;
-        }
-
         /* make the top node point to us.. */
         if (p_top)
         {
@@ -1204,6 +1199,7 @@ struct ccl_rpn_node *ccl_find_str(CCL_bibset bibset, const char *str,
     ccl_token_del(list);
     return p;
 }
+
 /*
  * Local variables:
  * c-basic-offset: 4
index a3b1e90..24930f3 100644 (file)
@@ -2,7 +2,7 @@
  * Copyright (C) 1995-2005, Index Data ApS
  * See the file LICENSE for details.
  *
- * $Id: cclp.h,v 1.4 2007-04-30 19:55:40 adam Exp $
+ * $Id: cclp.h,v 1.5 2007-05-01 12:22:11 adam Exp $
  */
 
 /** 
@@ -99,10 +99,6 @@ struct ccl_rpn_attr *ccl_qual_get_attr(ccl_qualifier_t q);
 YAZ_EXPORT
 const char *ccl_qual_get_name(ccl_qualifier_t q);
 
-YAZ_EXPORT
-int ccl_qual_match_stop(CCL_bibset bibset, ccl_qualifier_t *qa, 
-                        const char *src_str,  size_t src_len);
-
 /*
  * Local variables:
  * c-basic-offset: 4
index 7843494..67ef7fa 100644 (file)
@@ -48,7 +48,7 @@
 /* CCL qualifiers
  * Europagate, 1995
  *
- * $Id: cclqual.c,v 1.9 2007-04-30 19:55:40 adam Exp $
+ * $Id: cclqual.c,v 1.10 2007-05-01 12:22:11 adam Exp $
  *
  * Old Europagate Log:
  *
@@ -397,31 +397,30 @@ const char **ccl_qual_search_special(CCL_bibset b, const char *name)
     return 0;
 }
 
-int ccl_qual_match_stop(CCL_bibset bibset, ccl_qualifier_t *qa, 
-                        const char *src_str, size_t src_len)
+int ccl_search_stop(CCL_bibset bibset, const char *qname,
+                    const char *src_str, size_t src_len)
 {
-    if (qa[0])
+    const char **slist = 0;
+    if (qname)
     {
-        char qname[80];
-        const char **slist;
-        yaz_snprintf(qname, sizeof(qname)-1, "stop.%s",
-                     ccl_qual_get_name(qa[0]));
-        slist = ccl_qual_search_special(bibset, qname);
-        if (!slist)
-            slist = ccl_qual_search_special(bibset, "stop.*");
-        if (slist)
-        {
-            int i;
-            for (i = 0; slist[i]; i++)
-                if (src_len == strlen(slist[i]) 
-                    && ccl_memicmp(slist[i], src_str, src_len) == 0)
-                    return 1;
-        }
+        char qname_buf[80];
+        yaz_snprintf(qname_buf, sizeof(qname_buf)-1, "stop.%s",
+                     qname);
+        slist = ccl_qual_search_special(bibset, qname_buf);
+    }
+    if (!slist)
+        slist = ccl_qual_search_special(bibset, "stop.*");
+    if (slist)
+    {
+        int i;
+        for (i = 0; slist[i]; i++)
+            if (src_len == strlen(slist[i]) 
+                && ccl_memicmp(slist[i], src_str, src_len) == 0)
+                return 1;
     }
     return 0;
 }
 
-
 /*
  * Local variables:
  * c-basic-offset: 4
index b1e06e1..b9aafc0 100644 (file)
@@ -44,7 +44,7 @@
 /* CCL shell.
  * Europagate 1995
  *
- * $Id: cclsh.c,v 1.7 2007-04-30 19:50:22 adam Exp $
+ * $Id: cclsh.c,v 1.8 2007-05-01 12:22:11 adam Exp $
  *
  * Old Europagate Log:
  *
@@ -104,11 +104,11 @@ static char *prog;
 
 void usage(const char *prog)
 {
-    fprintf (stderr, "%s: [-d] [-b configfile] [-x xmlconfig]\n", prog);
-    exit (1);
+    fprintf(stderr, "%s: [-d] [-b configfile] [-x xmlconfig]\n", prog);
+    exit(1);
 }
 
-int main (int argc, char **argv)
+int main(int argc, char **argv)
 {
     CCL_bibset bibset;
     FILE *bib_inf;
@@ -122,7 +122,7 @@ int main (int argc, char **argv)
     WRBUF q_wrbuf = 0;
 
     prog = *argv;
-    bibset = ccl_qual_mk ();    
+    bibset = ccl_qual_mk();    
     
     while ((ret = options("db:x:", argv, argc, &arg)) != -2)
     {
@@ -133,15 +133,15 @@ int main (int argc, char **argv)
             break;
         case 'b':
             bib_fname = arg;
-            bib_inf = fopen (bib_fname, "r");
+            bib_inf = fopen(bib_fname, "r");
             if (!bib_inf)
             {
-                fprintf (stderr, "%s: cannot open %s\n", prog,
+                fprintf(stderr, "%s: cannot open %s\n", prog,
                          bib_fname);
-                exit (1);
+                exit(1);
             }
-            ccl_qual_file (bibset, bib_inf);
-            fclose (bib_inf);
+            ccl_qual_file(bibset, bib_inf);
+            fclose(bib_inf);
             break;
 #if YAZ_HAVE_XML2
         case 'x':
@@ -176,23 +176,23 @@ int main (int argc, char **argv)
         int error;
         struct ccl_rpn_node *rpn;
         
-        rpn = ccl_parser_find_str (cclp, wrbuf_cstr(q_wrbuf));
+        rpn = ccl_parser_find_str(cclp, wrbuf_cstr(q_wrbuf));
         
         error = ccl_parser_get_error(cclp, 0);
         
         if (error)
         {
-            printf ("%s\n", ccl_err_msg (error));
+            printf("%s\n", ccl_err_msg(error));
         }
         else
         {
             if (rpn)
             {
-                ccl_pr_tree (rpn, stdout);
-                printf ("\n");
+                ccl_pr_tree(rpn, stdout);
+                printf("\n");
             }
         }
-        ccl_parser_destroy (cclp);
+        ccl_parser_destroy(cclp);
         if (rpn)
             ccl_rpn_delete(rpn);
         wrbuf_destroy(q_wrbuf);
@@ -218,10 +218,10 @@ int main (int argc, char **argv)
                 break;
             }
             strcpy(buf,line_in);
-            free (line_in);
+            free(line_in);
 #else    
-        printf ("CCLSH>"); fflush (stdout);
-        if (!fgets (buf, 999, stdin))
+        printf("CCLSH>"); fflush(stdout);
+        if (!fgets(buf, 999, stdin))
             break;
 #endif 
 
@@ -236,23 +236,43 @@ int main (int argc, char **argv)
 
             if (error)
             {
-                printf ("%*s^ - ", 6+pos, " ");
-                printf ("%s\n", ccl_err_msg (error));
+                printf("%*s^ - ", 6+pos, " ");
+                printf("%s\n", ccl_err_msg(error));
             }
             else
             {
                 if (rpn && i == 0)
                 {
-                    ccl_pr_tree (rpn, stdout);
-                    printf ("\n");
+                    ccl_stop_words_t csw = ccl_stop_words_create();
+                    int idx = 0;
+                    printf("First:\n");
+                    ccl_pr_tree(rpn, stdout);
+                    if (ccl_stop_words_tree(csw, bibset, &rpn))
+                    {
+                        printf("Second:\n");
+                        ccl_pr_tree(rpn, stdout);
+                        printf("\n");
+                        
+                        for (idx = 0; ; idx++)
+                        {
+                            const char *qname;
+                            const char *term;
+                            if (!ccl_stop_words_info(csw, idx,
+                                                     &qname, &term))
+                                break;
+                            printf("Removed from %s: %s\n", 
+                                   qname ? qname : "none", term);
+                        }
+                    }
+                    ccl_stop_words_destroy(csw);
                 }
             }
-            ccl_parser_destroy (cclp);
+            ccl_parser_destroy(cclp);
             if (rpn)
                 ccl_rpn_delete(rpn);
         }
     }
-    printf ("\n");
+    printf("\n");
     ccl_qual_rm(&bibset);
     return 0;
 }