encapsulated the private info in the opaque pointer type ccl_stop_words_t.
/*
* CCL - header file
*
- * $Id: ccl.h,v 1.28 2007-04-30 19:55:39 adam Exp $
+ * $Id: ccl.h,v 1.29 2007-05-01 12:22:10 adam Exp $
*
* Old Europagate Log:
*
/** \brief Attributes + Term */
struct {
char *term;
+ char *qual;
struct ccl_rpn_attr *attr_list;
} t;
/** Result set */
void ccl_add_attr_string(struct ccl_rpn_node *p, const char *set,
int type, char *value);
+/** \brief checks whether a word is listed as a stop word
+ *
+ * If qname is non-NULL the special qualifier "stop.<qname>" is looked up
+ * in bibset; when that yields no list, "stop.*" is used instead.
+ * The word src_str (src_len bytes, need not be NUL-terminated) matches a
+ * list entry when the lengths are equal and ccl_memicmp reports equality.
+ * Returns 1 on a match and 0 otherwise.
+ */
+YAZ_EXPORT
+int ccl_search_stop(CCL_bibset bibset, const char *qname,
+ const char *src_str, size_t src_len);
+
+
+/** \brief stop words handle (pimpl) */
+typedef struct ccl_stop_words *ccl_stop_words_t;
+
+/** \brief creates stop words handle */
+YAZ_EXPORT
+ccl_stop_words_t ccl_stop_words_create(void);
+
+/** \brief destroys stop words handle */
+YAZ_EXPORT
+void ccl_stop_words_destroy(ccl_stop_words_t csw);
+
+/** \brief removes stop words from RPN tree */
+YAZ_EXPORT
+int ccl_stop_words_tree(ccl_stop_words_t csw,
+ CCL_bibset bibset, struct ccl_rpn_node **t);
+
+/** \brief returns information about removed "stop" words */
+YAZ_EXPORT
+int ccl_stop_words_info(ccl_stop_words_t csw, int idx,
+ const char **qualname, const char **term);
#ifndef ccl_assert
#define ccl_assert(x) ;
## This file is part of the YAZ toolkit.
## Copyright (C) 1995-2007, Index Data, All rights reserved.
-## $Id: Makefile.am,v 1.66 2007-04-30 08:29:07 adam Exp $
+## $Id: Makefile.am,v 1.67 2007-05-01 12:22:11 adam Exp $
YAZ_VERSION_INFO=3:0:0
zoom-c.c zoom-socket.c zoom-opt.c zoom-p.h \
grs1disp.c zgdu.c soap.c srw.c srwutil.c \
opacdisp.c cclfind.c ccltoken.c cclerrms.c cclqual.c cclptree.c cclp.h \
- cclqfile.c cclstr.c cclxmlconfig.c \
+ cclqfile.c cclstr.c cclxmlconfig.c ccl_stop_words.c \
cql.y cqlstdio.c cqltransform.c cqlutil.c xcqlutil.c cqlstring.c \
cqlstrer.c querytowrbuf.c \
tcpdchk.c \
--- /dev/null
+/*
+ * Copyright (c) 1995, the EUROPAGATE consortium (see below).
+ *
+ * The EUROPAGATE consortium members are:
+ *
+ * University College Dublin
+ * Danmarks Teknologiske Videnscenter
+ * An Chomhairle Leabharlanna
+ * Consejo Superior de Investigaciones Cientificas
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and
+ * its documentation, in whole or in part, for any purpose, is hereby granted,
+ * provided that:
+ *
+ * 1. This copyright and permission notice appear in all copies of the
+ * software and its documentation. Notices of copyright or attribution
+ * which appear at the beginning of any file must remain unchanged.
+ *
+ * 2. The names of EUROPAGATE or the project partners may not be used to
+ * endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * 3. Users of this software (implementors and gateway operators) agree to
+ * inform the EUROPAGATE consortium of their use of the software. This
+ * information will be used to evaluate the EUROPAGATE project and the
+ * software, and to plan further developments. The consortium may use
+ * the information in later publications.
+ *
+ * 4. Users of this software agree to make their best efforts, when
+ * documenting their use of the software, to acknowledge the EUROPAGATE
+ * consortium, and the role played by the software in their work.
+ *
+ * THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED, OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL THE EUROPAGATE CONSORTIUM OR ITS MEMBERS BE LIABLE
+ * FOR ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF
+ * ANY KIND, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
+ * OR PROFITS, WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND
+ * ON ANY THEORY OF LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE
+ * USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+/**
+ * \file ccl_stop_words.c
+ * \brief Removes stop words from terms in RPN tree
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <yaz/ccl.h>
+#include <yaz/nmem.h>
+
+struct ccl_stop_info { /* one removed stop word; node of a singly linked list */
+ char *qualname; /* qualifier name the word was found under, or 0 if none */
+ char *term; /* NUL-terminated copy of the removed word */
+ struct ccl_stop_info *next; /* next removed item; 0 at the end of the list */
+};
+
+struct ccl_stop_words { /* state behind the opaque ccl_stop_words_t handle */
+ char *blank_chars; /* characters that separate words inside a term */
+ NMEM nmem; /* memory for removed items */
+ struct ccl_stop_info *removed_items; /* removed stop words, in order of removal */
+};
+
+/** \brief appends one removed stop word to the handle's list
+ *
+ * The qualifier name (if any) and the len bytes at t are copied into
+ * memory taken from csw->nmem; the new item goes last in the list.
+ */
+static void append_removed_item(ccl_stop_words_t csw,
+                                const char *qname,
+                                const char *t, size_t len)
+{
+    struct ccl_stop_info *item = nmem_malloc(csw->nmem, sizeof(*item));
+    struct ccl_stop_info **tail;
+
+    item->qualname = qname ? nmem_strdup(csw->nmem, qname) : 0;
+
+    /* t is not necessarily NUL-terminated: copy len bytes and terminate */
+    item->term = nmem_malloc(csw->nmem, len + 1);
+    memcpy(item->term, t, len);
+    item->term[len] = '\0';
+    item->next = 0;
+
+    /* walk to the end of the list so that removal order is preserved */
+    for (tail = &csw->removed_items; *tail; tail = &(*tail)->next)
+        ;
+    *tail = item;
+}
+
+/** \brief creates a stop words handle with an empty list of removed items
+ *
+ * Blank, CR, LF and TAB are installed as the word separators.
+ * Release the handle with ccl_stop_words_destroy.
+ */
+ccl_stop_words_t ccl_stop_words_create(void)
+{
+    ccl_stop_words_t handle;
+    NMEM pool = nmem_create();
+
+    handle = xmalloc(sizeof(*handle));
+    handle->blank_chars = xstrdup(" \r\n\t");
+    handle->removed_items = 0;
+    handle->nmem = pool;
+    return handle;
+}
+
+/** \brief destroys a stop words handle; a NULL handle is ignored
+ *
+ * Releases the item memory (csw->nmem), the separator string and the
+ * handle itself.
+ */
+void ccl_stop_words_destroy(ccl_stop_words_t csw)
+{
+    if (!csw)
+        return;
+
+    nmem_destroy(csw->nmem);
+    xfree(csw->blank_chars);
+    xfree(csw);
+}
+
+/**
+ * \brief recursively removes stop words from the terms of an RPN (sub)tree
+ * \param csw stop words handle; every removed word is appended to its list
+ * \param bibset bibset handed to ccl_search_stop for the stop word lookup
+ * \param p root of the (sub)tree to process; must not be NULL
+ * \returns root of the processed (sub)tree, or 0 if nothing is left of it
+ *
+ * An operator node (AND/OR/NOT/PROX) that loses at least one child is
+ * deleted and replaced by its surviving child, if any. A term node is
+ * deleted when its term ends up empty. Result set nodes are left alone.
+ */
+struct ccl_rpn_node *ccl_remove_stop_r(ccl_stop_words_t csw,
+                                       CCL_bibset bibset,
+                                       struct ccl_rpn_node *p)
+{
+    struct ccl_rpn_node *left, *right;
+    switch (p->kind)
+    {
+    case CCL_RPN_AND:
+    case CCL_RPN_OR:
+    case CCL_RPN_NOT:
+    case CCL_RPN_PROX:
+        left = ccl_remove_stop_r(csw, bibset, p->u.p[0]);
+        right = ccl_remove_stop_r(csw, bibset, p->u.p[1]);
+        if (!left || !right)
+        {
+            /* we must delete our binary node and return child (if any);
+               the child links are cleared before the node is deleted */
+            p->u.p[0] = 0;
+            p->u.p[1] = 0;
+            ccl_rpn_delete(p);
+            return left ? left : right;
+        }
+        /* a child may have been deleted and replaced by one of its own
+           children: re-link so we never keep a pointer to a deleted node */
+        p->u.p[0] = left;
+        p->u.p[1] = right;
+        break;
+    case CCL_RPN_SET:
+        break;
+    case CCL_RPN_TERM:
+        if (p->u.t.term)
+        {
+            int found = 1;
+            /* restart the scan from the beginning after each removal */
+            while (found)
+            {
+                char *cp = p->u.t.term;
+                found = 0;
+                while (1)
+                {
+                    char *cp0;
+                    size_t len;
+
+                    /* skip blanks in front of the next word */
+                    while (*cp && strchr(csw->blank_chars, *cp))
+                        cp++;
+                    if (!*cp)
+                        break;
+                    /* the next word is [cp0, cp) and is never empty here */
+                    cp0 = cp;
+                    while (*cp && !strchr(csw->blank_chars, *cp))
+                        cp++;
+                    len = cp - cp0;
+                    if (ccl_search_stop(bibset, p->u.t.qual, cp0, len))
+                    {
+                        append_removed_item(csw, p->u.t.qual, cp0, len);
+                        /* drop the word together with the blanks after it */
+                        while (*cp && strchr(csw->blank_chars, *cp))
+                            cp++;
+                        memmove(cp0, cp, strlen(cp)+1);
+                        found = 1;
+                        break;
+                    }
+                }
+            }
+        }
+        /* chop right blanks .. and see if the term gets empty */
+        if (p->u.t.term && csw->removed_items)
+        {
+            char *cp = p->u.t.term + strlen(p->u.t.term);
+            while (1)
+            {
+                if (cp == p->u.t.term)
+                {
+                    /* term is empty / blank */
+                    ccl_rpn_delete(p);
+                    return 0;
+                }
+                if (!strchr(csw->blank_chars, cp[-1]))
+                    break;
+                /* chop right */
+                cp[-1] = 0;
+                --cp;
+            }
+        }
+        break;
+    }
+    return p;
+}
+
+/**
+ * \brief removes stop words from an RPN tree
+ * \param csw stop words handle; its list of removed items is rebuilt
+ * \param bibset bibset holding the stop word definitions
+ * \param t address of the tree root; updated on return and set to 0
+ *          when the whole tree was removed
+ * \retval 1 at least one stop word was removed
+ * \retval 0 nothing was removed (also when the tree is empty)
+ *
+ * Items recorded by a previous call on the same handle are discarded,
+ * so strings obtained earlier from ccl_stop_words_info become invalid.
+ */
+int ccl_stop_words_tree(ccl_stop_words_t csw,
+                        CCL_bibset bibset, struct ccl_rpn_node **t)
+{
+    /* remove list items */
+    nmem_reset(csw->nmem);
+    csw->removed_items = 0;
+
+    /* an empty tree (for instance one emptied by an earlier call) has
+       no node to inspect; the recursion requires a non-NULL root */
+    if (t && *t)
+        *t = ccl_remove_stop_r(csw, bibset, *t);
+
+    return csw->removed_items ? 1 : 0;
+}
+
+/**
+ * \brief returns information about a removed stop word
+ * \param csw stop words handle
+ * \param idx zero-based index into the list of removed words
+ * \param qualname receives the qualifier name (0 if the word had none)
+ * \param term receives the removed word
+ * \retval 1 item idx exists; *qualname and *term have been set
+ * \retval 0 idx is negative or past the end; outputs are left untouched
+ *
+ * The returned strings live in memory owned by csw; they stay valid until
+ * the next ccl_stop_words_tree call on csw or until csw is destroyed.
+ */
+int ccl_stop_words_info(ccl_stop_words_t csw, int idx,
+                        const char **qualname, const char **term)
+{
+    struct ccl_stop_info *csi;
+    int i;
+
+    /* a negative index must not silently alias the first item */
+    if (idx < 0)
+        return 0;
+
+    csi = csw->removed_items;
+    for (i = 0; csi && i < idx; i++)
+        csi = csi->next;
+    if (!csi)
+        return 0;
+
+    *qualname = csi->qualname;
+    *term = csi->term;
+    return 1;
+}
+
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
+
/* CCL find (to rpn conversion)
* Europagate, 1995
*
- * $Id: cclfind.c,v 1.13 2007-04-30 19:55:40 adam Exp $
+ * $Id: cclfind.c,v 1.14 2007-05-01 12:22:11 adam Exp $
*
* Old Europagate log:
*
case CCL_RPN_TERM:
p->u.t.attr_list = 0;
p->u.t.term = 0;
+ p->u.t.qual = 0;
break;
default:
break;
break;
case CCL_RPN_TERM:
xfree(rpn->u.t.term);
+ xfree(rpn->u.t.qual);
for (attr = rpn->u.t.attr_list; attr; attr = attr1)
{
attr1 = attr->next;
p = ccl_rpn_node_create(CCL_RPN_TERM);
p->u.t.attr_list = NULL;
p->u.t.term = NULL;
+ if (qa && qa[0])
+ {
+ const char *n = ccl_qual_get_name(qa[0]);
+ if (n)
+ p->u.t.qual = xstrdup(n);
+ }
/* go through all attributes and add them to the attribute list */
for (i=0; qa && qa[i]; i++)
}
if (i == no-1 && right_trunc)
src_len--;
- if (!ccl_qual_match_stop(cclp->bibset, qa, src_str, src_len))
+ if (p->u.t.term[0] && cclp->look_token->ws_prefix_len)
{
-#if 0
- fprintf(stderr, "[%s %.*s]",
- ccl_qual_get_name(qa[0]), src_len, src_str);
-#endif
- if (p->u.t.term[0] && cclp->look_token->ws_prefix_len)
- {
- size_t len = strlen(p->u.t.term);
- memcpy(p->u.t.term + len, cclp->look_token->ws_prefix_buf,
- cclp->look_token->ws_prefix_len);
- p->u.t.term[len + cclp->look_token->ws_prefix_len] = '\0';
- }
- strxcat(p->u.t.term, src_str, src_len);
+ size_t len = strlen(p->u.t.term);
+ memcpy(p->u.t.term + len, cclp->look_token->ws_prefix_buf,
+ cclp->look_token->ws_prefix_len);
+ p->u.t.term[len + cclp->look_token->ws_prefix_len] = '\0';
}
+ strxcat(p->u.t.term, src_str, src_len);
ADVANCE;
}
- if (p->u.t.term[0] == 0)
- {
- ccl_rpn_delete(p);
- continue;
- }
-
/* make the top node point to us.. */
if (p_top)
{
ccl_token_del(list);
return p;
}
+
/*
* Local variables:
* c-basic-offset: 4
* Copyright (C) 1995-2005, Index Data ApS
* See the file LICENSE for details.
*
- * $Id: cclp.h,v 1.4 2007-04-30 19:55:40 adam Exp $
+ * $Id: cclp.h,v 1.5 2007-05-01 12:22:11 adam Exp $
*/
/**
YAZ_EXPORT
const char *ccl_qual_get_name(ccl_qualifier_t q);
-YAZ_EXPORT
-int ccl_qual_match_stop(CCL_bibset bibset, ccl_qualifier_t *qa,
- const char *src_str, size_t src_len);
-
/*
* Local variables:
* c-basic-offset: 4
/* CCL qualifiers
* Europagate, 1995
*
- * $Id: cclqual.c,v 1.9 2007-04-30 19:55:40 adam Exp $
+ * $Id: cclqual.c,v 1.10 2007-05-01 12:22:11 adam Exp $
*
* Old Europagate Log:
*
return 0;
}
-int ccl_qual_match_stop(CCL_bibset bibset, ccl_qualifier_t *qa,
- const char *src_str, size_t src_len)
+int ccl_search_stop(CCL_bibset bibset, const char *qname,
+ const char *src_str, size_t src_len)
{
- if (qa[0])
+ const char **slist = 0;
+ if (qname)
{
- char qname[80];
- const char **slist;
- yaz_snprintf(qname, sizeof(qname)-1, "stop.%s",
- ccl_qual_get_name(qa[0]));
- slist = ccl_qual_search_special(bibset, qname);
- if (!slist)
- slist = ccl_qual_search_special(bibset, "stop.*");
- if (slist)
- {
- int i;
- for (i = 0; slist[i]; i++)
- if (src_len == strlen(slist[i])
- && ccl_memicmp(slist[i], src_str, src_len) == 0)
- return 1;
- }
+ char qname_buf[80];
+ yaz_snprintf(qname_buf, sizeof(qname_buf)-1, "stop.%s",
+ qname);
+ slist = ccl_qual_search_special(bibset, qname_buf);
+ }
+ if (!slist)
+ slist = ccl_qual_search_special(bibset, "stop.*");
+ if (slist)
+ {
+ int i;
+ for (i = 0; slist[i]; i++)
+ if (src_len == strlen(slist[i])
+ && ccl_memicmp(slist[i], src_str, src_len) == 0)
+ return 1;
}
return 0;
}
-
/*
* Local variables:
* c-basic-offset: 4
/* CCL shell.
* Europagate 1995
*
- * $Id: cclsh.c,v 1.7 2007-04-30 19:50:22 adam Exp $
+ * $Id: cclsh.c,v 1.8 2007-05-01 12:22:11 adam Exp $
*
* Old Europagate Log:
*
void usage(const char *prog)
{
- fprintf (stderr, "%s: [-d] [-b configfile] [-x xmlconfig]\n", prog);
- exit (1);
+ fprintf(stderr, "%s: [-d] [-b configfile] [-x xmlconfig]\n", prog);
+ exit(1);
}
-int main (int argc, char **argv)
+int main(int argc, char **argv)
{
CCL_bibset bibset;
FILE *bib_inf;
WRBUF q_wrbuf = 0;
prog = *argv;
- bibset = ccl_qual_mk ();
+ bibset = ccl_qual_mk();
while ((ret = options("db:x:", argv, argc, &arg)) != -2)
{
break;
case 'b':
bib_fname = arg;
- bib_inf = fopen (bib_fname, "r");
+ bib_inf = fopen(bib_fname, "r");
if (!bib_inf)
{
- fprintf (stderr, "%s: cannot open %s\n", prog,
+ fprintf(stderr, "%s: cannot open %s\n", prog,
bib_fname);
- exit (1);
+ exit(1);
}
- ccl_qual_file (bibset, bib_inf);
- fclose (bib_inf);
+ ccl_qual_file(bibset, bib_inf);
+ fclose(bib_inf);
break;
#if YAZ_HAVE_XML2
case 'x':
int error;
struct ccl_rpn_node *rpn;
- rpn = ccl_parser_find_str (cclp, wrbuf_cstr(q_wrbuf));
+ rpn = ccl_parser_find_str(cclp, wrbuf_cstr(q_wrbuf));
error = ccl_parser_get_error(cclp, 0);
if (error)
{
- printf ("%s\n", ccl_err_msg (error));
+ printf("%s\n", ccl_err_msg(error));
}
else
{
if (rpn)
{
- ccl_pr_tree (rpn, stdout);
- printf ("\n");
+ ccl_pr_tree(rpn, stdout);
+ printf("\n");
}
}
- ccl_parser_destroy (cclp);
+ ccl_parser_destroy(cclp);
if (rpn)
ccl_rpn_delete(rpn);
wrbuf_destroy(q_wrbuf);
break;
}
strcpy(buf,line_in);
- free (line_in);
+ free(line_in);
#else
- printf ("CCLSH>"); fflush (stdout);
- if (!fgets (buf, 999, stdin))
+ printf("CCLSH>"); fflush(stdout);
+ if (!fgets(buf, 999, stdin))
break;
#endif
if (error)
{
- printf ("%*s^ - ", 6+pos, " ");
- printf ("%s\n", ccl_err_msg (error));
+ printf("%*s^ - ", 6+pos, " ");
+ printf("%s\n", ccl_err_msg(error));
}
else
{
if (rpn && i == 0)
{
- ccl_pr_tree (rpn, stdout);
- printf ("\n");
+ ccl_stop_words_t csw = ccl_stop_words_create();
+ int idx = 0;
+ printf("First:\n");
+ ccl_pr_tree(rpn, stdout);
+ if (ccl_stop_words_tree(csw, bibset, &rpn))
+ {
+ printf("Second:\n");
+ ccl_pr_tree(rpn, stdout);
+ printf("\n");
+
+ for (idx = 0; ; idx++)
+ {
+ const char *qname;
+ const char *term;
+ if (!ccl_stop_words_info(csw, idx,
+ &qname, &term))
+ break;
+ printf("Removed from %s: %s\n",
+ qname ? qname : "none", term);
+ }
+ }
+ ccl_stop_words_destroy(csw);
}
}
- ccl_parser_destroy (cclp);
+ ccl_parser_destroy(cclp);
if (rpn)
ccl_rpn_delete(rpn);
}
}
- printf ("\n");
+ printf("\n");
ccl_qual_rm(&bibset);
return 0;
}