Allow multiple ICU chains for facets
author Adam Dickmeiss <adam@indexdata.dk>
Tue, 13 Sep 2011 13:25:52 +0000 (15:25 +0200)
committer Adam Dickmeiss <adam@indexdata.dk>
Tue, 13 Sep 2011 13:25:52 +0000 (15:25 +0200)
The charsets ICU interface can now hold any number of ICU chains, each
identified by an id. The ids "relevance", "sort", "mergekey", and "facet"
refer to the previously existing chains. The elements in server/service for
defining the relevance, sort, mergekey and facet ICU chains are deprecated
and may be replaced by <icu_chain id="id" >..</icu_chain>.

src/charsets.c
src/charsets.h
src/client.c
src/pazpar2_config.c
src/pazpar2_config.h
src/relevance.c
src/relevance.h
src/session.c
test/test_icu.cfg
test/test_icu_8.res

index 44ef5fc..ba5d426 100644 (file)
@@ -40,12 +40,18 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #include <yaz/icu.h>
 #endif
 
+typedef struct pp2_charset_s *pp2_charset_t;
+static pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node);
+static pp2_charset_t pp2_charset_create(struct icu_chain * icu_chn);
+static pp2_charset_t pp2_charset_create_a_to_z(void);
+static void pp2_charset_destroy(pp2_charset_t pct);
+static pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct);
+
 /* charset handle */
 struct pp2_charset_s {
     const char *(*token_next_handler)(pp2_relevance_token_t prt);
     const char *(*get_sort_handler)(pp2_relevance_token_t prt);
     const char *(*get_display_handler)(pp2_relevance_token_t prt);
-    int ref_count;
 #if YAZ_HAVE_ICU
     struct icu_chain * icu_chn;
     UErrorCode icu_sts;
@@ -75,14 +81,114 @@ struct pp2_relevance_token_s {
 #endif
 };
 
+/* Reference-counted collection of named charset handles */
+struct pp2_charset_fact_s {
+    struct pp2_charset_entry *list; /* head of the singly linked entry list */
+    int ref_count; /* set to 1 on create; content freed when it reaches 0 */
+};
+
+/* One named charset handle held in a pp2_charset_fact_s list */
+struct pp2_charset_entry {
+    struct pp2_charset_entry *next; /* next entry, 0 at end of list */
+    pp2_charset_t pct; /* charset handle; destroyed together with the entry */
+    char *name; /* id used for lookup; an xstrdup'ed copy owned by the entry */
+};
+
+
+static int pp2_charset_fact_add(pp2_charset_fact_t pft,
+                                pp2_charset_t pct, const char *default_id);
+
+/* Creates a charset collection with ref_count 1 and registers the four
+   default entries: "relevance", "sort" and "mergekey" each get a handle
+   from pp2_charset_create_a_to_z(); "facet" gets pp2_charset_create(0).
+   Returns the new collection; release it with pp2_charset_fact_destroy(). */
+pp2_charset_fact_t pp2_charset_fact_create(void)
+{
+    pp2_charset_fact_t pft = xmalloc(sizeof(*pft));
+    pft->list = 0;
+    pft->ref_count = 1;
+
+    pp2_charset_fact_add(pft, pp2_charset_create_a_to_z(), "relevance");
+    pp2_charset_fact_add(pft, pp2_charset_create_a_to_z(), "sort");
+    pp2_charset_fact_add(pft, pp2_charset_create_a_to_z(), "mergekey");
+    pp2_charset_fact_add(pft, pp2_charset_create(0), "facet");
+    return pft;
+}
+
+/* Drops one reference to the collection. When the count reaches zero,
+   every entry's charset handle, name and entry node are freed, followed
+   by the collection itself. A null pft is accepted and ignored. */
+void pp2_charset_fact_destroy(pp2_charset_fact_t pft)
+{
+    if (pft)
+    {
+        assert(pft->ref_count >= 1);
+        --(pft->ref_count);
+        if (pft->ref_count == 0)
+        {
+            struct pp2_charset_entry *pce = pft->list;
+            while (pce)
+            {
+                /* save the link before the node is freed */
+                struct pp2_charset_entry *next = pce->next;
+                pp2_charset_destroy(pce->pct);
+                xfree(pce->name);
+                xfree(pce);
+                pce = next;
+            }
+            xfree(pft);
+        }
+    }
+}
+
+/* Registers charset handle pct under the name default_id.
+   If an entry with that name already exists, its previous handle is
+   destroyed and replaced; otherwise a new entry (with its own copy of the
+   name) is put at the head of the list. The collection takes over pct:
+   it is later destroyed by a replacement or by pp2_charset_fact_destroy().
+   Always returns 0. */
+int pp2_charset_fact_add(pp2_charset_fact_t pft,
+                         pp2_charset_t pct, const char *default_id)
+{
+    struct pp2_charset_entry *pce;
+
+    /* look for an existing entry with the same name */
+    for (pce = pft->list; pce; pce = pce->next)
+        if (!strcmp(default_id, pce->name))
+            break;
+
+    if (!pce)
+    {
+        pce = xmalloc(sizeof(*pce));
+        pce->name = xstrdup(default_id);
+        pce->next = pft->list;
+        pft->list = pce;
+    }
+    else
+    {
+        /* same name defined again: drop the old handle */
+        pp2_charset_destroy(pce->pct);
+    }
+    pce->pct = pct;
+    return 0;
+}
+
+/* Builds a charset handle from xml_node with pp2_charset_create_xml() and
+   registers it in pft. The name is taken from the "id" attribute of
+   xml_node when present; otherwise default_id is used.
+   Returns -1 if the handle could not be created or if neither an "id"
+   attribute nor default_id is available; otherwise returns the result of
+   pp2_charset_fact_add().
+   NOTE(review): the callers handling the legacy relevance/sort/mergekey/
+   facet elements pass n->children, which is presumably NULL for an empty
+   element; the assert below would then fire instead of a configuration
+   error being reported - confirm whether that is intended. */
+int pp2_charset_fact_define(pp2_charset_fact_t pft,
+                            xmlNode *xml_node, const char *default_id)
+{
+    int r;
+    pp2_charset_t pct;
+    xmlChar *id;
+
+    assert(xml_node);
+    pct = pp2_charset_create_xml(xml_node);
+    if (!pct)
+        return -1;
+    id = xmlGetProp(xml_node, (xmlChar*) "id");
+    if (id)
+        default_id = (const char *) id;
+    if (!default_id)
+    {
+        /* no name to register under: do not leak the new handle */
+        pp2_charset_destroy(pct);
+        return -1;
+    }
+    r = pp2_charset_fact_add(pft, pct, default_id);
+    /* pp2_charset_fact_add keeps its own copy of the name when needed */
+    xmlFree(id);
+    return r;
+}
+
+/* Adds one reference to the collection; each call is balanced by one
+   call to pp2_charset_fact_destroy(). pft is dereferenced without a
+   null check. */
+void pp2_charset_fact_incref(pp2_charset_fact_t pft)
+{
+    (pft->ref_count)++;
+}
 
 pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node)
 {
 #if YAZ_HAVE_ICU
     UErrorCode status = U_ZERO_ERROR;
     struct icu_chain *chain = 0;
-    if (xml_node)
-        xml_node = xml_node->children;
     while (xml_node && xml_node->type != XML_ELEMENT_NODE)
         xml_node = xml_node->next;
     chain = icu_chain_xml_config(xml_node, 1, &status);
@@ -108,11 +214,6 @@ pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node)
 #endif // YAZ_HAVE_ICU
 }
 
-void pp2_charset_incref(pp2_charset_t pct)
-{
-    (pct->ref_count)++;
-}
-
 pp2_charset_t pp2_charset_create_a_to_z(void)
 {
     pp2_charset_t pct = pp2_charset_create(0);
@@ -127,7 +228,6 @@ pp2_charset_t pp2_charset_create(struct icu_chain *icu_chn)
     pct->token_next_handler = pp2_relevance_token_null;
     pct->get_sort_handler  = pp2_get_sort_ascii;
     pct->get_display_handler  = pp2_get_display_ascii;
-    pct->ref_count = 1;
 #if YAZ_HAVE_ICU
     pct->icu_chn = 0;
     if (icu_chn)
@@ -144,18 +244,20 @@ pp2_charset_t pp2_charset_create(struct icu_chain *icu_chn)
 
+/* Frees a single charset handle (and its ICU chain when YAZ_HAVE_ICU is
+   set). Unlike the removed version there is no reference counting and no
+   null check: with YAZ_HAVE_ICU set, pct is dereferenced unconditionally. */
 void pp2_charset_destroy(pp2_charset_t pct)
 {
-    if (pct)
-    {
-        assert(pct->ref_count >= 1);
-        --(pct->ref_count);
-        if (pct->ref_count == 0)
-        {
 #if YAZ_HAVE_ICU
-            icu_chain_destroy(pct->icu_chn);
+    icu_chain_destroy(pct->icu_chn);
 #endif
-            xfree(pct);
-        }
-    }
+    xfree(pct);
+}
+
+/* Looks up the charset registered under id (exact strcmp match) in pft
+   and returns the result of pp2_relevance_tokenize() for it.
+   Returns 0 when no entry with that id exists. */
+pp2_relevance_token_t pp2_relevance_create(pp2_charset_fact_t pft,
+                                           const char *id)
+{
+    struct pp2_charset_entry *pce;
+    for (pce = pft->list; pce; pce = pce->next)
+        if (!strcmp(id, pce->name))
+            return pp2_relevance_tokenize(pce->pct);
+    return 0;
+}
 
 pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct)
index 4efb3f0..cc9f269 100644 (file)
@@ -27,19 +27,9 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #include <yaz/wrbuf.h>
 #include <yaz/xmltypes.h>
 
-struct icu_chain;
-
-typedef struct pp2_charset_s *pp2_charset_t;
 typedef struct pp2_relevance_token_s *pp2_relevance_token_t;
+typedef struct pp2_charset_fact_s *pp2_charset_fact_t;
 
-pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node);
-pp2_charset_t pp2_charset_create(struct icu_chain * icu_chn);
-pp2_charset_t pp2_charset_create_a_to_z(void);
-
-void pp2_charset_destroy(pp2_charset_t pct);
-void pp2_charset_incref(pp2_charset_t pct);
-
-pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct);
 void pp2_relevance_first(pp2_relevance_token_t prt,
                          const char *buf,
                          int skip_article);
@@ -49,6 +39,13 @@ const char *pp2_relevance_token_next(pp2_relevance_token_t prt);
 const char *pp2_get_sort(pp2_relevance_token_t prt);
 const char *pp2_get_display(pp2_relevance_token_t prt);
 
+pp2_charset_fact_t pp2_charset_fact_create(void);
+void pp2_charset_fact_destroy(pp2_charset_fact_t pft);
+int pp2_charset_fact_define(pp2_charset_fact_t pft,
+                            xmlNode *xml_node, const char *default_id);
+pp2_relevance_token_t pp2_relevance_create(pp2_charset_fact_t pft,
+                                           const char *id);
+void pp2_charset_fact_incref(pp2_charset_fact_t pft);
 #endif
 
 /*
index c8945f8..6bc8e4f 100644 (file)
@@ -1085,7 +1085,7 @@ int client_parse_query(struct client *cl, const char *query,
         char *p[512];
         extract_terms(se->nmem, cn, p);
         se->relevance = relevance_create(
-            se->service->relevance_pct,
+            se->service->charsets,
             se->nmem, (const char **) p);
     }
 
index f42cfae..df050c7 100644 (file)
@@ -126,10 +126,7 @@ static struct conf_service *service_init(struct conf_server *server,
     service->z3950_session_timeout = 180;
     service->z3950_operation_timeout = 30;
 
-    service->relevance_pct = 0;
-    service->sort_pct = 0;
-    service->mergekey_pct = 0;
-    service->facet_pct = 0;
+    service->charsets = 0;
 
     service->id = service_id ? nmem_strdup(nmem, service_id) : 0;
     service->num_metadata = num_metadata;
@@ -246,10 +243,7 @@ void service_destroy(struct conf_service *service)
     {
         if (!pazpar2_decref(&service->ref_count, service->mutex))
         {
-            pp2_charset_destroy(service->relevance_pct);
-            pp2_charset_destroy(service->sort_pct);
-            pp2_charset_destroy(service->mergekey_pct);
-            pp2_charset_destroy(service->facet_pct);
+            pp2_charset_fact_destroy(service->charsets);
             yaz_mutex_destroy(&service->mutex);
             nmem_destroy(service->nmem);
         }
@@ -531,61 +525,30 @@ static struct conf_service *service_create_static(struct conf_server *server,
         }
         else if (!strcmp((const char *) n->name, "settings"))
             got_settings++;
-        else if (!strcmp((const char *) n->name, "relevance"))
+        else if (!strcmp((const char *) n->name, "icu_chain"))
         {
-            if (service->relevance_pct)
+            if (!service->charsets)
+                service->charsets = pp2_charset_fact_create();
+            if (pp2_charset_fact_define(service->charsets, n, 0))
             {
-                yaz_log(YLOG_LOG, "relevance may not repeat in service");
+                yaz_log(YLOG_FATAL, "ICU chain definition error");
                 return 0;
             }
-            else
-            {
-                service->relevance_pct = pp2_charset_create_xml(n);
-                if (!service->relevance_pct)
-                    return 0;
-            }
-        }
-        else if (!strcmp((const char *) n->name, "sort"))
-        {
-            if (service->sort_pct)
-            {
-                yaz_log(YLOG_LOG, "sort may not repeat in service");
-                return 0;
-            }
-            else
-            {
-                service->sort_pct = pp2_charset_create_xml(n);
-                if (!service->sort_pct)
-                    return 0;
-            }
         }
-        else if (!strcmp((const char *) n->name, "mergekey"))
-        {
-            if (service->mergekey_pct)
-            {
-                yaz_log(YLOG_LOG, "mergekey may not repeat in service");
-                return 0;
-            }
-            else
-            {
-                service->mergekey_pct = pp2_charset_create_xml(n);
-                if (!service->mergekey_pct)
-                    return 0;
-            }
-        }
-        else if (!strcmp((const char *) n->name, "facet"))
+        else if (!strcmp((const char *) n->name, "relevance")
+                 || !strcmp((const char *) n->name, "sort")
+                 || !strcmp((const char *) n->name, "mergekey")
+                 || !strcmp((const char *) n->name, "facet"))
+
         {
-            if (service->facet_pct)
+            if (!service->charsets)
+                service->charsets = pp2_charset_fact_create();
+            if (pp2_charset_fact_define(service->charsets,
+                                        n->children, (const char *) n->name))
             {
-                yaz_log(YLOG_LOG, "facet may not repeat in service");
+                yaz_log(YLOG_FATAL, "ICU chain definition error");
                 return 0;
             }
-            else
-            {
-                service->facet_pct = pp2_charset_create_xml(n);
-                if (!service->facet_pct)
-                    return 0;
-            }
         }
         else if (!strcmp((const char *) n->name, (const char *) "metadata"))
         {
@@ -675,48 +638,17 @@ static void inherit_server_settings(struct conf_service *s)
     
     /* use relevance/sort/mergekey/facet from server if not defined
        for this service.. */
-    if (!s->relevance_pct)
+    if (!s->charsets)
     {
-        if (server->relevance_pct)
+        if (server->charsets)
         {
-            s->relevance_pct = server->relevance_pct;
-            pp2_charset_incref(s->relevance_pct);
+            s->charsets = server->charsets;
+            pp2_charset_fact_incref(s->charsets);
         }
         else
-            s->relevance_pct = pp2_charset_create_a_to_z();
-    }
-    
-    if (!s->sort_pct)
-    {
-        if (server->sort_pct)
-        {
-            s->sort_pct = server->sort_pct;
-            pp2_charset_incref(s->sort_pct);
-        }
-        else
-            s->sort_pct = pp2_charset_create_a_to_z();
-    }
-    
-    if (!s->mergekey_pct)
-    {
-        if (server->mergekey_pct)
-        {
-            s->mergekey_pct = server->mergekey_pct;
-            pp2_charset_incref(s->mergekey_pct);
-        }
-        else
-            s->mergekey_pct = pp2_charset_create_a_to_z();
-    }
-
-    if (!s->facet_pct)
-    {
-        if (server->facet_pct)
         {
-            s->facet_pct = server->facet_pct;
-            pp2_charset_incref(s->facet_pct);
+            s->charsets = pp2_charset_fact_create();
         }
-        else
-            s->facet_pct = pp2_charset_create(0);
     }
 }
 
@@ -750,10 +682,7 @@ static struct conf_server *server_create(struct conf_config *config,
     server->service = 0;
     server->config = config;
     server->next = 0;
-    server->relevance_pct = 0;
-    server->sort_pct = 0;
-    server->mergekey_pct = 0;
-    server->facet_pct = 0;
+    server->charsets = 0;
     server->server_settings = 0;
     server->http_server = 0;
     server->iochan_man = 0;
@@ -806,30 +735,30 @@ static struct conf_server *server_create(struct conf_config *config,
             if (!(server->server_settings = parse_settings(config, nmem, n)))
                 return 0;
         }
-        else if (!strcmp((const char *) n->name, "relevance"))
+        else if (!strcmp((const char *) n->name, "icu_chain"))
         {
-            server->relevance_pct = pp2_charset_create_xml(n);
-            if (!server->relevance_pct)
-                return 0;
-        }
-        else if (!strcmp((const char *) n->name, "sort"))
-        {
-            server->sort_pct = pp2_charset_create_xml(n);
-            if (!server->sort_pct)
-                return 0;
-        }
-        else if (!strcmp((const char *) n->name, "mergekey"))
-        {
-            server->mergekey_pct = pp2_charset_create_xml(n);
-            if (!server->mergekey_pct)
+            if (!server->charsets)
+                server->charsets = pp2_charset_fact_create();
+            if (pp2_charset_fact_define(server->charsets, n, 0))
+            {
+                yaz_log(YLOG_FATAL, "ICU chain definition error");
                 return 0;
+            }
         }
-        else if (!strcmp((const char *) n->name, "facet"))
+        else if (!strcmp((const char *) n->name, "relevance")
+                 || !strcmp((const char *) n->name, "sort")
+                 || !strcmp((const char *) n->name, "mergekey")
+                 || !strcmp((const char *) n->name, "facet"))
         {
-            server->facet_pct = pp2_charset_create_xml(n);
-            if (!server->facet_pct)
+            if (!server->charsets)
+                server->charsets = pp2_charset_fact_create();
+            if (pp2_charset_fact_define(server->charsets,
+                                        n->children, (const char *) n->name))
+            {
+                yaz_log(YLOG_FATAL, "ICU chain definition error");
                 return 0;
-        }
+            }            
+        }            
         else if (!strcmp((const char *) n->name, "service"))
         {
             char *service_id = (char *)
@@ -1033,10 +962,7 @@ void server_destroy(struct conf_server *server)
         service_destroy(s);
         s = s_next;
     }
-    pp2_charset_destroy(server->relevance_pct);
-    pp2_charset_destroy(server->sort_pct);
-    pp2_charset_destroy(server->mergekey_pct);
-    pp2_charset_destroy(server->facet_pct);
+    pp2_charset_fact_destroy(server->charsets);
     yaz_log(YLOG_LOG, "server_destroy server=%p", server);
     http_server_destroy(server->http_server);
 }
index f6cf6d0..8a1ae06 100644 (file)
@@ -117,10 +117,7 @@ struct conf_service
 
     int ref_count;
     /* duplicated from conf_server */
-    pp2_charset_t relevance_pct;
-    pp2_charset_t sort_pct;
-    pp2_charset_t mergekey_pct;
-    pp2_charset_t facet_pct;
+    pp2_charset_fact_t charsets;
 
     struct database *databases;
     struct conf_server *server;
@@ -140,10 +137,7 @@ struct conf_server
     char *server_settings;
     char *server_id;
 
-    pp2_charset_t relevance_pct;
-    pp2_charset_t sort_pct;
-    pp2_charset_t mergekey_pct;
-    pp2_charset_t facet_pct;
+    pp2_charset_fact_t charsets;
 
     struct conf_service *service;
     struct conf_server *next;
index 680d8f6..4df7750 100644 (file)
@@ -120,7 +120,7 @@ void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
     cluster->term_frequency_vec[0] += length;
 }
 
-struct relevance *relevance_create(pp2_charset_t pct,
+struct relevance *relevance_create(pp2_charset_fact_t pft,
                                    NMEM nmem, const char **terms)
 {
     struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance));
@@ -133,7 +133,7 @@ struct relevance *relevance_create(pp2_charset_t pct,
     res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
     memset(res->doc_frequency_vec, 0, res->vec_len * sizeof(int));
     res->nmem = nmem;
-    res->prt = pp2_relevance_tokenize(pct);
+    res->prt = pp2_relevance_create(pft, "relevance");
     res->entries = build_word_entries(res->prt, nmem, terms);
     return res;
 }
index cb82601..e357382 100644 (file)
@@ -27,7 +27,7 @@ struct relevance;
 struct record_cluster;
 struct reclist;
 
-struct relevance *relevance_create(pp2_charset_t pct,
+struct relevance *relevance_create(pp2_charset_fact_t pft,
                                    NMEM nmem, const char **terms);
 void relevance_destroy(struct relevance **rp);
 void relevance_newrec(struct relevance *r, struct record_cluster *cluster);
index 9b69b38..fa9ebb3 100644 (file)
@@ -200,8 +200,17 @@ void add_facet(struct session *s, const char *type, const char *value, int count
             icu_chain_id = (service->metadata + i)->icu_chain;
     yaz_log(YLOG_LOG, "icu_chain id=%s", icu_chain_id ? icu_chain_id : "null");
 
-    prt = pp2_relevance_tokenize(service->facet_pct);
-    
+    if (!icu_chain_id)
+        icu_chain_id = "facet";
+    prt = pp2_relevance_create(service->charsets, icu_chain_id);
+    if (!prt)
+    {
+        yaz_log(YLOG_FATAL, "Unknown ICU chain '%s' for facet of type '%s'",
+                icu_chain_id, type);
+        wrbuf_destroy(facet_wrbuf);
+        wrbuf_destroy(display_wrbuf);
+        return;
+    }
     pp2_relevance_first(prt, value, 0);
     while ((facet_component = pp2_relevance_token_next(prt)))
     {
@@ -236,6 +245,7 @@ void add_facet(struct session *s, const char *type, const char *value, int count
             {
                 session_log(s, YLOG_FATAL, "Too many termlists");
                 wrbuf_destroy(facet_wrbuf);
+                wrbuf_destroy(display_wrbuf);
                 return;
             }
             
@@ -1126,7 +1136,7 @@ static int get_mergekey_from_doc(xmlDoc *doc, xmlNode *root, const char *name,
                 {
                     const char *norm_str;
                     pp2_relevance_token_t prt =
-                        pp2_relevance_tokenize(service->mergekey_pct);
+                        pp2_relevance_create(service->charsets, "mergekey");
                     
                     pp2_relevance_first(prt, (const char *) value, 0);
                     if (wrbuf_len(norm_wr) > 0)
@@ -1165,7 +1175,7 @@ static const char *get_mergekey(xmlDoc *doc, struct client *cl, int record_no,
     {
         const char *norm_str;
         pp2_relevance_token_t prt =
-            pp2_relevance_tokenize(service->mergekey_pct);
+            pp2_relevance_create(service->charsets, "mergekey");
 
         pp2_relevance_first(prt, (const char *) mergekey, 0);
         while ((norm_str = pp2_relevance_token_next(prt)))
@@ -1472,7 +1482,7 @@ static int ingest_to_cluster(struct client *cl,
                                 nmem_malloc(se->nmem, 
                                             sizeof(union data_types));
                          
-                        prt = pp2_relevance_tokenize(service->sort_pct);
+                        prt = pp2_relevance_create(service->charsets, "sort");
 
                         pp2_relevance_first(prt, rec_md->data.text.disp,
                                             skip_article);
index 08dd6de..f24007e 100644 (file)
       </icu_chain>
     </sort>
     
-    <mergekey>
-      <icu_chain locale="en">
-       <tokenize rule="l"/>
-       <transform rule="[[:Control:][:WhiteSpace:][:Punctuation:]] Remove"/>
-       <casemap rule="l"/>
-      </icu_chain>
-    </mergekey>
+    <icu_chain id="mergekey" locale="en">
+      <tokenize rule="l"/>
+      <transform rule="[[:Control:][:WhiteSpace:][:Punctuation:]] Remove"/>
+      <casemap rule="l"/>
+    </icu_chain>
     
-    <facet>
-      <icu_chain locale="en">
-       <transform rule="Title"/>
-      </icu_chain>
-    </facet>
+    <icu_chain id="facet" locale="en">
+      <transform rule="Title"/>
+    </icu_chain>
+
+    <icu_chain id="mychain" locale="en">
+      <display/>
+      <transform rule="Title"/>
+    </icu_chain>
     
     <service>
       <timeout session="30" z3950_operation="20" z3950_session="40"/>
@@ -45,7 +46,8 @@
       <metadata name="isbn"/>
       <metadata name="date" brief="yes" sortkey="numeric" type="year" merge="range"
                termlist="yes"/>
-      <metadata name="author" brief="yes" termlist="yes" merge="longest" rank="2"/>
+      <metadata name="author" brief="yes" termlist="yes"
+                merge="longest" rank="2" icu_chain="mychain"/>
       <metadata name="subject" merge="unique" termlist="yes" rank="3"/>
       <metadata name="id"/>
       <metadata name="lccn" merge="unique"/>
index 2f49a59..a200b10 100644 (file)
@@ -2,9 +2,9 @@
 <activeclients>0</activeclients>
 <list name="author">
 <term><name>Jack Collins</name><frequency>2</frequency></term>
-<term><name>Mairs, John W</name><frequency>1</frequency></term>
-<term><name>Wood, Helen M</name><frequency>1</frequency></term>
-<term><name>Englund, Carl R</name><frequency>1</frequency></term>
+<term><name>Mairs, John W.</name><frequency>1</frequency></term>
+<term><name>Wood, Helen M.</name><frequency>1</frequency></term>
+<term><name>Englund, Carl R.</name><frequency>1</frequency></term>
 </list>
 <list name="subject">
 <term><name>Radioisotope Scanning</name><frequency>1</frequency></term>