From: Adam Dickmeiss Date: Mon, 10 Sep 2007 16:25:49 +0000 (+0000) Subject: Implemented sorting using ICU. Bug #1190. The Debian package now X-Git-Tag: PAZPAR2.1.0.3~11 X-Git-Url: http://lists.indexdata.dk/?a=commitdiff_plain;h=d7dc14dcdfbd1ecdc805a0d649203f3b9888749c;p=pazpar2-moved-to-github.git Implemented sorting using ICU. Bug #1190. The Debian package now enables ICU by default. Added new tests for ICU enabled Pazpar2 - test skipped if ICU is not enabled. --- diff --git a/NEWS b/NEWS index 03bbd98..93c3b55 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,8 @@ +ICU is used for tokenization and normalization of the following: mergekey, +sorting, relevance terms. + +Debian package now enables ICU tokenization and normalization by default. + --- 1.0.2 2007/08/22 Exposed user setting values (i.e. non-pz: names) to the record systems in two diff --git a/debian/changelog b/debian/changelog index 2bd8c57..5aabe05 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +pazpar2 (1.0.2-9) unstable; urgency=low + + * ICU enabled by default for the Debian package. + * ICU for sorting. + + -- Adam Dickmeiss Mon, 10 Sep 2007 10:44:03 +0200 + pazpar2 (1.0.2-8) unstable; urgency=low * Bug fixes #1395, #1507. diff --git a/doc/pazpar2_conf.xml b/doc/pazpar2_conf.xml index 2e294bd..8eb16c2 100644 --- a/doc/pazpar2_conf.xml +++ b/doc/pazpar2_conf.xml @@ -9,7 +9,7 @@ %idcommon; ]> - + Pazpar2 @@ -103,11 +103,11 @@ - icu_chain + relevance - Definition of ICU tokenization and normalization rules - are used if ICU support is compiled in. The 'id' + Specifies ICU tokenization and normalization rules + for tokens that are used in Pazpar2's relevance ranking. The 'id' attribute is currently not used, and the 'locale' attribute must be set to one of the locale strings defined in ICU. The child elements listed below can be @@ -167,6 +167,28 @@ + + + sort + + + Specifies ICU tokenization and normalization rules + for tokens that are used in Pazpar2's sorting. The contents + is similar to that of relevance. + + + + + + mergekey + + + Specifies ICU tokenization and normalization rules + for tokens that are used in Pazpar2's mergekey. The contents + is similar to that of relevance. + + + service @@ -568,7 +590,7 @@ - + diff --git a/etc/pazpar2.cfg.dist b/etc/pazpar2.cfg.dist index 831b0de..7c497c4 100644 --- a/etc/pazpar2.cfg.dist +++ b/etc/pazpar2.cfg.dist @@ -1,11 +1,38 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/etc/pazpar2icu.cfg.dist b/etc/pazpar2icu.cfg.dist deleted file mode 100644 index ba69ec3..0000000 --- a/etc/pazpar2icu.cfg.dist +++ /dev/null @@ -1,35 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/src/charsets.c b/src/charsets.c index 2d5dad3..7d41283 100644 --- a/src/charsets.c +++ b/src/charsets.c @@ -1,4 +1,4 @@ -/* $Id: charsets.c,v 1.5 2007-05-25 10:32:55 marc Exp $ +/* $Id: charsets.c,v 1.6 2007-09-10 16:25:50 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -34,8 +34,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include #include "charsets.h" -//#include "config.h" -//#include "parameters.h" +#include "normalize7bit.h" #ifdef HAVE_ICU #include "icu_I18N.h" @@ -44,7 +43,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA /* charset handle */ struct pp2_charset_s { const char *(*token_next_handler)(pp2_relevance_token_t prt); - /* other handlers will come as we see fit */ + const char *(*get_sort_handler)(pp2_relevance_token_t prt, int skip); #ifdef HAVE_ICU struct icu_chain * icu_chn; UErrorCode icu_sts; @@ -52,31 +51,72 @@ struct pp2_charset_s { }; static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt); +static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt, int skip_article); #ifdef HAVE_ICU static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt); +static const char *pp2_get_sort_icu(pp2_relevance_token_t prt, int skip_article); #endif // HAVE_ICU /* tokenzier handle */ struct pp2_relevance_token_s { const char *cp; /* unnormalized buffer we're tokenizing */ + const char *last_cp; /* pointer to last token we're dealing with */ pp2_charset_t pct; /* our main charset handle (type+config) */ WRBUF norm_str; /* normized string we return (temporarily) */ + WRBUF sort_str; /* sort string we return (temporarily) */ }; + +pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node) +{ +#ifdef HAVE_ICU + UErrorCode status = U_ZERO_ERROR; + while (xml_node && xml_node->type != XML_ELEMENT_NODE) + xml_node = xml_node->next; + struct icu_chain *chain = icu_chain_xml_config(xml_node, &status); + if (!chain || U_FAILURE(status)){ + //xmlDocPtr icu_doc = 0; + //xmlChar *xmlstr = 0; + //int size = 0; + //xmlDocDumpMemory(icu_doc, size); + + yaz_log(YLOG_FATAL, "Could not parse ICU chain config:\n" + "<%s>\n ... \n", + xml_node->name, xml_node->name); + return 0; + } + return pp2_charset_create(chain); +#else // HAVE_ICU + yaz_log(YLOG_FATAL, "Error: ICU support requested with element:\n" + "<%s>\n ... \n", + n->name, n->name); + yaz_log(YLOG_FATAL, + "But no ICU support compiled into pazpar2 server."); + yaz_log(YLOG_FATAL, + "Please install libicu36-dev and icu-doc or similar, " + "re-configure and re-compile"); + return 0; +#endif // HAVE_ICU +} + + pp2_charset_t pp2_charset_create(struct icu_chain * icu_chn) { pp2_charset_t pct = xmalloc(sizeof(*pct)); pct->token_next_handler = pp2_relevance_token_a_to_z; + pct->get_sort_handler = pp2_get_sort_ascii; #ifdef HAVE_ICU pct->icu_chn = 0; - if (icu_chn){ + if (icu_chn) + { pct->icu_chn = icu_chn; pct->icu_sts = U_ZERO_ERROR; pct->token_next_handler = pp2_relevance_token_icu; + pct->get_sort_handler = pp2_get_sort_icu; } - #endif // HAVE_ICU +#endif // HAVE_ICU return pct; } @@ -93,7 +133,9 @@ pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct, assert(pct); prt->norm_str = wrbuf_alloc(); + prt->sort_str = wrbuf_alloc(); prt->cp = buf; + prt->last_cp = 0; prt->pct = pct; #ifdef HAVE_ICU @@ -104,7 +146,6 @@ pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct, ok = icu_chain_assign_cstr(pct->icu_chn, buf, &pct->icu_sts); //printf("\nfield ok: %d '%s'\n", ok, buf); prt->pct = pct; - prt->norm_str = 0; } #endif // HAVE_ICU return prt; @@ -116,6 +157,8 @@ void pp2_relevance_token_destroy(pp2_relevance_token_t prt) assert(prt); if(prt->norm_str) wrbuf_destroy(prt->norm_str); + if(prt->sort_str) + wrbuf_destroy(prt->sort_str); xfree(prt); } @@ -125,7 +168,12 @@ const char *pp2_relevance_token_next(pp2_relevance_token_t prt) return (prt->pct->token_next_handler)(prt); } -#define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 1 : -1) +const char *pp2_get_sort(pp2_relevance_token_t prt, int skip) +{ + return prt->pct->get_sort_handler(prt, skip); +} + +#define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) : -1) /* original tokenizer with our tokenize interface, but we add +1 to ensure no '\0' are in our string (except for EOF) */ @@ -140,9 +188,12 @@ static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt) if (*cp == '\0') { prt->cp = cp; + prt->last_cp = 0; return 0; } /* now read the term itself */ + + prt->last_cp = cp; wrbuf_rewind(prt->norm_str); while (*cp && (c = raw_char(tolower(*cp))) >= 0) { @@ -153,24 +204,45 @@ static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt) return wrbuf_cstr(prt->norm_str); } +static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt, + int skip_article) +{ + if (prt->last_cp == 0) + return 0; + else + { + char *tmp = xstrdup(prt->last_cp); + char *result = 0; + result = normalize7bit_mergekey(tmp, skip_article); + + wrbuf_rewind(prt->sort_str); + wrbuf_puts(prt->sort_str, result); + xfree(tmp); + return wrbuf_cstr(prt->sort_str); + } +} + #ifdef HAVE_ICU static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt) { - //&& U_SUCCESS(pct->icu_sts)) - if (icu_chain_next_token(prt->pct->icu_chn, &prt->pct->icu_sts)){ - //printf("'%s' ", icu_chain_get_norm(prt->pct->icu_chn)); + if (icu_chain_next_token(prt->pct->icu_chn, &prt->pct->icu_sts)) + { if (U_FAILURE(prt->pct->icu_sts)) { - //printf("ICU status failure\n "); return 0; } - return icu_chain_get_norm(prt->pct->icu_chn); } - //printf ("EOF\n"); return 0; -}; +} + +static const char *pp2_get_sort_icu(pp2_relevance_token_t prt, + int skip_article) +{ + return icu_chain_get_sort(prt->pct->icu_chn); +} + #endif // HAVE_ICU diff --git a/src/charsets.h b/src/charsets.h index 9e350e7..0e61130 100644 --- a/src/charsets.h +++ b/src/charsets.h @@ -1,4 +1,4 @@ -/* $Id: charsets.h,v 1.2 2007-05-23 14:44:18 marc Exp $ +/* $Id: charsets.h,v 1.3 2007-09-10 16:25:50 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -26,12 +26,15 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #ifndef PAZPAR_CHARSETS_H #define PAZPAR_CHARSETS_H +#include +#include struct icu_chain; typedef struct pp2_charset_s *pp2_charset_t; typedef struct pp2_relevance_token_s *pp2_relevance_token_t; +pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node); pp2_charset_t pp2_charset_create(struct icu_chain * icu_chn); void pp2_charset_destroy(pp2_charset_t pct); @@ -39,6 +42,16 @@ pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct, const char *buf); void pp2_relevance_token_destroy(pp2_relevance_token_t prt); const char *pp2_relevance_token_next(pp2_relevance_token_t prt); +const char *pp2_get_sort(pp2_relevance_token_t prt, int skip_article); + +#if 0 +typedef int pp2_charset_normalize_t(pp2_charset_t pct, + const char *buf, + WRBUF norm_str, WRBUF sort_str, + int skip_article); + +pp2_charset_normalize_t pp2_charset_metadata_norm; +#endif #endif diff --git a/src/client.c b/src/client.c index 25dae7a..b4306a7 100644 --- a/src/client.c +++ b/src/client.c @@ -1,4 +1,4 @@ -/* $Id: client.c,v 1.19 2007-09-05 08:40:12 adam Exp $ +/* $Id: client.c,v 1.20 2007-09-10 16:25:50 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -946,9 +946,10 @@ int client_parse_query(struct client *cl, const char *query) // Initialize relevance structure with query terms char *p[512]; extract_terms(se->nmem, cn, p); - se->relevance = relevance_create(client_get_database(cl)->pct, - se->nmem, (const char **) p, - se->expected_maxrecs); + se->relevance = relevance_create( + global_parameters.server->relevance_pct, + se->nmem, (const char **) p, + se->expected_maxrecs); } ccl_rpn_delete(cn); diff --git a/src/config.c b/src/config.c index 256fa4e..359738f 100644 --- a/src/config.c +++ b/src/config.c @@ -1,4 +1,4 @@ -/* $Id: config.c,v 1.40 2007-07-30 23:16:33 quinn Exp $ +/* $Id: config.c,v 1.41 2007-09-10 16:25:50 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -19,7 +19,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -/* $Id: config.c,v 1.40 2007-07-30 23:16:33 quinn Exp $ */ +/* $Id: config.c,v 1.41 2007-09-10 16:25:50 adam Exp $ */ #include @@ -430,11 +430,9 @@ static struct conf_server *parse_server(xmlNode *node) server->service = 0; server->next = 0; server->settings = 0; - -#ifdef HAVE_ICU - server->icu_chn = 0; -#endif // HAVE_ICU - + server->relevance_pct = 0; + server->sort_pct = 0; + server->mergekey_pct = 0; for (n = node->children; n; n = n->next) { @@ -483,34 +481,17 @@ static struct conf_server *parse_server(xmlNode *node) if (!(server->settings = parse_settings(n))) return 0; } - else if (!strcmp((const char *) n->name, "icu_chain")) + else if (!strcmp((const char *) n->name, "relevance")) { -#ifdef HAVE_ICU - UErrorCode status = U_ZERO_ERROR; - struct icu_chain *chain = icu_chain_xml_config(n, &status); - if (!chain || U_FAILURE(status)){ - //xmlDocPtr icu_doc = 0; - //xmlChar *xmlstr = 0; - //int size = 0; - //xmlDocDumpMemory(icu_doc, size); - - yaz_log(YLOG_FATAL, "Could not parse ICU chain config:\n" - "<%s>\n ... \n", - n->name, n->name); - return 0; - } - server->icu_chn = chain; -#else // HAVE_ICU - yaz_log(YLOG_FATAL, "Error: ICU support requested with element:\n" - "<%s>\n ... \n", - n->name, n->name); - yaz_log(YLOG_FATAL, - "But no ICU support compiled into pazpar2 server."); - yaz_log(YLOG_FATAL, - "Please install libicu36-dev and icu-doc or similar, " - "re-configure and re-compile"); - return 0; -#endif // HAVE_ICU + server->relevance_pct = pp2_charset_create_xml(n->children); + } + else if (!strcmp((const char *) n->name, "sort")) + { + server->sort_pct = pp2_charset_create_xml(n->children); + } + else if (!strcmp((const char *) n->name, "mergekey")) + { + server->mergekey_pct = pp2_charset_create_xml(n->children); } else if (!strcmp((const char *) n->name, "service")) { @@ -525,6 +506,12 @@ static struct conf_server *parse_server(xmlNode *node) return 0; } } + if (!server->relevance_pct) + server->relevance_pct = pp2_charset_create(0); + if (!server->sort_pct) + server->sort_pct = pp2_charset_create(0); + if (!server->mergekey_pct) + server->mergekey_pct = pp2_charset_create(0); return server; } diff --git a/src/config.h b/src/config.h index 1f51ee4..744cfb1 100644 --- a/src/config.h +++ b/src/config.h @@ -1,4 +1,4 @@ -/* $Id: config.h,v 1.26 2007-07-30 11:52:08 quinn Exp $ +/* $Id: config.h,v 1.27 2007-09-10 16:25:50 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -27,11 +27,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include #include - -#ifdef HAVE_ICU -#include "icu_I18N.h" -#endif // HAVE_ICU - +#include "charsets.h" enum conf_metadata_type { Metadata_type_generic, // Generic text field @@ -159,9 +155,9 @@ struct conf_server char *myurl; char *settings; -#ifdef HAVE_ICU - struct icu_chain * icu_chn; -#endif // HAVE_ICU + pp2_charset_t relevance_pct; + pp2_charset_t sort_pct; + pp2_charset_t mergekey_pct; struct conf_service *service; struct conf_server *next; diff --git a/src/http_command.c b/src/http_command.c index 74fd2d4..052f025 100644 --- a/src/http_command.c +++ b/src/http_command.c @@ -1,4 +1,4 @@ -/* $Id: http_command.c,v 1.61 2007-09-05 09:13:32 adam Exp $ +/* $Id: http_command.c,v 1.62 2007-09-10 16:25:50 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -20,7 +20,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA */ /* - * $Id: http_command.c,v 1.61 2007-09-05 09:13:32 adam Exp $ + * $Id: http_command.c,v 1.62 2007-09-10 16:25:50 adam Exp $ */ #include @@ -455,7 +455,7 @@ static void write_metadata(WRBUF w, struct conf_service *service, switch (cmd->type) { case Metadata_type_generic: - wrbuf_xmlputs(w, md->data.text); + wrbuf_xmlputs(w, md->data.text.disp); break; case Metadata_type_year: wrbuf_printf(w, "%d", md->data.number.min); diff --git a/src/logic.c b/src/logic.c index 8ba450c..8e05608 100644 --- a/src/logic.c +++ b/src/logic.c @@ -1,4 +1,4 @@ -/* $Id: logic.c,v 1.65 2007-09-07 10:27:14 adam Exp $ +/* $Id: logic.c,v 1.66 2007-09-10 16:25:50 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -708,15 +708,6 @@ static void session_init_databases_fun(void *context, struct database *db) new->database = db; new->yaz_marc = 0; -#ifdef HAVE_ICU - if (global_parameters.server && global_parameters.server->icu_chn) - new->pct = pp2_charset_create(global_parameters.server->icu_chn); - else - new->pct = pp2_charset_create(0); -#else // HAVE_ICU - new->pct = pp2_charset_create(0); -#endif // HAVE_ICU - new->map = 0; new->settings = nmem_malloc(se->session_nmem, sizeof(struct settings *) * num); @@ -740,8 +731,6 @@ static void session_database_destroy(struct session_database *sdb) xsltFreeStylesheet(m->stylesheet); if (sdb->yaz_marc) yaz_marc_destroy(sdb->yaz_marc); - if (sdb->pct) - pp2_charset_destroy(sdb->pct); } // Initialize session_database list -- this represents this session's view @@ -862,7 +851,6 @@ struct session *new_session(NMEM nmem) session->watchlist[i].data = 0; session->watchlist[i].fun = 0; } - return session; } @@ -1084,7 +1072,8 @@ static struct record_metadata *record_metadata_init( char * p = value; p = normalize7bit_generic(p, " ,/.:(["); - rec_md->data.text = nmem_strdup(nmem, p); + rec_md->data.text.disp = nmem_strdup(nmem, p); + rec_md->data.text.sort = 0; } else if (type == Metadata_type_year) { @@ -1112,6 +1101,9 @@ struct record *ingest_record(struct client *cl, Z_External *rec, xmlChar *type = 0; xmlChar *value = 0; struct conf_service *service = global_parameters.server->service; + const char *norm_str = 0; + pp2_relevance_token_t prt = 0; + WRBUF norm_wr = 0; if (!xdoc) return 0; @@ -1123,15 +1115,34 @@ struct record *ingest_record(struct client *cl, Z_External *rec, xmlFreeDoc(xdoc); return 0; } - + record = record_create(se->nmem, service->num_metadata, service->num_sortkeys, cl, record_no); - mergekey_norm = (xmlChar *) nmem_strdup(se->nmem, (char*) mergekey); - xmlFree(mergekey); - normalize7bit_mergekey((char *) mergekey_norm, 0); + prt = pp2_relevance_tokenize( + global_parameters.server->mergekey_pct, (const char *) mergekey); + + norm_wr = wrbuf_alloc(); + + while ((norm_str = pp2_relevance_token_next(prt))) + { + if (*norm_str) + { + if (wrbuf_len(norm_wr)) + wrbuf_puts(norm_wr, " "); + wrbuf_puts(norm_wr, norm_str); + } + } + + mergekey_norm = (xmlChar *)nmem_strdup(se->nmem, wrbuf_cstr(norm_wr)); + wrbuf_destroy(norm_wr); + + pp2_relevance_token_destroy(prt); + + xmlFree(mergekey); + cluster = reclist_insert(se->reclist, global_parameters.server->service, record, (char *) mergekey_norm, @@ -1146,99 +1157,119 @@ struct record *ingest_record(struct client *cl, Z_External *rec, return 0; } relevance_newrec(se->relevance, cluster); - - - // now parsing XML record and adding data to cluster or record metadata - for (n = root->children; n; n = n->next) - { - if (type) - xmlFree(type); - if (value) - xmlFree(value); - type = value = 0; - - if (n->type != XML_ELEMENT_NODE) - continue; - if (!strcmp((const char *) n->name, "metadata")) - { - struct conf_metadata *ser_md = 0; - struct conf_sortkey *ser_sk = 0; - struct record_metadata **wheretoput = 0; - struct record_metadata *rec_md = 0; - int md_field_id = -1; - int sk_field_id = -1; - - type = xmlGetProp(n, (xmlChar *) "type"); - value = xmlNodeListGetString(xdoc, n->children, 1); - - if (!type || !value || !*value) - continue; - - md_field_id - = conf_service_metadata_field_id(service, (const char *) type); - if (md_field_id < 0) - { - yaz_log(YLOG_WARN, - "Ignoring unknown metadata element: %s", type); - continue; - } - - ser_md = &service->metadata[md_field_id]; - - if (ser_md->sortkey_offset >= 0){ - sk_field_id = ser_md->sortkey_offset; - ser_sk = &service->sortkeys[sk_field_id]; - } - - // non-merged metadata - rec_md = record_metadata_init(se->nmem, (char *) value, - ser_md->type); - if (!rec_md) - { - yaz_log(YLOG_WARN, "bad metadata data '%s' for element '%s'", - value, type); - continue; - } - rec_md->next = record->metadata[md_field_id]; - record->metadata[md_field_id] = rec_md; - - // merged metadata - rec_md = record_metadata_init(se->nmem, (char *) value, - ser_md->type); - wheretoput = &cluster->metadata[md_field_id]; - - // and polulate with data: - // assign cluster or record based on merge action - if (ser_md->merge == Metadata_merge_unique) - { - struct record_metadata *mnode; - for (mnode = *wheretoput; mnode; mnode = mnode->next) - if (!strcmp((const char *) mnode->data.text, - rec_md->data.text)) - break; - if (!mnode) - { - rec_md->next = *wheretoput; - *wheretoput = rec_md; - } - } - else if (ser_md->merge == Metadata_merge_longest) - { - if (!*wheretoput - || strlen(rec_md->data.text) - > strlen((*wheretoput)->data.text)) - { - *wheretoput = rec_md; - if (ser_sk) - { - char *s = nmem_strdup(se->nmem, rec_md->data.text); - if (!cluster->sortkeys[sk_field_id]) - cluster->sortkeys[sk_field_id] = - nmem_malloc(se->nmem, - sizeof(union data_types)); - normalize7bit_mergekey(s, - (ser_sk->type == Metadata_sortkey_skiparticle)); - cluster->sortkeys[sk_field_id]->text = s; + + + // now parsing XML record and adding data to cluster or record metadata + for (n = root->children; n; n = n->next) + { + if (type) + xmlFree(type); + if (value) + xmlFree(value); + type = value = 0; + + if (n->type != XML_ELEMENT_NODE) + continue; + if (!strcmp((const char *) n->name, "metadata")) + { + struct conf_metadata *ser_md = 0; + struct conf_sortkey *ser_sk = 0; + struct record_metadata **wheretoput = 0; + struct record_metadata *rec_md = 0; + int md_field_id = -1; + int sk_field_id = -1; + + type = xmlGetProp(n, (xmlChar *) "type"); + value = xmlNodeListGetString(xdoc, n->children, 1); + + if (!type || !value || !*value) + continue; + + md_field_id + = conf_service_metadata_field_id(service, (const char *) type); + if (md_field_id < 0) + { + yaz_log(YLOG_WARN, + "Ignoring unknown metadata element: %s", type); + continue; + } + + ser_md = &service->metadata[md_field_id]; + + if (ser_md->sortkey_offset >= 0){ + sk_field_id = ser_md->sortkey_offset; + ser_sk = &service->sortkeys[sk_field_id]; + } + + // non-merged metadata + rec_md = record_metadata_init(se->nmem, (char *) value, + ser_md->type); + if (!rec_md) + { + yaz_log(YLOG_WARN, "bad metadata data '%s' for element '%s'", + value, type); + continue; + } + rec_md->next = record->metadata[md_field_id]; + record->metadata[md_field_id] = rec_md; + + // merged metadata + rec_md = record_metadata_init(se->nmem, (char *) value, + ser_md->type); + wheretoput = &cluster->metadata[md_field_id]; + + // and polulate with data: + // assign cluster or record based on merge action + if (ser_md->merge == Metadata_merge_unique) + { + struct record_metadata *mnode; + for (mnode = *wheretoput; mnode; mnode = mnode->next) + if (!strcmp((const char *) mnode->data.text.disp, + rec_md->data.text.disp)) + break; + if (!mnode) + { + rec_md->next = *wheretoput; + *wheretoput = rec_md; + } + } + else if (ser_md->merge == Metadata_merge_longest) + { + if (!*wheretoput + || strlen(rec_md->data.text.disp) + > strlen((*wheretoput)->data.text.disp)) + { + *wheretoput = rec_md; + if (ser_sk) + { + const char *sort_str = 0; + int skip_article = + ser_sk->type == Metadata_sortkey_skiparticle; + + if (!cluster->sortkeys[sk_field_id]) + cluster->sortkeys[sk_field_id] = + nmem_malloc(se->nmem, + sizeof(union data_types)); + + prt = pp2_relevance_tokenize( + global_parameters.server->sort_pct, + rec_md->data.text.disp); + + pp2_relevance_token_next(prt); + + sort_str = pp2_get_sort(prt, skip_article); + + cluster->sortkeys[sk_field_id]->text.disp = + rec_md->data.text.disp; + cluster->sortkeys[sk_field_id]->text.sort = + nmem_strdup(se->nmem, sort_str); +#if 0 + yaz_log(YLOG_LOG, "text disp=%s", + cluster->sortkeys[sk_field_id]->text.disp); + yaz_log(YLOG_LOG, "text sort=%s", + cluster->sortkeys[sk_field_id]->text.sort); +#endif + pp2_relevance_token_destroy(prt); } } } diff --git a/src/pazpar2.h b/src/pazpar2.h index e6d59d3..4f14b74 100644 --- a/src/pazpar2.h +++ b/src/pazpar2.h @@ -1,4 +1,4 @@ -/* $Id: pazpar2.h,v 1.49 2007-09-05 08:40:12 adam Exp $ +/* $Id: pazpar2.h,v 1.50 2007-09-10 16:25:50 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -96,7 +96,6 @@ struct database_retrievalmap { // for that session struct session_database { - pp2_charset_t pct; struct database *database; struct setting **settings; yaz_marc_t yaz_marc; diff --git a/src/reclists.c b/src/reclists.c index f9785a8..738fdc4 100644 --- a/src/reclists.c +++ b/src/reclists.c @@ -1,4 +1,4 @@ -/* $Id: reclists.c,v 1.22 2007-08-28 21:11:21 quinn Exp $ +/* $Id: reclists.c,v 1.23 2007-09-10 16:25:50 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -176,14 +176,14 @@ static int reclist_cmp(const void *p1, const void *p2) union data_types *ut2 = r2->sortkeys[s->offset]; switch (s->type) { - char *s1, *s2; + const char *s1, *s2; case Metadata_sortkey_relevance: res = r2->relevance - r1->relevance; break; case Metadata_sortkey_string: - s1 = ut1 ? ut1->text : ""; - s2 = ut2 ? ut2->text : ""; + s1 = ut1 ? ut1->text.sort : ""; + s2 = ut2 ? ut2->text.sort : ""; res = strcmp(s2, s1); if (res) { diff --git a/src/record.h b/src/record.h index ac1e14b..2071621 100644 --- a/src/record.h +++ b/src/record.h @@ -1,4 +1,4 @@ -/* $Id: record.h,v 1.11 2007-07-16 17:01:46 adam Exp $ +/* $Id: record.h,v 1.12 2007-09-10 16:25:50 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -28,7 +28,10 @@ struct client; struct conf_service; union data_types { - char *text; + struct { + const char *disp; + const char *sort; + } text; struct { int min; int max; diff --git a/src/test_record.c b/src/test_record.c index 99502b4..5b2af00 100644 --- a/src/test_record.c +++ b/src/test_record.c @@ -1,4 +1,4 @@ -/* $Id: test_record.c,v 1.8 2007-07-30 23:16:33 quinn Exp $ +/* $Id: test_record.c,v 1.9 2007-09-10 16:25:50 adam Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -45,7 +45,8 @@ void test_record(int argc, char **argv) struct client *client = 0; char * bla = "blabla"; union data_types data_text; - data_text.text = bla; + data_text.text.disp = bla; + data_text.text.sort = bla; union data_types data_num; diff --git a/test/Makefile.am b/test/Makefile.am index 61a89c2..ed2f00b 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -1,9 +1,9 @@ -# $Id: Makefile.am,v 1.4 2007-09-10 08:18:19 adam Exp $ +# $Id: Makefile.am,v 1.5 2007-09-10 16:25:51 adam Exp $ -check_SCRIPTS = test_http.sh +check_SCRIPTS = test_http.sh test_icu.sh EXTRA_DIST = run_pazpar2.sh marc21.xsl test_http.xml test_http.cfg \ - test_http_urls $(check_SCRIPTS) + test_http_urls test_icu_urls $(check_SCRIPTS) TESTS = $(check_SCRIPTS) @@ -13,4 +13,5 @@ CONFIG_CLEAN_FILES=*.log *.dif dist-hook: cp ${srcdir}/test_http_*.res $(distdir) + cp ${srcdir}/test_url_*.res $(distdir) diff --git a/test/test_icu.cfg b/test/test_icu.cfg new file mode 100644 index 0000000..0c2883e --- /dev/null +++ b/test/test_icu.cfg @@ -0,0 +1,62 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/test_icu.sh b/test/test_icu.sh new file mode 100755 index 0000000..b96b90b --- /dev/null +++ b/test/test_icu.sh @@ -0,0 +1,18 @@ +#!/bin/sh +# $Id: test_icu.sh,v 1.1 2007-09-10 16:25:51 adam Exp $ +# + +# srcdir might be set by make +srcdir=${srcdir:-"."} + +if test -x ../src/pazpar2; then + if ../src/pazpar2 -V |grep icu: >/dev/null; then + exec ${srcdir}/run_pazpar2.sh test_icu + fi +fi +exit 0 +# Local Variables: +# mode:shell-script +# sh-indentation: 2 +# sh-basic-offset: 4 +# End: diff --git a/test/test_icu_1.res b/test/test_icu_1.res new file mode 100644 index 0000000..ae5db74 --- /dev/null +++ b/test/test_icu_1.res @@ -0,0 +1 @@ +OK11 \ No newline at end of file diff --git a/test/test_icu_2.res b/test/test_icu_2.res new file mode 100644 index 0000000..48f18ee --- /dev/null +++ b/test/test_icu_2.res @@ -0,0 +1,13 @@ +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + \ No newline at end of file diff --git a/test/test_icu_3.res b/test/test_icu_3.res new file mode 100644 index 0000000..d2e0e58 --- /dev/null +++ b/test/test_icu_3.res @@ -0,0 +1 @@ +OK \ No newline at end of file diff --git a/test/test_icu_4.res b/test/test_icu_4.res new file mode 100644 index 0000000..8f26b70 --- /dev/null +++ b/test/test_icu_4.res @@ -0,0 +1,78 @@ + +OK +0 +9 +10 +0 +9 + + +Washington metropolitan area rail computer feasibility study; +final report +1971 +Englund, Carl R +"Contract DOT-UT-10003." +title washington metropolitan area rail computer feasibility study author englund carl r medium book + + + +The use of passwords for controlled access to computer resources +1977 +Wood, Helen M +title the use of passwords for controlled access to computer resources author wood helen m medium book + + + +The Puget Sound Region +a portfolio of thematic computer maps +1974 +Mairs, John W +Scale of maps ca. 1:1,000,000 +title the puget sound region author mairs john w medium book + + + +The Computer Bible +1973-1980 +Vols. 2, 8: Missoula, Mont. : Published by Scholars Press for Biblical Research Associates +title the computer bible author medium book + + + +Reconstruction tomography in diagnostic radiology and nuclear medicine +proceedings of the workshop +1977 +Includes bibliographical references and index +title reconstruction tomography in diagnostic radiology and nuclear medicine author medium book + + + +How to program a computer +Jack Collins + +2 +title how to program a computer author jack collins medium book + + + +Computer science & technology +proceedings of a workshop held at the National Bureau of Standards, Gaithersburg, MD, June 3-4, 1976 +1977 +title computer science technology author medium book + + + +Computer processing of dynamic images from an Anger scintillation camera +the proceedings of a workshop +1974 +Includes bibliographical references and index +title computer processing of dynamic images from an anger scintillation camera author medium book + + + +A plan for community college computer development +1971 +Cover title +title a plan for community college computer development author medium book + + diff --git a/test/test_icu_5.res b/test/test_icu_5.res new file mode 100644 index 0000000..18a25a4 --- /dev/null +++ b/test/test_icu_5.res @@ -0,0 +1,78 @@ + +OK +0 +9 +10 +0 +9 + + +A plan for community college computer development +1971 +Cover title +title a plan for community college computer development author medium book + + + +Computer processing of dynamic images from an Anger scintillation camera +the proceedings of a workshop +1974 +Includes bibliographical references and index +title computer processing of dynamic images from an anger scintillation camera author medium book + + + +Computer science & technology +proceedings of a workshop held at the National Bureau of Standards, Gaithersburg, MD, June 3-4, 1976 +1977 +title computer science technology author medium book + + + +How to program a computer +Jack Collins + +2 +title how to program a computer author jack collins medium book + + + +Reconstruction tomography in diagnostic radiology and nuclear medicine +proceedings of the workshop +1977 +Includes bibliographical references and index +title reconstruction tomography in diagnostic radiology and nuclear medicine author medium book + + + +The Computer Bible +1973-1980 +Vols. 2, 8: Missoula, Mont. : Published by Scholars Press for Biblical Research Associates +title the computer bible author medium book + + + +The Puget Sound Region +a portfolio of thematic computer maps +1974 +Mairs, John W +Scale of maps ca. 1:1,000,000 +title the puget sound region author mairs john w medium book + + + +The use of passwords for controlled access to computer resources +1977 +Wood, Helen M +title the use of passwords for controlled access to computer resources author wood helen m medium book + + + +Washington metropolitan area rail computer feasibility study; +final report +1971 +Englund, Carl R +"Contract DOT-UT-10003." +title washington metropolitan area rail computer feasibility study author englund carl r medium book + + diff --git a/test/test_icu_6.res b/test/test_icu_6.res new file mode 100644 index 0000000..53779f4 --- /dev/null +++ b/test/test_icu_6.res @@ -0,0 +1,78 @@ + +OK +0 +9 +10 +0 +9 + + +The Computer Bible +1973-1980 +Vols. 2, 8: Missoula, Mont. : Published by Scholars Press for Biblical Research Associates +title the computer bible author medium book + + + +Computer science & technology +proceedings of a workshop held at the National Bureau of Standards, Gaithersburg, MD, June 3-4, 1976 +1977 +title computer science technology author medium book + + + +Reconstruction tomography in diagnostic radiology and nuclear medicine +proceedings of the workshop +1977 +Includes bibliographical references and index +title reconstruction tomography in diagnostic radiology and nuclear medicine author medium book + + + +The use of passwords for controlled access to computer resources +1977 +Wood, Helen M +title the use of passwords for controlled access to computer resources author wood helen m medium book + + + +Computer processing of dynamic images from an Anger scintillation camera +the proceedings of a workshop +1974 +Includes bibliographical references and index +title computer processing of dynamic images from an anger scintillation camera author medium book + + + +The Puget Sound Region +a portfolio of thematic computer maps +1974 +Mairs, John W +Scale of maps ca. 1:1,000,000 +title the puget sound region author mairs john w medium book + + + +A plan for community college computer development +1971 +Cover title +title a plan for community college computer development author medium book + + + +Washington metropolitan area rail computer feasibility study; +final report +1971 +Englund, Carl R +"Contract DOT-UT-10003." +title washington metropolitan area rail computer feasibility study author englund carl r medium book + + + +How to program a computer +Jack Collins + +2 +title how to program a computer author jack collins medium book + + diff --git a/test/test_icu_7.res b/test/test_icu_7.res new file mode 100644 index 0000000..6a09703 --- /dev/null +++ b/test/test_icu_7.res @@ -0,0 +1,78 @@ + +OK +0 +9 +10 +0 +9 + + +A plan for community college computer development +1971 +Cover title +title a plan for community college computer development author medium book + + + +Washington metropolitan area rail computer feasibility study; +final report +1971 +Englund, Carl R +"Contract DOT-UT-10003." +title washington metropolitan area rail computer feasibility study author englund carl r medium book + + + +The Computer Bible +1973-1980 +Vols. 2, 8: Missoula, Mont. : Published by Scholars Press for Biblical Research Associates +title the computer bible author medium book + + + +Computer processing of dynamic images from an Anger scintillation camera +the proceedings of a workshop +1974 +Includes bibliographical references and index +title computer processing of dynamic images from an anger scintillation camera author medium book + + + +The Puget Sound Region +a portfolio of thematic computer maps +1974 +Mairs, John W +Scale of maps ca. 1:1,000,000 +title the puget sound region author mairs john w medium book + + + +Computer science & technology +proceedings of a workshop held at the National Bureau of Standards, Gaithersburg, MD, June 3-4, 1976 +1977 +title computer science technology author medium book + + + +Reconstruction tomography in diagnostic radiology and nuclear medicine +proceedings of the workshop +1977 +Includes bibliographical references and index +title reconstruction tomography in diagnostic radiology and nuclear medicine author medium book + + + +The use of passwords for controlled access to computer resources +1977 +Wood, Helen M +title the use of passwords for controlled access to computer resources author wood helen m medium book + + + +How to program a computer +Jack Collins + +2 +title how to program a computer author jack collins medium book + +