#include <unicode/utrans.h>
+// #define ICU_CHAIN_SORTKEY
+#undef ICU_CHAIN_SORTKEY
/* declared structs and functions */
enum icu_chain_step_type {
ICU_chain_step_type_none,
ICU_chain_step_type_display, /* convert to utf8 display format */
+#ifdef ICU_CHAIN_SORTKEY
ICU_chain_step_type_index, /* convert to utf8 index format */
ICU_chain_step_type_sortkey, /* convert to utf8 sortkey format */
+#endif
ICU_chain_step_type_casemap, /* apply utf16 charmap */
ICU_chain_step_type_normalize, /* apply utf16 normalization */
ICU_chain_step_type_tokenize /* apply utf16 tokenization */
/* State of one ICU chain: locale, collator, token counter and steps. */
struct icu_chain
{
- uint8_t identifier[128];
uint8_t locale[16]; /* NUL-terminated locale name; icu_chain_create() copies at most 15 bytes */
+ int sort; /* stored as passed to icu_chain_create(); NOTE(review): meaning not visible here - confirm */
+
+ UCollator * coll; /* opened by ucol_open() on locale in icu_chain_create(); closed in icu_chain_destroy() */
/* number of tokens returned so far */
int32_t token_count;
struct icu_chain_step * steps; /* step the token loop pulls from (chain->steps->buf16) */
};
/* Allocates a chain for locale, stores sort and opens the chain's collator.
   Returns 0 when *status reports a failure. */
-struct icu_chain * icu_chain_create( // const uint8_t * identifier,
- const uint8_t * locale);
+struct icu_chain * icu_chain_create(const uint8_t * locale,
+ int sort,
+ UErrorCode * status);
void icu_chain_destroy(struct icu_chain * chain);
/* Builds a chain from xml_node; locale, sort and status are forwarded to
   icu_chain_create(). Returns 0 when the chain cannot be created. */
struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
- const uint8_t * locale,
+ const uint8_t * locale,
+ int sort,
UErrorCode * status);
-
struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
enum icu_chain_step_type type,
const uint8_t * rule,
UErrorCode *status);
-
int icu_chain_step_next_token(struct icu_chain * chain,
struct icu_chain_step * step,
UErrorCode *status);
const char * icu_chain_get_sort(struct icu_chain * chain);
/* Returns the collator stored in chain->coll. */
+const UCollator * icu_chain_get_coll(struct icu_chain * chain);
+
#endif /* ICU_I18NL_H */
/*
* Copyright (C) 1995-2007, Index Data ApS
* See the file LICENSE for details.
*
- * $Id: icu_I18N.c,v 1.5 2007-10-24 14:48:17 marc Exp $
+ * $Id: icu_I18N.c,v 1.6 2007-10-25 08:32:50 marc Exp $
*/
#if HAVE_CONFIG_H
switch(step->type) {
case ICU_chain_step_type_display:
break;
+#ifdef ICU_CHAIN_SORTKEY
case ICU_chain_step_type_index:
break;
case ICU_chain_step_type_sortkey:
break;
+#endif
case ICU_chain_step_type_casemap:
step->u.casemap = icu_casemap_create((char *) chain->locale,
(char) rule[0], status);
switch(step->type) {
case ICU_chain_step_type_display:
break;
+#ifdef ICU_CHAIN_SORTKEY
case ICU_chain_step_type_index:
break;
case ICU_chain_step_type_sortkey:
break;
+#endif
case ICU_chain_step_type_casemap:
icu_casemap_destroy(step->u.casemap);
icu_buf_utf16_destroy(step->buf16);
-struct icu_chain * icu_chain_create( //const uint8_t * identifier,
- const uint8_t * locale)
+/*
+ * Allocate a chain for LOCALE and open the collator belonging to it.
+ *
+ * locale: locale name; at most 15 bytes are copied into the chain.
+ * sort:   stored unchanged in chain->sort.
+ * status: ICU error code, in/out. Set to U_MEMORY_ALLOCATION_ERROR when
+ *         the chain cannot be allocated; otherwise set by ucol_open().
+ *
+ * Returns the new chain, to be released with icu_chain_destroy(), or 0
+ * on failure. Nothing stays allocated when 0 is returned.
+ */
+struct icu_chain * icu_chain_create(const uint8_t * locale,
+ int sort,
+ UErrorCode * status)
{
struct icu_chain * chain
= (struct icu_chain *) malloc(sizeof(struct icu_chain));
+
+ /* out of memory: report it through status instead of writing to NULL */
+ if (!chain) {
+     *status = U_MEMORY_ALLOCATION_ERROR;
+     return 0;
+ }
+
- //strncpy((char *) chain->identifier, (const char *) identifier, 128);
- //chain->identifier[128 - 1] = '\0';
strncpy((char *) chain->locale, (const char *) locale, 16);
chain->locale[16 - 1] = '\0';
+ chain->sort = sort;
+
+ chain->coll = ucol_open((const char *) chain->locale, status);
+
+ /* the caller never sees the chain on failure, so release it here */
+ if (U_FAILURE(*status)) {
+     free(chain);
+     return 0;
+ }
+
chain->token_count = 0;
chain->display8 = icu_buf_utf8_create(0);
chain->steps = 0;
+
return chain;
}
void icu_chain_destroy(struct icu_chain * chain)
{
if (chain){
+
+ if (chain->coll)
+ ucol_close(chain->coll);
+
icu_buf_utf8_destroy(chain->display8);
icu_buf_utf8_destroy(chain->norm8);
icu_buf_utf8_destroy(chain->sort8);
struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
const uint8_t * locale,
+ int sort,
UErrorCode * status){
xmlNode *node = 0;
return 0;
- chain = icu_chain_create( // (const uint8_t *) xml_id,
- locale);
+ chain = icu_chain_create(locale, sort, status);
if (!chain)
return 0;
step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
(const uint8_t *) "", status);
}
+#ifdef ICU_CHAIN_SORTKEY
else if (!strcmp((const char *) node->name,
(const char *) "index")){
step = icu_chain_insert_step(chain, ICU_chain_step_type_index,
step = icu_chain_insert_step(chain, ICU_chain_step_type_sortkey,
(const uint8_t *) "", status);
}
-
+#endif
xmlFree(xml_rule);
if (!step || U_FAILURE(*status)){
icu_chain_destroy(chain);
case ICU_chain_step_type_display:
buf16 = src16;
break;
+#ifdef ICU_CHAIN_SORTKEY
case ICU_chain_step_type_index:
buf16 = src16;
break;
case ICU_chain_step_type_sortkey:
buf16 = src16;
break;
+#endif
case ICU_chain_step_type_casemap:
buf16 = icu_buf_utf16_create(0);
break;
case ICU_chain_step_type_display:
icu_utf16_to_utf8(chain->display8, src16, status);
break;
+#ifdef ICU_CHAIN_SORTKEY
+ // TODO
case ICU_chain_step_type_index:
icu_utf16_to_utf8(chain->norm8, src16, status);
break;
case ICU_chain_step_type_sortkey:
icu_utf16_to_utf8(chain->sort8, src16, status);
+ //UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
+ // struct icu_buf_utf8 * dest8,
+ // struct icu_buf_utf16 * src16,
+ // UErrorCode * status);
break;
+#endif
case ICU_chain_step_type_casemap:
icu_casemap_casemap(step->u.casemap,
step->buf16, src16, status);
/* if token disappeared into thin air, tell caller */
if (!step->buf16->utf16_len)
return 0;
+
+ if (U_FAILURE(*status))
+ return 0;
return 1;
}
if (!chain || !chain->steps)
return 0;
- got_token = icu_chain_step_next_token(chain, chain->steps, status);
+ while(!got_token && chain->steps->more_tokens)
+ got_token = icu_chain_step_next_token(chain, chain->steps, status);
if (got_token){
chain->token_count++;
+
+ icu_utf16_to_utf8(chain->norm8, chain->steps->buf16, status);
+
+ icu_sortkey8_from_utf16(chain->coll,
+ chain->sort8, chain->steps->buf16, status);
+
return chain->token_count;
}
return 0;
}
+/*
+ * Return the collator stored in CHAIN, or 0 when CHAIN is NULL.
+ * The collator is the one icu_chain_destroy() closes, so callers must
+ * not close it themselves.
+ */
+const UCollator * icu_chain_get_coll(struct icu_chain * chain)
+{
+ /* tolerate a NULL chain, as icu_chain_destroy() does */
+ if (!chain)
+     return 0;
+ return chain->coll;
+}
+
+
+
#endif /* HAVE_ICU */
-/* $Id: tst_icu_I18N.c,v 1.6 2007-10-24 14:48:17 marc Exp $
+/* $Id: tst_icu_I18N.c,v 1.7 2007-10-25 08:32:51 marc Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
"<normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
"<display/>"
"<casemap rule=\"l\"/>"
- "<index/>"
- "<sortkey/>"
"</icu_chain>";
// printf("ICU chain:\ninput: '%s'\n", en_str);
- chain = icu_chain_xml_config(xml_node, (uint8_t *) "en", &status);
-
-#if 0
- chain = icu_chain_create((uint8_t *) "en:word", (uint8_t *) "en");
- step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
- (const uint8_t *) "[:Control:] Any-Remove",
- &status);
- step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
- (const uint8_t *) "s",
- &status);
- step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
- (const uint8_t *) "l",
- &status);
- step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
- (const uint8_t *)
- "[[:WhiteSpace:][:Punctuation:]] Any-Remove",
- &status);
- step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
- (const uint8_t *)"",
- &status);
-/* step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, */
-/* (const uint8_t *) "Lower", */
-/* &status); */
- step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
- (const uint8_t *) "l",
- &status);
- step = icu_chain_insert_step(chain, ICU_chain_step_type_index,
- (const uint8_t *)"",
- &status);
-/* step = icu_chain_insert_step(chain, ICU_chain_step_type_sortkey, */
-/* (const uint8_t *)"", */
-/* &status); */
-
-#endif
+ chain = icu_chain_xml_config(xml_node, (uint8_t *) "en", 0, &status);
xmlFreeDoc(doc);
YAZ_CHECK(chain);
"<normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
"<display/>"
"<casemap rule=\"l\"/>"
- "<index/>"
- "<sortkey/>"
"</icu_chain>";
xmlNode *xml_node = xmlDocGetRootElement(doc);
YAZ_CHECK(xml_node);
- chain = icu_chain_xml_config(xml_node, (uint8_t *) "en", &status);
+ chain = icu_chain_xml_config(xml_node, (uint8_t *) "en", 0, &status);
xmlFreeDoc(doc);
YAZ_CHECK(chain);
const char * xml_str = "<icu_chain>"
"<tokenize rule=\"w\"/>"
"<normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
- "<index/>"
"</icu_chain>";
xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
xmlNode *xml_node = xmlDocGetRootElement(doc);
YAZ_CHECK(xml_node);
- chain = icu_chain_xml_config(xml_node, (uint8_t *) "en", &status);
+ chain = icu_chain_xml_config(xml_node, (uint8_t *) "en", 0, &status);
xmlFreeDoc(doc);
YAZ_CHECK(chain);