-/* $Id: icu_I18N.c,v 1.14 2007-05-16 12:39:49 marc Exp $
+/* $Id: icu_I18N.c,v 1.22 2007-05-25 13:27:21 marc Exp $
Copyright (c) 2006-2007, Index Data.
This file is part of Pazpar2.
icu_buf_utf16_resize(dest16, src16->utf16_len * 2);
u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len);
+ dest16->utf16_len = src16->utf16_len;
return dest16;
};
+/**
+ * Allocate a case-mapping descriptor holding a copy of the locale name
+ * and the requested action.
+ *
+ * locale: NUL-terminated locale name; copied into the descriptor.
+ * action: one of 'l', 'u', 't' or 'f' (presumably lower/upper/title/fold
+ *         -- confirm against icu_utf16_casemap()).
+ * status: receives an ICU error code when the arguments are unusable or
+ *         memory is exhausted; may be null.
+ *
+ * Returns the new descriptor, to be released with icu_casemap_destroy(),
+ * or 0 on failure.  An unsupported action yields 0 without touching
+ * *status, as it always did.
+ */
+struct icu_casemap * icu_casemap_create(const char *locale, char action,
+                                        UErrorCode *status)
+{
+    struct icu_casemap * casemap = 0;
+    size_t locale_len = 0;
+
+    // reject unsupported actions before allocating anything
+    switch(action) {
+    case 'l':
+    case 'u':
+    case 't':
+    case 'f':
+        break;
+    default:
+        return 0;
+    }
+
+    if (!locale){
+        if (status)
+            *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    // NOTE(review): assumes casemap->locale is a fixed-size char array
+    // inside struct icu_casemap (the previous strcpy() into a freshly
+    // malloc'ed struct implies this) -- confirm against the header.
+    // sizeof does not evaluate its operand, so the null pointer is safe.
+    locale_len = strlen(locale);
+    if (locale_len >= sizeof(casemap->locale)){
+        if (status)
+            *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    casemap = (struct icu_casemap *) malloc(sizeof(struct icu_casemap));
+    if (!casemap){
+        if (status)
+            *status = U_MEMORY_ALLOCATION_ERROR;
+        return 0;
+    }
+
+    // length was checked above, so the terminating NUL fits as well
+    memcpy(casemap->locale, locale, locale_len + 1);
+    casemap->action = action;
+
+    return casemap;
+};
+
+/**
+ * Release a descriptor obtained from icu_casemap_create().
+ * A null pointer is accepted and ignored.
+ */
+void icu_casemap_destroy(struct icu_casemap * casemap)
+{
+    // free() is a no-op on a null pointer, so no guard is required
+    free(casemap);
+};
+
+
+/**
+ * Case-map src16 into dest16 using the locale and action stored in
+ * casemap.
+ *
+ * Returns 0 when casemap is null; otherwise returns whatever
+ * icu_utf16_casemap() returns for the stored locale/action.
+ */
+int icu_casemap_casemap(struct icu_casemap * casemap,
+                        struct icu_buf_utf16 * dest16,
+                        struct icu_buf_utf16 * src16,
+                        UErrorCode *status)
+{
+    if (casemap)
+        return icu_utf16_casemap(dest16, src16,
+                                 casemap->locale, casemap->action, status);
+
+    // nothing to do without a descriptor
+    return 0;
+};
+
+
int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
struct icu_buf_utf16 * src16,
const char *locale, char action,
-//struct icu_normalizer
-//{
-// char action;
-// struct icu_buf_utf16 * rules16;
-// UParseError parse_error[256];
-// UTransliterator * trans;
-//};
-
-
struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
UErrorCode *status)
{
UTRANS_FORWARD,
0, 0,
normalizer->parse_error, status);
+ // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans);
break;
case 'r':
normalizer->trans
UTRANS_REVERSE ,
0, 0,
normalizer->parse_error, status);
+ // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans);
break;
default:
*status = U_UNSUPPORTED_ERROR;
if (normalizer->rules16)
icu_buf_utf16_destroy(normalizer->rules16);
if (normalizer->trans)
+ {
+ // yaz_log(YLOG_LOG, "utrans_close %p", normalizer->trans);
utrans_close(normalizer->trans);
+ }
free(normalizer);
}
};
step = (struct icu_chain_step *) malloc(sizeof(struct icu_chain_step));
step->type = type;
- step->more_tokens = 0;
- if (buf16)
- step->buf16 = buf16;
- else
- step->buf16 = 0;
+ step->buf16 = buf16;
// create auxilary objects
switch(step->type) {
case ICU_chain_step_type_display:
break;
- case ICU_chain_step_type_norm:
+ case ICU_chain_step_type_index:
break;
- case ICU_chain_step_type_sort:
+ case ICU_chain_step_type_sortkey:
break;
- case ICU_chain_step_type_charmap:
+ case ICU_chain_step_type_casemap:
+ step->u.casemap = icu_casemap_create((char *) chain->locale,
+ (char) rule[0], status);
break;
case ICU_chain_step_type_normalize:
step->u.normalizer = icu_normalizer_create((char *) rule, 'f', status);
switch(step->type) {
case ICU_chain_step_type_display:
break;
- case ICU_chain_step_type_norm:
+ case ICU_chain_step_type_index:
break;
- case ICU_chain_step_type_sort:
+ case ICU_chain_step_type_sortkey:
break;
- case ICU_chain_step_type_charmap:
+ case ICU_chain_step_type_casemap:
+ icu_casemap_destroy(step->u.casemap);
icu_buf_utf16_destroy(step->buf16);
break;
case ICU_chain_step_type_normalize:
default:
break;
}
-
-
+ free(step);
};
void icu_chain_destroy(struct icu_chain * chain)
{
- icu_buf_utf8_destroy(chain->display8);
- icu_buf_utf8_destroy(chain->norm8);
- icu_buf_utf8_destroy(chain->sort8);
+ if (chain){ // a null chain is accepted and ignored
+ icu_buf_utf8_destroy(chain->display8); // UTF-8 buffers written by the display/index/sortkey steps
+ icu_buf_utf8_destroy(chain->norm8);
+ icu_buf_utf8_destroy(chain->sort8);
+
+ icu_buf_utf16_destroy(chain->src16); // UTF-16 input buffer of the chain
+
+ icu_chain_step_destroy(chain->steps); // tear down the step referenced by chain->steps
+ free(chain); // finally release the chain object itself
+ }
+};
+
- icu_buf_utf16_destroy(chain->src16);
- icu_chain_step_destroy(chain->steps);
+/**
+ * Build an ICU chain from an <icu_chain> XML element.
+ *
+ * The element must carry non-empty "id" and "locale" attributes.  Each
+ * child element named casemap, normalize or tokenize adds a step that
+ * uses the child's "rule" attribute; display, index and sortkey add a
+ * step with an empty rule.  Non-element children are skipped.
+ *
+ * Returns the configured chain, or 0 when xml_node is not a usable
+ * <icu_chain> element, the chain cannot be created, a child element has
+ * an unknown name, a step cannot be inserted, or *status reports a
+ * failure.  A partially built chain is destroyed before returning 0.
+ */
+struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
+                                        UErrorCode * status){
+
+    xmlNode *node = 0;
+    struct icu_chain * chain = 0;
+    xmlChar *xml_id = 0;
+    xmlChar *xml_locale = 0;
+
+    if (!xml_node
+        || xml_node->type != XML_ELEMENT_NODE
+        || strcmp((const char *) xml_node->name, "icu_chain"))
+
+        return 0;
+
+    xml_id = xmlGetProp(xml_node, (xmlChar *) "id");
+    xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale");
+
+    // only create the chain when both attributes are present and non-empty
+    if (xml_id && *xml_id && xml_locale && *xml_locale)
+        chain = icu_chain_create((const uint8_t *) xml_id,
+                                 (const uint8_t *) xml_locale);
+
+    // xmlGetProp() hands ownership to the caller: release both values on
+    // every path, including the one where only one attribute was found
+    if (xml_id)
+        xmlFree(xml_id);
+    if (xml_locale)
+        xmlFree(xml_locale);
+
+    if (!chain)
+        return 0;
+
+    for (node = xml_node->children; node; node = node->next)
+    {
+        if (node->type != XML_ELEMENT_NODE)
+            continue;
+
+        xmlChar *xml_rule = xmlGetProp(node, (xmlChar *) "rule");
+        struct icu_chain_step * step = 0;
+
+        if (!strcmp((const char *) node->name,
+                    (const char *) "casemap")){
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
+                                         (const uint8_t *) xml_rule, status);
+        }
+        else if (!strcmp((const char *) node->name,
+                         (const char *) "normalize")){
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
+                                         (const uint8_t *) xml_rule, status);
+        }
+        else if (!strcmp((const char *) node->name,
+                         (const char *) "tokenize")){
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
+                                         (const uint8_t *) xml_rule, status);
+        }
+        else if (!strcmp((const char *) node->name,
+                         (const char *) "display")){
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
+                                         (const uint8_t *) "", status);
+        }
+        else if (!strcmp((const char *) node->name,
+                         (const char *) "index")){
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_index,
+                                         (const uint8_t *) "", status);
+        }
+        else if (!strcmp((const char *) node->name,
+                         (const char *) "sortkey")){
+            step = icu_chain_insert_step(chain, ICU_chain_step_type_sortkey,
+                                         (const uint8_t *) "", status);
+        }
+
+        // the rule attribute is optional, so it may be null here
+        if (xml_rule)
+            xmlFree(xml_rule);
+
+        // an unknown element name leaves step at 0 and aborts the config
+        if (!step || U_FAILURE(*status)){
+            icu_chain_destroy(chain);
+            return 0;
+        }
+    }
+
+    return chain;
};
+
struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
enum icu_chain_step_type type,
const uint8_t * rule,
return 0;
- // assign utf16 destination buffers as needed, or
- // re-use previous uft18 buffer if this step does not touch it
+ // create utf16 destination buffers as needed, or
switch(type) {
case ICU_chain_step_type_display:
buf16 = src16;
break;
- case ICU_chain_step_type_norm:
+ case ICU_chain_step_type_index:
buf16 = src16;
break;
- case ICU_chain_step_type_sort:
+ case ICU_chain_step_type_sortkey:
buf16 = src16;
break;
- case ICU_chain_step_type_charmap:
+ case ICU_chain_step_type_casemap:
buf16 = icu_buf_utf16_create(0);
break;
case ICU_chain_step_type_normalize:
{
struct icu_buf_utf16 * src16 = 0;
- printf("icu_chain_step_next_token %d\n", (int) step);
+ //printf("icu_chain_step_next_token %d\n", (int) step);
if (!chain || !chain->src16 || !step || !step->more_tokens)
return 0;
// assign utf16 src buffers as neeed, advance in previous steps
- // tokens, and setting stop condition
+ // tokens until non-zero token met, and setting stop condition
if (step->previous){
src16 = step->previous->buf16;
- step->more_tokens
- = icu_chain_step_next_token(chain, step->previous, status);
+ if (step->need_new_token)
+ //while (step->more_tokens && !src16->utf16_len)
+ step->more_tokens
+ = icu_chain_step_next_token(chain, step->previous, status);
}
else { // first step can only work once on chain->src16 input buffer
src16 = chain->src16;
// stop if nothing to process
// i.e new token source was not properly assigned
- if (!step->more_tokens || !src16 || !src16->utf16_len) //
+ if (!step->more_tokens || !src16) // || !src16->utf16_len
return 0;
- printf("icu_chain_step_next_token %d working\n", (int) step);
+ //printf("icu_chain_step_next_token %d working\n", (int) step);
// perform the work, eventually put this steps output in
case ICU_chain_step_type_display:
icu_utf16_to_utf8(chain->display8, src16, status);
break;
- case ICU_chain_step_type_norm:
+ case ICU_chain_step_type_index:
icu_utf16_to_utf8(chain->norm8, src16, status);
break;
- case ICU_chain_step_type_sort:
+ case ICU_chain_step_type_sortkey:
icu_utf16_to_utf8(chain->sort8, src16, status);
break;
- case ICU_chain_step_type_charmap:
+ case ICU_chain_step_type_casemap:
+ icu_casemap_casemap(step->u.casemap,
+ step->buf16, src16, status);
break;
case ICU_chain_step_type_normalize:
icu_normalizer_normalize(step->u.normalizer,
step->buf16, src16, status);
break;
case ICU_chain_step_type_tokenize:
- // step->more_tokens
- // = icu_tokenizer_next_token(step->u.tokenizer,
- // step->buf16, status);
+ // attach to new src16 token only first time during splitting
+ if (step->need_new_token){
+ icu_tokenizer_attach(step->u.tokenizer, src16, status);
+ step->need_new_token = 0;
+ }
+ // splitting one src16 token into multiple buf16 tokens
+ step->more_tokens
+ = icu_tokenizer_next_token(step->u.tokenizer,
+ step->buf16, status);
+ // make sure to get new previous token if this one had been used up
+ if (step->previous && !step->more_tokens){
+ if (icu_chain_step_next_token(chain, step->previous, status)){
+ icu_tokenizer_attach(step->u.tokenizer, src16, status);
+ step->need_new_token = 0;
+ step->more_tokens
+ = icu_tokenizer_next_token(step->u.tokenizer,
+ step->buf16, status);
+ }
+ }
+ if (0 == step->more_tokens)
+ return 0;
break;
default:
return 0;
break;
}
-
- // stop further token processing if last step
- if (!step->previous)
+
+
+ // stop further token processing if last step and
+ // new tokens are needed from previous (non-existing) step
+ if (!step->previous && step->need_new_token)
step->more_tokens = 0;
+ //printf("%d %d %d\n",
+ // step->more_tokens, src16->utf16_len, step->buf16->utf16_len);
+
if (U_FAILURE(*status))
return 0;
const char * src8cstr,
UErrorCode *status)
{
- struct icu_chain_step * stp = chain->steps;
+ struct icu_chain_step * stp = 0;
if (!chain || !src8cstr)
return 0;
+
+ stp = chain->steps;
// clear token count
chain->token_count = 0;
while (stp){
stp->more_tokens = 1;
+ stp->need_new_token = 1;
stp = stp->previous;
}