1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) 1995-2009 Index Data
3 * See the file LICENSE for details.
16 #include <yaz/xmalloc.h>
18 #include <yaz/icu_I18N.h>
27 #include <unicode/ustring.h> /* some more string fcns*/
28 #include <unicode/uchar.h> /* char names */
/* Kinds of processing step that can be linked into an ICU chain. */
30 enum icu_chain_step_type {
/* first enumerator (value 0); the !type guards in this file test for it */
31 ICU_chain_step_type_none,
32 ICU_chain_step_type_display, /* convert to utf8 display format */
33 ICU_chain_step_type_casemap, /* apply utf16 charmap */
34 ICU_chain_step_type_transform, /* apply utf16 transform */
35 ICU_chain_step_type_tokenize, /* apply utf16 tokenization */
36 ICU_chain_step_type_transliterate /* apply utf16 transliteration rules */
41 /* type and action object */
42 enum icu_chain_step_type type;
/* per-type action objects; code in this file reaches them as
   step->u.casemap, step->u.transform and step->u.tokenizer */
44 struct icu_casemap * casemap;
45 struct icu_transform * transform;
46 struct icu_tokenizer * tokenizer;
48 /* temporary post-action utf16 buffer */
49 struct icu_buf_utf16 * buf16;
/* the step that was at the head of the chain when this one was inserted
   (assigned from chain->steps in icu_chain_insert_step) */
50 struct icu_chain_step * previous;
/* UTF-8 input string; icu_chain_assign_cstr stores the caller's pointer
   here without copying the text */
60 const char * src8cstr;
64 /* number of tokens returned so far */
67 /* utf8 output buffers */
68 struct icu_buf_utf8 * display8;
69 struct icu_buf_utf8 * norm8;
70 struct icu_buf_utf8 * sort8;
72 /* utf16 source buffer */
73 struct icu_buf_utf16 * src16;
75 /* linked list of chain steps */
76 struct icu_chain_step * steps;
/* Report a failed ICU status: when U_FAILURE(status) holds, a YLOG_WARN
   entry carrying the numeric code and the u_errorName() text is logged. */
79 int icu_check_status(UErrorCode status)
81 if (U_FAILURE(status))
83 yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status));
/* Allocate one chain step and build the helper object (casemap, transform
   or tokenizer) that the requested step type calls for. */
89 static struct icu_chain_step *icu_chain_step_create(
90 struct icu_chain * chain, enum icu_chain_step_type type,
91 const uint8_t * rule, struct icu_buf_utf16 * buf16,
94 struct icu_chain_step * step = 0;
/* guard: null chain, null rule, or ICU_chain_step_type_none (value 0) */
96 if(!chain || !type || !rule)
99 step = (struct icu_chain_step *) xmalloc(sizeof(struct icu_chain_step));
105 /* create auxiliary objects */
108 case ICU_chain_step_type_display:
110 case ICU_chain_step_type_casemap:
/* only the first byte of the rule is handed to the casemap constructor */
111 step->u.casemap = icu_casemap_create(rule[0], status);
113 case ICU_chain_step_type_transform:
114 /* rule omitted. Only ID used */
115 step->u.transform = icu_transform_create((const char *) rule, 'f',
118 case ICU_chain_step_type_tokenize:
/* tokenizer is built from the chain's locale plus the first rule byte */
119 step->u.tokenizer = icu_tokenizer_create((char *) chain->locale,
120 (char) rule[0], status);
122 case ICU_chain_step_type_transliterate:
123 /* we pass a dummy ID to utrans_openU.. */
/* here the rule string itself is passed, alongside the fixed ID "custom" */
124 step->u.transform = icu_transform_create("custom", 'f',
125 (const char *) rule, status);
/* Tear down a step: the chain of earlier steps reachable through
   step->previous is destroyed by a recursive call, and the per-type helper
   object plus the step's utf16 buffer are released. */
134 static void icu_chain_step_destroy(struct icu_chain_step * step)
139 icu_chain_step_destroy(step->previous);
143 case ICU_chain_step_type_display:
145 case ICU_chain_step_type_casemap:
146 icu_casemap_destroy(step->u.casemap);
147 icu_buf_utf16_destroy(step->buf16);
/* transform and transliterate steps both own a u.transform object */
149 case ICU_chain_step_type_transform:
150 case ICU_chain_step_type_transliterate:
151 icu_transform_destroy(step->u.transform);
152 icu_buf_utf16_destroy(step->buf16);
154 case ICU_chain_step_type_tokenize:
155 icu_tokenizer_destroy(step->u.tokenizer);
156 icu_buf_utf16_destroy(step->buf16);
/* Allocate a chain for the given locale: the locale string is duplicated,
   a collator is opened for it, and the UTF-8 output buffers and the UTF-16
   source buffer are created. *status is reset to U_ZERO_ERROR first. */
164 struct icu_chain *icu_chain_create(const char *locale, int sort,
167 struct icu_chain * chain
168 = (struct icu_chain *) xmalloc(sizeof(struct icu_chain));
170 *status = U_ZERO_ERROR;
/* private copy of the locale; released with xfree in icu_chain_destroy */
172 chain->locale = xstrdup(locale);
176 chain->coll = ucol_open((const char *) chain->locale, status);
/* failure of ucol_open is detected here */
178 if (U_FAILURE(*status))
181 chain->token_count = 0;
185 chain->display8 = icu_buf_utf8_create(0);
186 chain->norm8 = icu_buf_utf8_create(0);
187 chain->sort8 = icu_buf_utf8_create(0);
189 chain->src16 = icu_buf_utf16_create(0);
/* Release everything a chain holds: the collator, the three UTF-8 output
   buffers, the UTF-16 source buffer, the linked steps and the locale copy. */
196 void icu_chain_destroy(struct icu_chain * chain)
201 ucol_close(chain->coll);
203 icu_buf_utf8_destroy(chain->display8);
204 icu_buf_utf8_destroy(chain->norm8);
205 icu_buf_utf8_destroy(chain->sort8);
207 icu_buf_utf16_destroy(chain->src16);
/* destroying the head step also destroys the earlier steps linked via ->previous */
209 icu_chain_step_destroy(chain->steps);
210 xfree(chain->locale);
/* Forward declaration: icu_chain_xml_config below calls this function
   before its definition appears. */
215 static struct icu_chain_step *icu_chain_insert_step(
216 struct icu_chain * chain, enum icu_chain_step_type type,
217 const uint8_t * rule, UErrorCode *status);
/* Build a chain from an XML element: the element's "locale" attribute
   selects the locale, and each child element named casemap, transform,
   transliterate, tokenize, display or normalize inserts a step. */
219 struct icu_chain * icu_chain_xml_config(const xmlNode *xml_node,
224 struct icu_chain * chain = 0;
226 *status = U_ZERO_ERROR;
/* guard: the configuration root must be a non-null element node */
228 if (!xml_node ||xml_node->type != XML_ELEMENT_NODE)
232 xmlChar * xml_locale = xmlGetProp((xmlNode *) xml_node,
233 (xmlChar *) "locale");
237 chain = icu_chain_create((const char *) xml_locale, sort, status);
/* walk the direct children of the configuration element */
245 for (node = xml_node->children; node; node = node->next)
248 struct icu_chain_step * step = 0;
250 if (node->type != XML_ELEMENT_NODE)
/* the "rule" attribute is the argument for most step kinds */
253 xml_rule = xmlGetProp(node, (xmlChar *) "rule");
255 if (!strcmp((const char *) node->name, "casemap"))
256 step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
257 (const uint8_t *) xml_rule, status);
258 else if (!strcmp((const char *) node->name, "transform"))
259 step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
260 (const uint8_t *) xml_rule, status);
261 else if (!strcmp((const char *) node->name, "transliterate"))
262 step = icu_chain_insert_step(chain, ICU_chain_step_type_transliterate,
263 (const uint8_t *) xml_rule, status);
264 else if (!strcmp((const char *) node->name, "tokenize"))
265 step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
266 (const uint8_t *) xml_rule, status);
/* display ignores the rule attribute; an empty string is passed so the
   non-null rule check in icu_chain_insert_step is satisfied */
267 else if (!strcmp((const char *) node->name, "display"))
268 step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
269 (const uint8_t *) "", status);
/* "normalize" is a deprecated alias: warn, then treat it as "transform" */
270 else if (!strcmp((const char *) node->name, "normalize"))
272 yaz_log(YLOG_WARN, "Element %s is deprecated. "
273 "Use transform instead", node->name);
274 step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
275 (const uint8_t *) xml_rule, status);
/* "index" and "sortkey" only produce a warning; no step is inserted */
277 else if (!strcmp((const char *) node->name, "index")
278 || !strcmp((const char *) node->name, "sortkey"))
280 yaz_log(YLOG_WARN, "Element %s is no longer needed. "
281 "Remove it from the configuration", node->name);
/* any other element name: warn and discard the chain built so far */
285 yaz_log(YLOG_WARN, "Unknown element %s", node->name);
286 icu_chain_destroy(chain);
/* a step was created but ICU reported failure: discard the chain */
290 if (step && U_FAILURE(*status))
292 icu_chain_destroy(chain);
/* Create a step of the given type with its own UTF-16 output buffer and
   link it in front of the chain's current head step. */
299 static struct icu_chain_step *icu_chain_insert_step(
300 struct icu_chain * chain, enum icu_chain_step_type type,
301 const uint8_t * rule, UErrorCode *status)
303 struct icu_chain_step * step = 0;
304 struct icu_buf_utf16 * src16 = 0;
305 struct icu_buf_utf16 * buf16 = 0;
/* guard: null chain, null rule, or ICU_chain_step_type_none (value 0) */
307 if (!chain || !type || !rule)
310 /* assign utf16 src buffers as needed */
/* the current head step's output buffer is preferred; the chain's own
   source buffer is the fallback */
311 if (chain->steps && chain->steps->buf16)
312 src16 = chain->steps->buf16;
313 else if (chain->src16)
314 src16 = chain->src16;
318 /* create utf16 destination buffers as needed, or */
321 case ICU_chain_step_type_display:
324 case ICU_chain_step_type_casemap:
325 buf16 = icu_buf_utf16_create(0);
327 case ICU_chain_step_type_transform:
328 case ICU_chain_step_type_transliterate:
329 buf16 = icu_buf_utf16_create(0);
331 case ICU_chain_step_type_tokenize:
332 buf16 = icu_buf_utf16_create(0);
338 /* create actual chain step with this buffer */
339 step = icu_chain_step_create(chain, type, rule, buf16, status);
/* the former head step becomes this step's predecessor */
341 step->previous = chain->steps;
/* Advance one step of the chain by a token. The input is the previous
   step's buf16 (that step being advanced by a recursive call) or, for the
   step without a predecessor, chain->src16. */
347 static int icu_chain_step_next_token(struct icu_chain * chain,
348 struct icu_chain_step * step,
351 struct icu_buf_utf16 * src16 = 0;
352 int got_new_token = 0;
/* guard: needs a chain with a source buffer and a step that still has tokens */
354 if (!chain || !chain->src16 || !step || !step->more_tokens)
357 /* assign utf16 src buffers as needed, advance in previous steps
358 tokens until non-zero token met, and setting stop condition */
362 src16 = step->previous->buf16;
363 /* tokens might be killed in previous steps, therefore looping */
365 while (step->need_new_token
366 && step->previous->more_tokens
369 = icu_chain_step_next_token(chain, step->previous, status);
372 { /* first step can only work once on chain->src16 input buffer */
373 src16 = chain->src16;
374 step->more_tokens = 0;
381 /* stop if nothing to process */
382 if (step->need_new_token && !got_new_token)
384 step->more_tokens = 0;
388 /* either an old token not finished yet, or a new token, thus
389 perform the work, eventually put this steps output in
390 step->buf16 or the chains UTF8 output buffers */
394 case ICU_chain_step_type_display:
/* display writes to the chain's UTF-8 display buffer, not to step->buf16 */
395 icu_utf16_to_utf8(chain->display8, src16, status);
397 case ICU_chain_step_type_casemap:
398 icu_casemap_casemap(step->u.casemap,
399 step->buf16, src16, status,
402 case ICU_chain_step_type_transform:
403 case ICU_chain_step_type_transliterate:
404 icu_transform_trans(step->u.transform,
405 step->buf16, src16, status);
407 case ICU_chain_step_type_tokenize:
408 /* attach to new src16 token only first time during splitting */
409 if (step->need_new_token)
411 icu_tokenizer_attach(step->u.tokenizer, src16, status);
412 step->need_new_token = 0;
415 /* splitting one src16 token into multiple buf16 tokens */
417 = icu_tokenizer_next_token(step->u.tokenizer,
418 step->buf16, status);
420 /* make sure to get new previous token if this one had been used up
421 by recursive call to _same_ step */
423 if (!step->more_tokens)
425 step->more_tokens = icu_chain_step_next_token(chain, step, status);
426 return step->more_tokens; /* avoid one token count too much! */
434 if (U_FAILURE(*status))
437 /* if token disappeared into thin air, tell caller */
438 /* if (!step->buf16->utf16_len && !step->more_tokens) */
/* chain whose steps, locale, collator and sort flag the iterator uses */
445 struct icu_chain *chain;
/* token fetched ahead by icu_iter_invoke; icu_iter_next hands it out */
446 struct icu_buf_utf16 *next;
/* UTF-8 text written by a display step; returned by icu_iter_get_display */
448 struct icu_buf_utf8 *display;
/* UTF-8 buffer returned by icu_iter_get_sortkey */
449 struct icu_buf_utf8 *sort8;
/* Debug helper: convert a UTF-16 buffer to UTF-8 in a temporary buffer and
   print it to stdout, or print "utf8:failure" when the conversion fails. */
452 static void utf16_print(struct icu_buf_utf16 *src16)
454 UErrorCode status = U_ZERO_ERROR;
456 struct icu_buf_utf8 *dst8 = icu_buf_utf8_create(0);
457 icu_utf16_to_utf8(dst8, src16, &status);
/* NOTE(review): 1234 looks like an arbitrary sentinel value - confirm intent */
459 assert(status != 1234);
460 if (U_FAILURE(status))
462 printf("utf8:failure\n");
466 p = icu_buf_utf8_to_cstr(dst8);
467 printf("utf8:%s\n", p);
/* the temporary UTF-8 buffer is released before returning */
469 icu_buf_utf8_destroy(dst8);
/* Run a step for the iterator: the earlier steps are invoked first through
   a recursive call on step->previous, and this step is then applied to the
   buffer they produced. */
472 struct icu_buf_utf16 *icu_iter_invoke(struct icu_iter *iter,
473 struct icu_chain_step *step,
474 struct icu_buf_utf16 *src)
480 struct icu_buf_utf16 *dst = icu_iter_invoke(iter, step->previous, src);
484 case ICU_chain_step_type_casemap:
/* the inner `src` shadows the parameter: it is the previous step's output */
487 struct icu_buf_utf16 *src = dst;
489 dst = icu_buf_utf16_create(0);
490 icu_casemap_casemap(step->u.casemap, dst, src, &iter->status,
491 iter->chain->locale);
/* the input buffer is consumed once the result buffer has been filled */
492 icu_buf_utf16_destroy(src);
495 case ICU_chain_step_type_tokenize:
498 struct icu_buf_utf16 *src = dst;
/* NOTE(review): src is destroyed right after being attached - confirm
   that icu_tokenizer_attach keeps its own copy of the text */
500 icu_tokenizer_attach(step->u.tokenizer, src, &iter->status);
501 icu_buf_utf16_destroy(src);
503 dst = icu_buf_utf16_create(0);
/* status is cleared before asking the tokenizer for the next token */
504 iter->status = U_ZERO_ERROR;
505 if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status))
/* no further token: the freshly created result buffer is dropped */
507 icu_buf_utf16_destroy(dst);
511 case ICU_chain_step_type_transform:
512 case ICU_chain_step_type_transliterate:
515 struct icu_buf_utf16 *src = dst;
516 dst = icu_buf_utf16_create(0);
517 icu_transform_trans(step->u.transform, dst, src, &iter->status);
518 icu_buf_utf16_destroy(src);
521 case ICU_chain_step_type_display:
/* display leaves dst as it is and records its UTF-8 form in iter->display */
523 icu_utf16_to_utf8(iter->display, dst, &iter->status);
/* Create an iterator over the tokens of a UTF-8 string: the text is
   converted to UTF-16 and pushed through the chain's steps so that the
   first token is already waiting in iter->next. */
532 struct icu_iter *icu_iter_create(struct icu_chain *chain,
533 const char *src8cstr)
539 struct icu_buf_utf16 *src16 = icu_buf_utf16_create(0);
540 struct icu_iter *iter = xmalloc(sizeof(*iter));
542 iter->status = U_ZERO_ERROR;
543 iter->display = icu_buf_utf8_create(0);
544 iter->sort8 = icu_buf_utf8_create(0);
546 icu_utf16_from_utf8_cstr(src16, src8cstr, &iter->status);
/* src16 is handed to the step chain as the initial input */
547 iter->next = icu_iter_invoke(iter, chain->steps, src16);
/* Dispose of an iterator; the display and sort-key UTF-8 buffers created in
   icu_iter_create are released here. */
552 void icu_iter_destroy(struct icu_iter *iter)
556 icu_buf_utf8_destroy(iter->display);
557 icu_buf_utf8_destroy(iter->sort8);
/* Hand out the token that was fetched ahead of time: it is converted to
   UTF-8 into `result`, and the following token is fetched to replace it. */
562 int icu_iter_next(struct icu_iter *iter, struct icu_buf_utf8 *result)
564 struct icu_buf_utf16 *last = iter->next;
/* the chain's sort flag decides whether a sort key is computed */
569 if (iter->chain->sort)
571 icu_sortkey8_from_utf16(iter->chain->coll,
575 icu_utf16_to_utf8(result, last, &iter->status);
/* a null source is passed for every fetch after the first */
576 iter->next = icu_iter_invoke(iter, iter->chain->steps, 0);
/* the token just delivered is no longer needed in UTF-16 form */
577 icu_buf_utf16_destroy(last);
/* Return the iterator's sort-key buffer as a C string. */
582 const char *icu_iter_get_sortkey(struct icu_iter *iter)
584 return icu_buf_utf8_to_cstr(iter->sort8);
/* Return the iterator's display buffer as a C string. */
587 const char *icu_iter_get_display(struct icu_iter *iter)
589 return icu_buf_utf8_to_cstr(iter->display);
/* Attach a new UTF-8 input string to the chain and reset the tokenization
   state so that icu_chain_next_token starts over from the first token. */
592 int icu_chain_assign_cstr(struct icu_chain * chain, const char * src8cstr,
595 struct icu_chain_step * stp = 0;
/* guard: both the chain and the input string are required */
597 if (!chain || !src8cstr)
/* the pointer is stored as-is; the text itself is not copied here */
600 chain->src8cstr = src8cstr;
604 /* clear token count */
605 chain->token_count = 0;
607 /* clear all steps stop states */
610 stp->more_tokens = 1;
611 stp->need_new_token = 1;
615 /* finally convert UTF8 to UTF16 string if needed */
/* the UTF-16 copy is only produced when there are steps or sorting is on */
616 if (chain->steps || chain->sort)
617 icu_utf16_from_utf8_cstr(chain->src16, chain->src8cstr, status);
619 if (U_FAILURE(*status))
/* Produce the next token of the chain's current input. The values returned
   on the visible paths are the updated chain->token_count. */
625 int icu_chain_next_token(struct icu_chain * chain, UErrorCode *status)
629 *status = U_ZERO_ERROR;
634 /* special case with no steps - same as index type binary */
637 if (chain->token_count)
641 chain->token_count++;
/* NOTE(review): this branch is labelled as the no-steps case, yet the sort
   key is built from chain->steps->buf16; if chain->steps is null here this
   dereferences a null pointer - chain->src16 may be what was intended,
   confirm */
644 icu_sortkey8_from_utf16(chain->coll,
645 chain->sort8, chain->steps->buf16,
647 return chain->token_count;
650 /* usual case, one or more icu chain steps existing */
/* keep asking the head step until it yields a token or runs out */
653 while (!got_token && chain->steps && chain->steps->more_tokens)
654 got_token = icu_chain_step_next_token(chain, chain->steps, status);
658 chain->token_count++;
/* the head step's output becomes the normalized UTF-8 token */
660 icu_utf16_to_utf8(chain->norm8, chain->steps->buf16, status);
663 icu_sortkey8_from_utf16(chain->coll,
664 chain->sort8, chain->steps->buf16,
666 return chain->token_count;
/* Return the chain's token counter (tokens returned so far). */
673 int icu_chain_token_number(struct icu_chain * chain)
678 return chain->token_count;
/* Return the chain's display buffer as a C string. */
681 const char * icu_chain_token_display(struct icu_chain * chain)
684 return icu_buf_utf8_to_cstr(chain->display8);
/* Return the normalized form of the current token. Two results are
   possible: the unmodified input string held in chain->src8cstr, or the
   contents of the norm8 buffer. */
689 const char * icu_chain_token_norm(struct icu_chain * chain)
692 return chain->src8cstr;
695 return icu_buf_utf8_to_cstr(chain->norm8);
/* Return the chain's sort-key buffer as a C string. */
700 const char * icu_chain_token_sortkey(struct icu_chain * chain)
703 return icu_buf_utf8_to_cstr(chain->sort8);
708 #endif /* YAZ_HAVE_ICU */
713 * c-file-style: "Stroustrup"
714 * indent-tabs-mode: nil
716 * vim: shiftwidth=4 tabstop=8 expandtab