1 /* $Id: icu_I18N.c,v 1.14 2007-05-16 12:39:49 marc Exp $
2 Copyright (c) 2006-2007, Index Data.
4 This file is part of Pazpar2.
6 Pazpar2 is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with Pazpar2; see the file LICENSE. If not, write to the
18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
28 #include <yaz/timing.h>
41 #include <unicode/ustring.h> /* some more string fcns*/
42 #include <unicode/uchar.h> /* char names */
45 //#include <unicode/ustdio.h>
46 //#include <unicode/utypes.h> /* Basic ICU data types */
47 #include <unicode/ucol.h>
48 //#include <unicode/ucnv.h> /* C Converter API */
49 //#include <unicode/uloc.h>
50 //#include <unicode/ubrk.h>
51 /* #include <unicode/unistr.h> */
/*
 * icu_check_status: test an ICU status code for failure.
 *
 * When U_FAILURE(status) holds, a message with the format
 * "ICU: <numeric status> <u_errorName(status)>" is emitted.
 *
 * NOTE(review): the call that receives the format string and the return
 * statements are not visible in this listing -- confirm the logging
 * function and the returned values against the complete file.
 */
56 int icu_check_status (UErrorCode status)
58 if(U_FAILURE(status)){
60 "ICU: %d %s\n", status, u_errorName(status));
/*
 * icu_buf_utf16_create: allocate a struct icu_buf_utf16 on the heap.
 *
 * In the visible lines a UChar array of `capacity` elements is allocated,
 * its first element is set to 0 and `capacity` is stored in utf16_cap.
 * Other callers in this file pass capacity 0.
 *
 * NOTE(review): the UChar array returned by malloc is written through on
 * the very next line without a NULL check.
 * NOTE(review): presumably the array allocation is guarded by a
 * `capacity > 0` test on a line not visible here -- confirm.
 */
69 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
71 struct icu_buf_utf16 * buf16
72 = (struct icu_buf_utf16 *) malloc(sizeof(struct icu_buf_utf16));
79 buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
80 buf16->utf16[0] = (UChar) 0;
81 buf16->utf16_cap = capacity;
/*
 * icu_buf_utf16_resize: change the capacity of a UTF-16 buffer.
 *
 * Visible behaviour: when the buffer has no array yet a fresh one of
 * `capacity` UChars is malloc'ed, otherwise the existing array is passed to
 * realloc.  The first element is then set to 0 and utf16_cap is updated.
 * The last two visible lines reset utf16_len and utf16_cap to 0.
 *
 * NOTE(review): presumably the final two assignments belong to a
 * `capacity == 0` branch that releases the array -- not visible, confirm.
 * NOTE(review): the malloc/realloc result is written through without a NULL
 * check.  If the realloc result is assigned straight back to buf16->utf16
 * (assignment target not visible), the old array leaks on failure.
 */
87 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
92 if (0 == buf16->utf16)
93 buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
96 = (UChar *) realloc(buf16->utf16, sizeof(UChar) * capacity);
97 buf16->utf16[0] = (UChar) 0;
99 buf16->utf16_cap = capacity;
105 buf16->utf16_len = 0;
106 buf16->utf16_cap = 0;
/*
 * icu_buf_utf16_copy: copy the UTF-16 content of src16 into dest16.
 *
 * If dest16's capacity is smaller than the source length, dest16 is resized
 * to twice the source length first.  Then src16->utf16_len UChars are
 * copied with u_strncpy.
 *
 * NOTE(review): no resize happens when utf16_cap == utf16_len, so there is
 * no room for a terminating 0, and u_strncpy does not append one when the
 * source fills the count -- confirm callers rely on utf16_len only.
 * NOTE(review): argument validation and the return statement are not
 * visible in this listing.
 */
114 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
115 struct icu_buf_utf16 * src16)
121 if (dest16->utf16_cap < src16->utf16_len)
122 icu_buf_utf16_resize(dest16, src16->utf16_len * 2);
124 u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len);
/*
 * icu_buf_utf16_destroy: dispose of a UTF-16 buffer.
 *
 * Called by the normalizer, chain-step and chain destroy routines in this
 * file.
 * NOTE(review): the body is not visible in this listing -- presumably it
 * frees the UChar array and the struct; confirm NULL handling.
 */
130 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16)
/*
 * icu_buf_utf8_create: allocate a struct icu_buf_utf8 on the heap.
 *
 * In the visible lines a byte array of `capacity` elements is allocated,
 * its first byte is set to 0 and `capacity` is stored in utf8_cap.
 * icu_chain_create calls this with capacity 0.
 *
 * NOTE(review): the byte array returned by malloc is written through on the
 * very next line without a NULL check.
 * NOTE(review): presumably the array allocation is guarded by a
 * `capacity > 0` test on a line not visible here -- confirm.
 */
144 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
146 struct icu_buf_utf8 * buf8
147 = (struct icu_buf_utf8 *) malloc(sizeof(struct icu_buf_utf8));
154 buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
155 buf8->utf8[0] = (uint8_t) 0;
156 buf8->utf8_cap = capacity;
/*
 * icu_buf_utf8_resize: change the capacity of a UTF-8 byte buffer.
 *
 * Visible behaviour: the array is obtained either from malloc or from
 * realloc of the existing array, its first byte is set to 0 and utf8_cap is
 * updated to `capacity`.
 *
 * NOTE(review): the conditions selecting malloc vs. realloc and any
 * zero-capacity branch are not visible in this listing.
 * NOTE(review): the malloc/realloc result is written through without a NULL
 * check.  If the realloc result is assigned straight back to buf8->utf8
 * (assignment target not visible), the old array leaks on failure.
 */
163 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
169 buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
172 = (uint8_t *) realloc(buf8->utf8,
173 sizeof(uint8_t) * capacity);
174 buf8->utf8[0] = (uint8_t) 0;
176 buf8->utf8_cap = capacity;
/*
 * icu_buf_utf8_copy: copy the byte content of src8 into dest8.
 *
 * If dest8's capacity is smaller than the source length, dest8 is resized
 * to twice the source length first.  Then src8->utf8_len bytes are copied
 * with strncpy.
 *
 * NOTE(review): strncpy stops at the first 0 byte and zero-pads the rest,
 * and does not terminate when the source fills the count.  No resize
 * happens when utf8_cap == utf8_len, so there may be no room for a
 * terminator.  Confirm whether memcpy plus explicit termination is intended.
 * NOTE(review): argument validation and the return statement are not
 * visible in this listing.
 */
191 struct icu_buf_utf8 * icu_buf_utf8_copy(struct icu_buf_utf8 * dest8,
192 struct icu_buf_utf8 * src8)
199 if (dest8->utf8_cap < src8->utf8_len)
200 icu_buf_utf8_resize(dest8, src8->utf8_len * 2);
202 strncpy((char*) dest8->utf8, (char*) src8->utf8, src8->utf8_len);
/*
 * icu_buf_utf8_destroy: dispose of a UTF-8 byte buffer.
 *
 * Called by icu_chain_destroy for the display, norm and sort buffers.
 * NOTE(review): the body is not visible in this listing -- presumably it
 * frees the byte array and the struct; confirm NULL handling.
 */
209 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8)
/*
 * icu_utf16_from_utf8: convert the UTF-8 bytes of src8 into dest16.
 *
 * u_strFromUTF8 is called with dest16's current capacity.  If it reports
 * U_BUFFER_OVERFLOW_ERROR, dest16 is resized to twice utf16_len, *status is
 * reset to U_ZERO_ERROR and the conversion is retried once.  When the final
 * status is a success and utf16_len is strictly below the capacity, the
 * length is stored in dest16; the last two lines clear dest16 instead.
 *
 * NOTE(review): the argument receiving the required length (presumably
 * &utf16_len), the `else` introducing the clearing branch and the return
 * statement are not visible in this listing.
 * NOTE(review): if dest16->utf16 is NULL (zero-capacity buffer) and the
 * result length is 0, `utf16_len < utf16_cap` is false and the clearing
 * branch would write through the NULL array -- confirm.
 */
220 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
221 struct icu_buf_utf8 * src8,
224 int32_t utf16_len = 0;
226 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
228 (const char *) src8->utf8, src8->utf8_len, status);
230 // check for buffer overflow, resize and retry
231 if (*status == U_BUFFER_OVERFLOW_ERROR
232 //|| dest16->utf16_len > dest16->utf16_cap
234 icu_buf_utf16_resize(dest16, utf16_len * 2);
235 *status = U_ZERO_ERROR;
236 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
238 (const char *) src8->utf8, src8->utf8_len, status);
241 //if (*status != U_BUFFER_OVERFLOW_ERROR
242 if (U_SUCCESS(*status)
243 && utf16_len < dest16->utf16_cap)
244 dest16->utf16_len = utf16_len;
246 dest16->utf16[0] = (UChar) 0;
247 dest16->utf16_len = 0;
/*
 * icu_utf16_from_utf8_cstr: convert a NUL-terminated UTF-8 C string into
 * dest16.
 *
 * The source length is taken with strlen.  u_strFromUTF8 is called with
 * dest16's current capacity.  On U_BUFFER_OVERFLOW_ERROR dest16 is resized
 * to twice utf16_len, *status is reset to U_ZERO_ERROR and the conversion is
 * retried once.  On success with utf16_len strictly below the capacity the
 * length is stored; the last two lines clear dest16 instead.
 *
 * NOTE(review): src8cstr is passed to strlen without a visible NULL check.
 * NOTE(review): the length out-argument (presumably &utf16_len), the `else`
 * of the clearing branch and the return statement are not visible here.
 * NOTE(review): with a zero-capacity dest16 (NULL array) and an empty input
 * the clearing branch would write through the NULL array -- confirm.
 */
255 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
256 const char * src8cstr,
259 size_t src8cstr_len = 0;
260 int32_t utf16_len = 0;
262 src8cstr_len = strlen(src8cstr);
264 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
266 src8cstr, src8cstr_len, status);
268 // check for buffer overflow, resize and retry
269 if (*status == U_BUFFER_OVERFLOW_ERROR
270 //|| dest16->utf16_len > dest16->utf16_cap
272 icu_buf_utf16_resize(dest16, utf16_len * 2);
273 *status = U_ZERO_ERROR;
274 u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
276 src8cstr, src8cstr_len, status);
279 // if (*status != U_BUFFER_OVERFLOW_ERROR
280 if (U_SUCCESS(*status)
281 && utf16_len < dest16->utf16_cap)
282 dest16->utf16_len = utf16_len;
284 dest16->utf16[0] = (UChar) 0;
285 dest16->utf16_len = 0;
/*
 * icu_utf16_to_utf8: convert the UTF-16 content of src16 into dest8.
 *
 * u_strToUTF8 is called with dest8's current capacity.  On
 * U_BUFFER_OVERFLOW_ERROR dest8 is resized to twice utf8_len, *status is
 * reset to U_ZERO_ERROR and the conversion is retried once.  On success with
 * utf8_len strictly below the capacity the length is stored in dest8; the
 * last visible line sets the first byte of dest8 to 0 instead.
 *
 * NOTE(review): the length out-argument (presumably &utf8_len), the `else`
 * of the clearing branch, any reset of utf8_len and the return statement
 * are not visible in this listing.
 * NOTE(review): with a zero-capacity dest8 (NULL array) and an empty source
 * the clearing branch would write through the NULL array -- confirm.
 */
294 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
295 struct icu_buf_utf16 * src16,
298 int32_t utf8_len = 0;
300 u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
302 src16->utf16, src16->utf16_len, status);
304 // check for buffer overflow, resize and retry
305 if (*status == U_BUFFER_OVERFLOW_ERROR
306 //|| dest8->utf8_len > dest8->utf8_cap
308 icu_buf_utf8_resize(dest8, utf8_len * 2);
309 *status = U_ZERO_ERROR;
310 u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
312 src16->utf16, src16->utf16_len, status);
316 //if (*status != U_BUFFER_OVERFLOW_ERROR
317 if (U_SUCCESS(*status)
318 && utf8_len < dest8->utf8_cap)
319 dest8->utf8_len = utf8_len;
321 dest8->utf8[0] = (uint8_t) 0;
/*
 * icu_utf16_casemap: apply a case mapping to src16 and store it in dest16.
 *
 * One of u_strToLower, u_strToUpper, u_strToTitle or u_strFoldCase (with
 * U_FOLD_CASE_DEFAULT) is run with dest16's current capacity.  If the status
 * is U_BUFFER_OVERFLOW_ERROR and dest16 is not the same buffer as src16,
 * dest16 is resized to twice the reported length, *status is reset to
 * U_ZERO_ERROR and the same mapping is run again.  On success with the
 * length strictly below the capacity the length is stored in dest16; the
 * last two lines clear dest16 instead.
 *
 * NOTE(review): the switch on `action` and its case labels are not visible;
 * presumably `action` selects the mapping and the `return
 * U_UNSUPPORTED_ERROR` lines are the default cases -- confirm.
 * NOTE(review): the function is declared to return int but returns the
 * UErrorCode constant U_UNSUPPORTED_ERROR on those paths.
 */
330 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
331 struct icu_buf_utf16 * src16,
332 const char *locale, char action,
335 int32_t dest16_len = 0;
339 dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
340 src16->utf16, src16->utf16_len,
344 dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
345 src16->utf16, src16->utf16_len,
349 dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
350 src16->utf16, src16->utf16_len,
354 dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
355 src16->utf16, src16->utf16_len,
356 U_FOLD_CASE_DEFAULT, status);
360 return U_UNSUPPORTED_ERROR;
364 // check for buffer overflow, resize and retry
365 if (*status == U_BUFFER_OVERFLOW_ERROR
366 && dest16 != src16 // do not resize if in-place conversion
367 //|| dest16_len > dest16->utf16_cap
369 icu_buf_utf16_resize(dest16, dest16_len * 2);
370 *status = U_ZERO_ERROR;
375 dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
376 src16->utf16, src16->utf16_len,
380 dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
381 src16->utf16, src16->utf16_len,
385 dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
386 src16->utf16, src16->utf16_len,
390 dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
391 src16->utf16, src16->utf16_len,
392 U_FOLD_CASE_DEFAULT, status);
396 return U_UNSUPPORTED_ERROR;
401 if (U_SUCCESS(*status)
402 && dest16_len < dest16->utf16_cap)
403 dest16->utf16_len = dest16_len;
405 dest16->utf16[0] = (UChar) 0;
406 dest16->utf16_len = 0;
/*
 * icu_sortkey8_from_utf16: compute the collation sort key of src16 with
 * collator `coll` and store the key bytes in dest8.
 *
 * ucol_getSortKey is called with dest8's current capacity.  If the returned
 * key length exceeds the capacity, dest8 is resized to twice that length
 * and the key is computed again.  On success the key length is stored in
 * dest8->utf8_len; the last visible line sets the first byte of dest8 to 0.
 *
 * NOTE(review): none of the visible calls receives `status`, so the
 * U_SUCCESS(*status) test only reflects the value the caller passed in.
 * NOTE(review): the clearing line casts 0 to UChar although dest8->utf8 is
 * allocated as uint8_t elsewhere in this file; the value is the same but
 * the cast looks copied from the UTF-16 routines.
 * NOTE(review): the second half of the success condition, the `else` and
 * the return statement are not visible in this listing.
 */
414 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
415 struct icu_buf_utf8 * dest8,
416 struct icu_buf_utf16 * src16,
420 int32_t sortkey_len = 0;
422 sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
423 dest8->utf8, dest8->utf8_cap);
425 // check for buffer overflow, resize and retry
426 if (sortkey_len > dest8->utf8_cap) {
427 icu_buf_utf8_resize(dest8, sortkey_len * 2);
428 sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
429 dest8->utf8, dest8->utf8_cap);
432 if (U_SUCCESS(*status)
434 dest8->utf8_len = sortkey_len;
436 dest8->utf8[0] = (UChar) 0;
/*
 * icu_tokenizer_create: allocate a tokenizer for `locale` and open an ICU
 * break iterator for it.
 *
 * The locale string is copied into tokenizer->locale, `action` is stored,
 * and the buffer pointer and all token counters/offsets are zeroed.  The
 * switch on tokenizer->action opens a break iterator of kind UBRK_LINE,
 * UBRK_SENTENCE, UBRK_WORD, UBRK_CHARACTER or UBRK_TITLE; otherwise *status
 * is set to U_UNSUPPORTED_ERROR.  When *status is not a success the
 * tokenizer is destroyed again.
 *
 * NOTE(review): strcpy copies `locale` without a length bound.
 * icu_chain_create truncates its locale to 16 bytes, which suggests a
 * fixed-size field -- confirm the size of tokenizer->locale and bound it.
 * NOTE(review): the case labels mapping `action` to the iterator kinds, a
 * check of the malloc result and the return statements are not visible.
 */
445 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
448 struct icu_tokenizer * tokenizer
449 = (struct icu_tokenizer *) malloc(sizeof(struct icu_tokenizer));
451 strcpy(tokenizer->locale, locale);
452 tokenizer->action = action;
454 tokenizer->buf16 = 0;
455 tokenizer->token_count = 0;
456 tokenizer->token_id = 0;
457 tokenizer->token_start = 0;
458 tokenizer->token_end = 0;
461 switch(tokenizer->action) {
464 = ubrk_open(UBRK_LINE, tokenizer->locale,
469 = ubrk_open(UBRK_SENTENCE, tokenizer->locale,
474 = ubrk_open(UBRK_WORD, tokenizer->locale,
479 = ubrk_open(UBRK_CHARACTER, tokenizer->locale,
484 = ubrk_open(UBRK_TITLE, tokenizer->locale,
488 *status = U_UNSUPPORTED_ERROR;
493 // ICU error stuff is a very funny business
494 if (U_SUCCESS(*status))
498 icu_tokenizer_destroy(tokenizer);
/*
 * icu_tokenizer_destroy: release a tokenizer.
 *
 * Closes the ICU break iterator held in tokenizer->bi.
 * NOTE(review): any guard on `tokenizer` / `tokenizer->bi` and the release
 * of the struct itself are not visible in this listing.
 */
502 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
506 ubrk_close(tokenizer->bi);
/*
 * icu_tokenizer_attach: point the tokenizer at a new UTF-16 source buffer.
 *
 * Rejects a NULL tokenizer, a tokenizer without break iterator, or a NULL
 * src16.  Otherwise it stores src16 in tokenizer->buf16, resets the token
 * counters and offsets to 0 and hands the text to the break iterator with
 * ubrk_setText.  The buffer is referenced, not copied.
 *
 * NOTE(review): the values returned on the reject, failure and success
 * paths are not visible in this listing.
 */
511 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
512 struct icu_buf_utf16 * src16,
515 if (!tokenizer || !tokenizer->bi || !src16)
519 tokenizer->buf16 = src16;
520 tokenizer->token_count = 0;
521 tokenizer->token_id = 0;
522 tokenizer->token_start = 0;
523 tokenizer->token_end = 0;
525 ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
528 if (U_FAILURE(*status))
/*
 * icu_tokenizer_next_token: advance to the next break position and copy the
 * text between the previous and the new position into tkn16.
 *
 * Nothing is done when the tokenizer, its break iterator or its attached
 * buffer is missing, or when the attached buffer is empty.  The token start
 * is ubrk_first() on the first call (token_end still 0) and the previous
 * token_end afterwards; the token end is ubrk_next(), with UBRK_DONE mapped
 * to the length of the attached buffer.  The tokenizer's counters and
 * offsets are then updated and the token text is copied with u_strncpy,
 * resizing tkn16 to twice the token length when its capacity is smaller.
 *
 * NOTE(review): the conditions around the token_count/token_id updates, the
 * NULL test on tkn16, the u_strncpy length argument and the return
 * statements are not visible in this listing.
 * NOTE(review): no resize happens when utf16_cap == tkn_len, and u_strncpy
 * adds no terminator in that case.
 */
534 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
535 struct icu_buf_utf16 * tkn16,
538 int32_t tkn_start = 0;
543 if (!tokenizer || !tokenizer->bi
544 || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
547 // never change tokenizer->buf16 and keep always invariant
548 // 0 <= tokenizer->token_start
549 // <= tokenizer->token_end
550 // <= tokenizer->buf16->utf16_len
551 // returns length of token
553 if (0 == tokenizer->token_end) // first call
554 tkn_start = ubrk_first(tokenizer->bi);
555 else //successive calls
556 tkn_start = tokenizer->token_end;
559 tkn_end = ubrk_next(tokenizer->bi);
561 // repairing invariant at end of ubrk, which is UBRK_DONE = -1
562 if (UBRK_DONE == tkn_end)
563 tkn_end = tokenizer->buf16->utf16_len;
565 // copy out if everything is well
566 if(U_FAILURE(*status))
569 // everything OK, now update internal state
570 tkn_len = tkn_end - tkn_start;
573 tokenizer->token_count++;
574 tokenizer->token_id++;
576 tokenizer->token_id = 0;
578 tokenizer->token_start = tkn_start;
579 tokenizer->token_end = tkn_end;
582 // copying into token buffer if it exists
584 if (tkn16->utf16_cap < tkn_len)
585 icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
587 u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
590 tkn16->utf16_len = tkn_len;
597 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer)
599 return tokenizer->token_id;
602 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer)
604 return tokenizer->token_start;
607 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer)
609 return tokenizer->token_end;
612 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer)
614 return (tokenizer->token_end - tokenizer->token_start);
617 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
619 return tokenizer->token_count;
624 //struct icu_normalizer
627 // struct icu_buf_utf16 * rules16;
628 // UParseError parse_error[256];
629 // UTransliterator * trans;
/*
 * icu_normalizer_create: build a normalizer from a transliteration rule
 * string.
 *
 * Allocates the normalizer, stores `action`, converts the UTF-8 `rules`
 * string into a fresh zero-capacity UTF-16 buffer (rules16) and, depending
 * on normalizer->action, opens a transliterator with utrans_openU, passing
 * normalizer->parse_error for rule diagnostics.  Otherwise *status is set
 * to U_UNSUPPORTED_ERROR.  When *status is not a success the normalizer is
 * destroyed again.
 *
 * NOTE(review): the case labels, the remaining utrans_openU arguments
 * (presumably direction and rule text), a check of the malloc result and
 * the return statements are not visible in this listing.
 */
633 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
637 struct icu_normalizer * normalizer
638 = (struct icu_normalizer *) malloc(sizeof(struct icu_normalizer));
640 normalizer->action = action;
641 normalizer->trans = 0;
642 normalizer->rules16 = icu_buf_utf16_create(0);
643 icu_utf16_from_utf8_cstr(normalizer->rules16, rules, status);
645 switch(normalizer->action) {
648 = utrans_openU(normalizer->rules16->utf16,
649 normalizer->rules16->utf16_len,
652 normalizer->parse_error, status);
656 = utrans_openU(normalizer->rules16->utf16,
657 normalizer->rules16->utf16_len,
660 normalizer->parse_error, status);
663 *status = U_UNSUPPORTED_ERROR;
668 if (U_SUCCESS(*status))
672 icu_normalizer_destroy(normalizer);
/*
 * icu_normalizer_destroy: release a normalizer.
 *
 * Destroys the UTF-16 rule buffer if present and closes the transliterator
 * if present.
 * NOTE(review): any guard on `normalizer` itself and the release of the
 * struct are not visible in this listing.
 */
677 void icu_normalizer_destroy(struct icu_normalizer * normalizer){
679 if (normalizer->rules16)
680 icu_buf_utf16_destroy(normalizer->rules16);
681 if (normalizer->trans)
682 utrans_close(normalizer->trans);
689 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
690 struct icu_buf_utf16 * dest16,
691 struct icu_buf_utf16 * src16,
694 if (!normalizer || !normalizer->trans || !src16 || !dest16)
697 if (!icu_buf_utf16_copy(dest16, src16))
700 utrans_transUChars (normalizer->trans,
701 dest16->utf16, &(dest16->utf16_len),
703 0, &(src16->utf16_len), status);
705 if (U_FAILURE(*status)){
706 dest16->utf16[0] = (UChar) 0;
707 dest16->utf16_len = 0;
710 return dest16->utf16_len;
/*
 * icu_chain_step_create: allocate one processing step of a chain.
 *
 * Requires a chain, a non-zero step type and a rule.  The step is
 * malloc'ed and its more_tokens flag cleared.  For the normalize type a
 * normalizer is created from `rule` with action 'f'; for the tokenize type
 * a tokenizer is created for chain->locale with the first byte of `rule`
 * as its action.  No auxiliary object is created in the visible lines for
 * the display, norm, sort and charmap types.
 *
 * NOTE(review): the assignments of type/buf16/previous, the switch header,
 * a check of the malloc result and the return statements are not visible
 * in this listing.
 */
716 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
717 enum icu_chain_step_type type,
718 const uint8_t * rule,
719 struct icu_buf_utf16 * buf16,
722 struct icu_chain_step * step = 0;
724 if(!chain || !type || !rule)
727 step = (struct icu_chain_step *) malloc(sizeof(struct icu_chain_step));
730 step->more_tokens = 0;
737 // create auxiliary objects
739 case ICU_chain_step_type_display:
741 case ICU_chain_step_type_norm:
743 case ICU_chain_step_type_sort:
745 case ICU_chain_step_type_charmap:
747 case ICU_chain_step_type_normalize:
748 step->u.normalizer = icu_normalizer_create((char *) rule, 'f', status);
750 case ICU_chain_step_type_tokenize:
751 step->u.tokenizer = icu_tokenizer_create((char *) chain->locale,
752 (char) rule[0], status);
/*
 * icu_chain_step_destroy: release a step and, recursively, every step
 * linked through step->previous.
 *
 * Per step type: charmap destroys the step's UTF-16 buffer; normalize
 * destroys the normalizer and the buffer; tokenize destroys the tokenizer
 * and the buffer.  Nothing is released in the visible lines for the
 * display, norm and sort types.
 *
 * NOTE(review): the NULL guard that ends the recursion, the switch header
 * and the release of the step struct are not visible in this listing.
 */
762 void icu_chain_step_destroy(struct icu_chain_step * step){
767 icu_chain_step_destroy(step->previous);
770 case ICU_chain_step_type_display:
772 case ICU_chain_step_type_norm:
774 case ICU_chain_step_type_sort:
776 case ICU_chain_step_type_charmap:
777 icu_buf_utf16_destroy(step->buf16);
779 case ICU_chain_step_type_normalize:
780 icu_normalizer_destroy(step->u.normalizer);
781 icu_buf_utf16_destroy(step->buf16);
783 case ICU_chain_step_type_tokenize:
784 icu_tokenizer_destroy(step->u.tokenizer);
785 icu_buf_utf16_destroy(step->buf16);
/*
 * icu_chain_create: allocate an empty processing chain.
 *
 * `identifier` is copied into chain->identifier truncated to 127 bytes plus
 * terminator, `locale` into chain->locale truncated to 15 bytes plus
 * terminator.  The token counter is zeroed and zero-capacity buffers are
 * created for the display, norm and sort UTF-8 outputs and for the UTF-16
 * source text.
 *
 * NOTE(review): `identifier` and `locale` are passed to strncpy without a
 * visible NULL check.
 * NOTE(review): a check of the malloc result, the initialisation of
 * chain->steps and the return statement are not visible in this listing.
 */
796 struct icu_chain * icu_chain_create(const uint8_t * identifier,
797 const uint8_t * locale)
800 struct icu_chain * chain
801 = (struct icu_chain *) malloc(sizeof(struct icu_chain));
803 strncpy((char *) chain->identifier, (const char *) identifier, 128);
804 chain->identifier[128 - 1] = '\0';
805 strncpy((char *) chain->locale, (const char *) locale, 16);
806 chain->locale[16 - 1] = '\0';
808 chain->token_count = 0;
810 chain->display8 = icu_buf_utf8_create(0);
811 chain->norm8 = icu_buf_utf8_create(0);
812 chain->sort8 = icu_buf_utf8_create(0);
814 chain->src16 = icu_buf_utf16_create(0);
/*
 * icu_chain_destroy: release a chain.
 *
 * Destroys the three UTF-8 output buffers, the UTF-16 source buffer and the
 * list of steps starting at chain->steps.
 * NOTE(review): any guard on `chain` and the release of the struct itself
 * are not visible in this listing.
 */
822 void icu_chain_destroy(struct icu_chain * chain)
824 icu_buf_utf8_destroy(chain->display8);
825 icu_buf_utf8_destroy(chain->norm8);
826 icu_buf_utf8_destroy(chain->sort8);
828 icu_buf_utf16_destroy(chain->src16);
830 icu_chain_step_destroy(chain->steps);
/*
 * icu_chain_insert_step: create a new step and link it in front of the
 * chain's current step list.
 *
 * Requires a chain, a non-zero type and a rule.  The source buffer for the
 * new step is the buffer of the current head step when there is one,
 * otherwise the chain's own src16.  The charmap, normalize and tokenize
 * types get a fresh zero-capacity UTF-16 buffer of their own.  The step is
 * then created and its `previous` pointer set to the old head.
 *
 * NOTE(review): the buffer choice for the display/norm/sort types, the
 * update of chain->steps and the return statements are not visible here.
 * NOTE(review): no NULL check of the created step is visible before
 * step->previous is written -- confirm against the complete file.
 */
834 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
835 enum icu_chain_step_type type,
836 const uint8_t * rule,
839 struct icu_chain_step * step = 0;
840 struct icu_buf_utf16 * src16 = 0;
841 struct icu_buf_utf16 * buf16 = 0;
843 if (!chain || !type || !rule)
846 // assign utf16 src buffers as needed
847 if (chain->steps && chain->steps->buf16)
848 src16 = chain->steps->buf16;
849 else if (chain->src16)
850 src16 = chain->src16;
855 // assign utf16 destination buffers as needed, or
856 // re-use previous utf16 buffer if this step does not touch it
858 case ICU_chain_step_type_display:
861 case ICU_chain_step_type_norm:
864 case ICU_chain_step_type_sort:
867 case ICU_chain_step_type_charmap:
868 buf16 = icu_buf_utf16_create(0);
870 case ICU_chain_step_type_normalize:
871 buf16 = icu_buf_utf16_create(0);
873 case ICU_chain_step_type_tokenize:
874 buf16 = icu_buf_utf16_create(0);
880 // create actual chain step with this buffer
881 step = icu_chain_step_create(chain, type, rule, buf16, status);
883 step->previous = chain->steps;
/*
 * icu_chain_step_next_token: run one step of the chain, first advancing the
 * steps it depends on.
 *
 * Nothing is done without a chain, a chain source buffer, a step, or when
 * the step's more_tokens flag is clear.  A step with a predecessor takes
 * the predecessor's buffer as input and recurses into the predecessor; the
 * first step takes chain->src16 and sets more_tokens to 1.  Processing
 * stops when there are no more tokens or the input is missing or empty.
 * The display, norm and sort types convert the input to UTF-8 into the
 * matching chain output buffer; the normalize type runs the step's
 * normalizer from the input into step->buf16.  No work is visible for the
 * charmap type, and the tokenizer call is commented out.
 *
 * NOTE(review): both printf calls cast the `step` pointer to int, which
 * truncates on platforms with 64-bit pointers, and they write debug output
 * to stdout unconditionally -- use "%p" with (void *) or remove.
 * NOTE(review): the branch conditions, the switch header and the return
 * statements are not visible in this listing.
 */
890 int icu_chain_step_next_token(struct icu_chain * chain,
891 struct icu_chain_step * step,
894 struct icu_buf_utf16 * src16 = 0;
896 printf("icu_chain_step_next_token %d\n", (int) step);
898 if (!chain || !chain->src16 || !step || !step->more_tokens)
901 // assign utf16 src buffers as needed, advance in previous steps
902 // tokens, and setting stop condition
904 src16 = step->previous->buf16;
906 = icu_chain_step_next_token(chain, step->previous, status);
908 else { // first step can only work once on chain->src16 input buffer
909 src16 = chain->src16;
910 step->more_tokens = 1;
913 // stop if nothing to process
914 // i.e new token source was not properly assigned
915 if (!step->more_tokens || !src16 || !src16->utf16_len) //
918 printf("icu_chain_step_next_token %d working\n", (int) step);
921 // perform the work, eventually put this steps output in
922 // step->buf16 or the chains UTF8 output buffers
924 case ICU_chain_step_type_display:
925 icu_utf16_to_utf8(chain->display8, src16, status);
927 case ICU_chain_step_type_norm:
928 icu_utf16_to_utf8(chain->norm8, src16, status);
930 case ICU_chain_step_type_sort:
931 icu_utf16_to_utf8(chain->sort8, src16, status);
933 case ICU_chain_step_type_charmap:
935 case ICU_chain_step_type_normalize:
936 icu_normalizer_normalize(step->u.normalizer,
937 step->buf16, src16, status);
939 case ICU_chain_step_type_tokenize:
941 // = icu_tokenizer_next_token(step->u.tokenizer,
942 // step->buf16, status);
950 // stop further token processing if last step
952 step->more_tokens = 0;
955 if (U_FAILURE(*status))
963 int icu_chain_assign_cstr(struct icu_chain * chain,
964 const char * src8cstr,
967 struct icu_chain_step * stp = chain->steps;
969 if (!chain || !src8cstr)
973 chain->token_count = 0;
975 // clear all steps stop states
978 stp->more_tokens = 1;
982 // finally convert UTF8 to UTF16 string
983 icu_utf16_from_utf8_cstr(chain->src16, src8cstr, status);
985 if (U_FAILURE(*status))
/*
 * icu_chain_next_token: produce the next token of the chain.
 *
 * Nothing is done without a chain or without steps.  Otherwise the head
 * step is run via icu_chain_step_next_token and its result stored in
 * `success`; the visible lines then increment chain->token_count and
 * return the new count.
 *
 * NOTE(review): the declaration of `success`, the condition under which the
 * counter is incremented and the remaining return statements are not
 * visible in this listing.
 */
993 int icu_chain_next_token(struct icu_chain * chain,
998 if (!chain || !chain->steps)
1001 success = icu_chain_step_next_token(chain, chain->steps, status);
1004 chain->token_count++;
1005 return chain->token_count;
/*
 * icu_chain_get_token_count: return chain->token_count.
 *
 * The counter is reset by icu_chain_assign_cstr and incremented by
 * icu_chain_next_token.
 * NOTE(review): lines between the signature and the return are missing
 * from this listing -- presumably a NULL guard on `chain`; confirm.
 */
1011 int icu_chain_get_token_count(struct icu_chain * chain)
1016 return chain->token_count;
1021 const char * icu_chain_get_display(struct icu_chain * chain)
1023 if (chain->display8)
1024 return (const char *) chain->display8->utf8;
1029 const char * icu_chain_get_norm(struct icu_chain * chain)
1032 return (const char *) chain->norm8->utf8;
1037 const char * icu_chain_get_sort(struct icu_chain * chain)
1040 return (const char *) chain->sort8->utf8;
1056 * indent-tabs-mode: nil
1058 * vim: shiftwidth=4 tabstop=8 expandtab