src/icu_I18N.c

   1 /* $Id: icu_I18N.c,v 1.14 2007-05-16 12:39:49 marc Exp $
   2    Copyright (c) 2006-2007, Index Data.
   3
   4    This file is part of Pazpar2.
   5
   6    Pazpar2 is free software; you can redistribute it and/or modify it under
   7    the terms of the GNU General Public License as published by the Free
   8    Software Foundation; either version 2, or (at your option) any later
   9    version.
  10
  11    Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  12    WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14    for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Pazpar2; see the file LICENSE.  If not, write to the
  18    Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  19    02111-1307, USA.
  20 */
  21
  22 #if HAVE_CONFIG_H
  23 #include "cconfig.h"
  24 #endif
  25
  26 #define USE_TIMING 0
  27 #if USE_TIMING
  28 #include <yaz/timing.h>
  29 #endif
  30
  31
  32 #ifdef HAVE_ICU
  33 #include "icu_I18N.h"
  34
  35 #include <yaz/log.h>
  36
  37 #include <string.h>
  38 #include <stdlib.h>
  39 #include <stdio.h>
  40
  41 #include <unicode/ustring.h>  /* some more string fcns*/
  42 #include <unicode/uchar.h>    /* char names           */
  43
  44
  45 //#include <unicode/ustdio.h>
  46 //#include <unicode/utypes.h>   /* Basic ICU data types */
  47 #include <unicode/ucol.h>
  48 //#include <unicode/ucnv.h>     /* C   Converter API    */
  49 //#include <unicode/uloc.h>
  50 //#include <unicode/ubrk.h>
  51 /* #include <unicode/unistr.h> */
  52
  53
  54
  55
  56 int icu_check_status (UErrorCode status)
  57 {
  58     if(U_FAILURE(status)){
  59         yaz_log(YLOG_WARN,
  60                 "ICU: %d %s\n", status, u_errorName(status));
  61         return 0;
  62     }
  63     return 1;
  64
  65 }
  66
  67
  68
  69 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
  70 {
  71     struct icu_buf_utf16 * buf16
  72         = (struct icu_buf_utf16 *) malloc(sizeof(struct icu_buf_utf16));
  73
  74     buf16->utf16 = 0;
  75     buf16->utf16_len = 0;
  76     buf16->utf16_cap = 0;
  77
  78     if (capacity > 0){
  79         buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
  80         buf16->utf16[0] = (UChar) 0;
  81         buf16->utf16_cap = capacity;
  82     }
  83     return buf16;
  84 };
  85
  86
  87 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
  88                                             size_t capacity)
  89 {
  90     if (buf16){
  91         if (capacity >  0){
  92             if (0 == buf16->utf16)
  93                 buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
  94             else
  95                 buf16->utf16
  96                     = (UChar *) realloc(buf16->utf16, sizeof(UChar) * capacity);
  97             buf16->utf16[0] = (UChar) 0;
  98             buf16->utf16_len = 0;
  99             buf16->utf16_cap = capacity;
 100         }
 101         else {
 102             if (buf16->utf16)
 103                 free(buf16->utf16);
 104             buf16->utf16 = 0;
 105             buf16->utf16_len = 0;
 106             buf16->utf16_cap = 0;
 107         }
 108     }
 109
 110     return buf16;
 111 };
 112
 113
 114 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
 115                                           struct icu_buf_utf16 * src16)
 116 {
 117     if(!dest16 || !src16
 118        || dest16 == src16)
 119         return 0;
 120
 121     if (dest16->utf16_cap < src16->utf16_len)
 122         icu_buf_utf16_resize(dest16, src16->utf16_len * 2);
 123
 124     u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len);
 125
 126     return dest16;
 127 };
 128
 129
 130 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16)
 131 {
 132     if (buf16){
 133         if (buf16->utf16)
 134             free(buf16->utf16);
 135         free(buf16);
 136     }
 137 };
 138
 139
 140
 141
 142
 143
 144 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
 145 {
 146     struct icu_buf_utf8 * buf8
 147         = (struct icu_buf_utf8 *) malloc(sizeof(struct icu_buf_utf8));
 148
 149     buf8->utf8 = 0;
 150     buf8->utf8_len = 0;
 151     buf8->utf8_cap = 0;
 152
 153     if (capacity > 0){
 154         buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
 155         buf8->utf8[0] = (uint8_t) 0;
 156         buf8->utf8_cap = capacity;
 157     }
 158     return buf8;
 159 };
 160
 161
 162
 163 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
 164                                           size_t capacity)
 165 {
 166     if (buf8){
 167         if (capacity >  0){
 168             if (0 == buf8->utf8)
 169                 buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
 170             else
 171                 buf8->utf8
 172                     = (uint8_t *) realloc(buf8->utf8,
 173                                           sizeof(uint8_t) * capacity);
 174             buf8->utf8[0] = (uint8_t) 0;
 175             buf8->utf8_len = 0;
 176             buf8->utf8_cap = capacity;
 177         }
 178         else {
 179             if (buf8->utf8)
 180                 free(buf8->utf8);
 181             buf8->utf8 = 0;
 182             buf8->utf8_len = 0;
 183             buf8->utf8_cap = 0;
 184         }
 185     }
 186
 187     return buf8;
 188 };
 189
 190
 191 struct icu_buf_utf8 * icu_buf_utf8_copy(struct icu_buf_utf8 * dest8,
 192                                           struct icu_buf_utf8 * src8)
 193 {
 194     if(!dest8 || !src8
 195        || dest8 == src8)
 196         return 0;
 197
 198
 199     if (dest8->utf8_cap < src8->utf8_len)
 200         icu_buf_utf8_resize(dest8, src8->utf8_len * 2);
 201
 202     strncpy((char*) dest8->utf8, (char*) src8->utf8, src8->utf8_len);
 203
 204     return dest8;
 205 };
 206
 207
 208
 209 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8)
 210 {
 211     if (buf8){
 212         if (buf8->utf8)
 213             free(buf8->utf8);
 214         free(buf8);
 215     }
 216 };
 217
 218
 219
 220 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
 221                                struct icu_buf_utf8 * src8,
 222                                UErrorCode * status)
 223 {
 224     int32_t utf16_len = 0;
 225
 226     u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 227                   &utf16_len,
 228                   (const char *) src8->utf8, src8->utf8_len, status);
 229
 230     // check for buffer overflow, resize and retry
 231     if (*status == U_BUFFER_OVERFLOW_ERROR
 232         //|| dest16->utf16_len > dest16->utf16_cap
 233         ){
 234         icu_buf_utf16_resize(dest16, utf16_len * 2);
 235         *status = U_ZERO_ERROR;
 236         u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 237                       &utf16_len,
 238                       (const char *) src8->utf8, src8->utf8_len, status);
 239     }
 240
 241     //if (*status != U_BUFFER_OVERFLOW_ERROR
 242     if (U_SUCCESS(*status)
 243         && utf16_len < dest16->utf16_cap)
 244         dest16->utf16_len = utf16_len;
 245     else {
 246         dest16->utf16[0] = (UChar) 0;
 247         dest16->utf16_len = 0;
 248     }
 249
 250     return *status;
 251 };
 252
 253
 254
 255 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
 256                                     const char * src8cstr,
 257                                     UErrorCode * status)
 258 {
 259     size_t src8cstr_len = 0;
 260     int32_t utf16_len = 0;
 261
 262     src8cstr_len = strlen(src8cstr);
 263
 264     u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 265                   &utf16_len,
 266                   src8cstr, src8cstr_len, status);
 267
 268     // check for buffer overflow, resize and retry
 269     if (*status == U_BUFFER_OVERFLOW_ERROR
 270         //|| dest16->utf16_len > dest16->utf16_cap
 271         ){
 272         icu_buf_utf16_resize(dest16, utf16_len * 2);
 273         *status = U_ZERO_ERROR;
 274         u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 275                       &utf16_len,
 276                       src8cstr, src8cstr_len, status);
 277     }
 278
 279     //  if (*status != U_BUFFER_OVERFLOW_ERROR
 280     if (U_SUCCESS(*status)
 281         && utf16_len < dest16->utf16_cap)
 282         dest16->utf16_len = utf16_len;
 283     else {
 284         dest16->utf16[0] = (UChar) 0;
 285         dest16->utf16_len = 0;
 286     }
 287
 288     return *status;
 289 };
 290
 291
 292
 293
 294 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
 295                              struct icu_buf_utf16 * src16,
 296                              UErrorCode * status)
 297 {
 298     int32_t utf8_len = 0;
 299
 300     u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
 301                 &utf8_len,
 302                 src16->utf16, src16->utf16_len, status);
 303
 304     // check for buffer overflow, resize and retry
 305     if (*status == U_BUFFER_OVERFLOW_ERROR
 306         //|| dest8->utf8_len > dest8->utf8_cap
 307         ){
 308         icu_buf_utf8_resize(dest8, utf8_len * 2);
 309         *status = U_ZERO_ERROR;
 310         u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
 311                     &utf8_len,
 312                     src16->utf16, src16->utf16_len, status);
 313
 314     }
 315
 316     //if (*status != U_BUFFER_OVERFLOW_ERROR
 317     if (U_SUCCESS(*status)
 318         && utf8_len < dest8->utf8_cap)
 319         dest8->utf8_len = utf8_len;
 320     else {
 321         dest8->utf8[0] = (uint8_t) 0;
 322         dest8->utf8_len = 0;
 323     }
 324
 325     return *status;
 326 };
 327
 328
 329
 330 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
 331                       struct icu_buf_utf16 * src16,
 332                       const char *locale, char action,
 333                       UErrorCode *status)
 334 {
 335     int32_t dest16_len = 0;
 336
 337     switch(action) {
 338     case 'l':
 339         dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 340                                   src16->utf16, src16->utf16_len,
 341                                   locale, status);
 342         break;
 343     case 'u':
 344         dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 345                                   src16->utf16, src16->utf16_len,
 346                                   locale, status);
 347         break;
 348     case 't':
 349         dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 350                                   src16->utf16, src16->utf16_len,
 351                                   0, locale, status);
 352         break;
 353     case 'f':
 354         dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 355                                    src16->utf16, src16->utf16_len,
 356                                    U_FOLD_CASE_DEFAULT, status);
 357         break;
 358
 359     default:
 360         return U_UNSUPPORTED_ERROR;
 361         break;
 362     }
 363
 364     // check for buffer overflow, resize and retry
 365     if (*status == U_BUFFER_OVERFLOW_ERROR
 366         && dest16 != src16        // do not resize if in-place conversion
 367         //|| dest16_len > dest16->utf16_cap
 368         ){
 369         icu_buf_utf16_resize(dest16, dest16_len * 2);
 370         *status = U_ZERO_ERROR;
 371
 372
 373         switch(action) {
 374         case 'l':
 375             dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 376                                       src16->utf16, src16->utf16_len,
 377                                       locale, status);
 378             break;
 379         case 'u':
 380             dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 381                                       src16->utf16, src16->utf16_len,
 382                                       locale, status);
 383             break;
 384         case 't':
 385             dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 386                                       src16->utf16, src16->utf16_len,
 387                                       0, locale, status);
 388             break;
 389         case 'f':
 390             dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 391                                        src16->utf16, src16->utf16_len,
 392                                        U_FOLD_CASE_DEFAULT, status);
 393             break;
 394
 395         default:
 396             return U_UNSUPPORTED_ERROR;
 397             break;
 398         }
 399     }
 400
 401     if (U_SUCCESS(*status)
 402         && dest16_len < dest16->utf16_cap)
 403         dest16->utf16_len = dest16_len;
 404     else {
 405         dest16->utf16[0] = (UChar) 0;
 406         dest16->utf16_len = 0;
 407     }
 408
 409     return *status;
 410 };
 411
 412
 413
 414 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
 415                                    struct icu_buf_utf8 * dest8,
 416                                    struct icu_buf_utf16 * src16,
 417                                    UErrorCode * status)
 418 {
 419
 420     int32_t sortkey_len = 0;
 421
 422     sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
 423                                   dest8->utf8, dest8->utf8_cap);
 424
 425     // check for buffer overflow, resize and retry
 426     if (sortkey_len > dest8->utf8_cap) {
 427         icu_buf_utf8_resize(dest8, sortkey_len * 2);
 428         sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
 429                                       dest8->utf8, dest8->utf8_cap);
 430     }
 431
 432     if (U_SUCCESS(*status)
 433         && sortkey_len > 0)
 434         dest8->utf8_len = sortkey_len;
 435     else {
 436         dest8->utf8[0] = (UChar) 0;
 437         dest8->utf8_len = 0;
 438     }
 439
 440     return sortkey_len;
 441 };
 442
 443
 444
 445 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
 446                                             UErrorCode *status)
 447 {
 448     struct icu_tokenizer * tokenizer
 449         = (struct icu_tokenizer *) malloc(sizeof(struct icu_tokenizer));
 450
 451     strcpy(tokenizer->locale, locale);
 452     tokenizer->action = action;
 453     tokenizer->bi = 0;
 454     tokenizer->buf16 = 0;
 455     tokenizer->token_count = 0;
 456     tokenizer->token_id = 0;
 457     tokenizer->token_start = 0;
 458     tokenizer->token_end = 0;
 459
 460
 461     switch(tokenizer->action) {
 462     case 'l':
 463         tokenizer->bi
 464             = ubrk_open(UBRK_LINE, tokenizer->locale,
 465                         0, 0, status);
 466         break;
 467     case 's':
 468         tokenizer->bi
 469             = ubrk_open(UBRK_SENTENCE, tokenizer->locale,
 470                         0, 0, status);
 471         break;
 472     case 'w':
 473         tokenizer->bi
 474             = ubrk_open(UBRK_WORD, tokenizer->locale,
 475                         0, 0, status);
 476         break;
 477     case 'c':
 478         tokenizer->bi
 479             = ubrk_open(UBRK_CHARACTER, tokenizer->locale,
 480                         0, 0, status);
 481         break;
 482     case 't':
 483         tokenizer->bi
 484             = ubrk_open(UBRK_TITLE, tokenizer->locale,
 485                         0, 0, status);
 486         break;
 487     default:
 488         *status = U_UNSUPPORTED_ERROR;
 489         return 0;
 490         break;
 491     }
 492
 493     // ICU error stuff is a very  funny business
 494     if (U_SUCCESS(*status))
 495         return tokenizer;
 496
 497     // freeing if failed
 498     icu_tokenizer_destroy(tokenizer);
 499     return 0;
 500 };
 501
 502 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
 503 {
 504     if (tokenizer) {
 505         if (tokenizer->bi)
 506             ubrk_close(tokenizer->bi);
 507         free(tokenizer);
 508     }
 509 };
 510
 511 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
 512                          struct icu_buf_utf16 * src16,
 513                          UErrorCode *status)
 514 {
 515     if (!tokenizer || !tokenizer->bi || !src16)
 516         return 0;
 517
 518
 519     tokenizer->buf16 = src16;
 520     tokenizer->token_count = 0;
 521     tokenizer->token_id = 0;
 522     tokenizer->token_start = 0;
 523     tokenizer->token_end = 0;
 524
 525     ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
 526
 527
 528     if (U_FAILURE(*status))
 529         return 0;
 530
 531     return 1;
 532 };
 533
 534 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
 535                          struct icu_buf_utf16 * tkn16,
 536                          UErrorCode *status)
 537 {
 538     int32_t tkn_start = 0;
 539     int32_t tkn_end = 0;
 540     int32_t tkn_len = 0;
 541
 542
 543     if (!tokenizer || !tokenizer->bi
 544         || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
 545         return 0;
 546
 547     // never change tokenizer->buf16 and keep always invariant
 548     // 0 <= tokenizer->token_start
 549     //   <= tokenizer->token_end
 550     //   <= tokenizer->buf16->utf16_len
 551     // returns length of token
 552
 553     if (0 == tokenizer->token_end) // first call
 554         tkn_start = ubrk_first(tokenizer->bi);
 555     else //successive calls
 556         tkn_start = tokenizer->token_end;
 557
 558     // get next position
 559     tkn_end = ubrk_next(tokenizer->bi);
 560
 561     // repairing invariant at end of ubrk, which is UBRK_DONE = -1
 562     if (UBRK_DONE == tkn_end)
 563         tkn_end = tokenizer->buf16->utf16_len;
 564
 565     // copy out if everything is well
 566     if(U_FAILURE(*status))
 567         return 0;
 568
 569     // everything OK, now update internal state
 570     tkn_len = tkn_end - tkn_start;
 571
 572     if (0 < tkn_len){
 573         tokenizer->token_count++;
 574         tokenizer->token_id++;
 575     } else {
 576         tokenizer->token_id = 0;
 577     }
 578     tokenizer->token_start = tkn_start;
 579     tokenizer->token_end = tkn_end;
 580
 581
 582     // copying into token buffer if it exists
 583     if (tkn16){
 584         if (tkn16->utf16_cap < tkn_len)
 585             icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
 586
 587         u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
 588                   tkn_len);
 589
 590         tkn16->utf16_len = tkn_len;
 591     }
 592
 593     return tkn_len;
 594 }
 595
 596
 597 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer)
 598 {
 599     return tokenizer->token_id;
 600 };
 601
 602 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer)
 603 {
 604     return tokenizer->token_start;
 605 };
 606
 607 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer)
 608 {
 609     return tokenizer->token_end;
 610 };
 611
 612 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer)
 613 {
 614     return (tokenizer->token_end - tokenizer->token_start);
 615 };
 616
 617 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
 618 {
 619     return tokenizer->token_count;
 620 };
 621
 622
 623
 624 //struct icu_normalizer
 625 //{
 626 //  char action;
 627 //  struct icu_buf_utf16 * rules16;
 628 //  UParseError parse_error[256];
 629 //  UTransliterator * trans;
 630 //};
 631
 632
 633 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
 634                                               UErrorCode *status)
 635 {
 636
 637     struct icu_normalizer * normalizer
 638         = (struct icu_normalizer *) malloc(sizeof(struct icu_normalizer));
 639
 640     normalizer->action = action;
 641     normalizer->trans = 0;
 642     normalizer->rules16 =  icu_buf_utf16_create(0);
 643     icu_utf16_from_utf8_cstr(normalizer->rules16, rules, status);
 644
 645     switch(normalizer->action) {
 646     case 'f':
 647         normalizer->trans
 648             = utrans_openU(normalizer->rules16->utf16,
 649                            normalizer->rules16->utf16_len,
 650                            UTRANS_FORWARD,
 651                            0, 0,
 652                            normalizer->parse_error, status);
 653         break;
 654     case 'r':
 655         normalizer->trans
 656             = utrans_openU(normalizer->rules16->utf16,
 657                            normalizer->rules16->utf16_len,
 658                            UTRANS_REVERSE ,
 659                            0, 0,
 660                            normalizer->parse_error, status);
 661         break;
 662     default:
 663         *status = U_UNSUPPORTED_ERROR;
 664         return 0;
 665         break;
 666     }
 667
 668     if (U_SUCCESS(*status))
 669         return normalizer;
 670
 671     // freeing if failed
 672     icu_normalizer_destroy(normalizer);
 673     return 0;
 674 };
 675
 676
 677 void icu_normalizer_destroy(struct icu_normalizer * normalizer){
 678     if (normalizer) {
 679         if (normalizer->rules16)
 680             icu_buf_utf16_destroy(normalizer->rules16);
 681         if (normalizer->trans)
 682             utrans_close(normalizer->trans);
 683         free(normalizer);
 684     }
 685 };
 686
 687
 688
 689 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
 690                              struct icu_buf_utf16 * dest16,
 691                              struct icu_buf_utf16 * src16,
 692                              UErrorCode *status)
 693 {
 694     if (!normalizer || !normalizer->trans || !src16 || !dest16)
 695         return 0;
 696
 697     if (!icu_buf_utf16_copy(dest16, src16))
 698         return 0;
 699
 700     utrans_transUChars (normalizer->trans,
 701                         dest16->utf16, &(dest16->utf16_len),
 702                         dest16->utf16_cap,
 703                         0, &(src16->utf16_len), status);
 704
 705     if (U_FAILURE(*status)){
 706         dest16->utf16[0] = (UChar) 0;
 707         dest16->utf16_len = 0;
 708     }
 709
 710     return dest16->utf16_len;
 711 }
 712
 713
 714
 715
 716 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
 717                                               enum icu_chain_step_type type,
 718                                               const uint8_t * rule,
 719                                               struct icu_buf_utf16 * buf16,
 720                                               UErrorCode *status)
 721 {
 722     struct icu_chain_step * step = 0;
 723
 724     if(!chain || !type || !rule)
 725         return 0;
 726
 727     step = (struct icu_chain_step *) malloc(sizeof(struct icu_chain_step));
 728
 729     step->type = type;
 730     step->more_tokens = 0;
 731
 732     if (buf16)
 733         step->buf16 = buf16;
 734     else
 735         step->buf16 = 0;
 736
 737     // create auxilary objects
 738     switch(step->type) {
 739     case ICU_chain_step_type_display:
 740         break;
 741     case ICU_chain_step_type_norm:
 742         break;
 743     case ICU_chain_step_type_sort:
 744         break;
 745     case ICU_chain_step_type_charmap:
 746         break;
 747     case ICU_chain_step_type_normalize:
 748         step->u.normalizer = icu_normalizer_create((char *) rule, 'f', status);
 749         break;
 750     case ICU_chain_step_type_tokenize:
 751         step->u.tokenizer = icu_tokenizer_create((char *) chain->locale,
 752                                                  (char) rule[0], status);
 753         break;
 754     default:
 755         break;
 756     }
 757
 758     return step;
 759 };
 760
 761
 762 void icu_chain_step_destroy(struct icu_chain_step * step){
 763
 764     if (!step)
 765         return;
 766
 767     icu_chain_step_destroy(step->previous);
 768
 769     switch(step->type) {
 770     case ICU_chain_step_type_display:
 771         break;
 772     case ICU_chain_step_type_norm:
 773         break;
 774     case ICU_chain_step_type_sort:
 775         break;
 776     case ICU_chain_step_type_charmap:
 777         icu_buf_utf16_destroy(step->buf16);
 778         break;
 779     case ICU_chain_step_type_normalize:
 780         icu_normalizer_destroy(step->u.normalizer);
 781         icu_buf_utf16_destroy(step->buf16);
 782         break;
 783     case ICU_chain_step_type_tokenize:
 784         icu_tokenizer_destroy(step->u.tokenizer);
 785         icu_buf_utf16_destroy(step->buf16);
 786         break;
 787     default:
 788         break;
 789     }
 790
 791
 792 };
 793
 794
 795
 796 struct icu_chain * icu_chain_create(const uint8_t * identifier,
 797                                     const uint8_t * locale)
 798 {
 799
 800     struct icu_chain * chain
 801         = (struct icu_chain *) malloc(sizeof(struct icu_chain));
 802
 803     strncpy((char *) chain->identifier, (const char *) identifier, 128);
 804     chain->identifier[128 - 1] = '\0';
 805     strncpy((char *) chain->locale, (const char *) locale, 16);
 806     chain->locale[16 - 1] = '\0';
 807
 808     chain->token_count = 0;
 809
 810     chain->display8 = icu_buf_utf8_create(0);
 811     chain->norm8 = icu_buf_utf8_create(0);
 812     chain->sort8 = icu_buf_utf8_create(0);
 813
 814     chain->src16 = icu_buf_utf16_create(0);
 815
 816     chain->steps = 0;
 817
 818     return chain;
 819 };
 820
 821
 822 void icu_chain_destroy(struct icu_chain * chain)
 823 {
 824     icu_buf_utf8_destroy(chain->display8);
 825     icu_buf_utf8_destroy(chain->norm8);
 826     icu_buf_utf8_destroy(chain->sort8);
 827
 828     icu_buf_utf16_destroy(chain->src16);
 829
 830     icu_chain_step_destroy(chain->steps);
 831 };
 832
 833
 834 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
 835                                               enum icu_chain_step_type type,
 836                                               const uint8_t * rule,
 837                                               UErrorCode *status)
 838 {
 839     struct icu_chain_step * step = 0;
 840     struct icu_buf_utf16 * src16 = 0;
 841     struct icu_buf_utf16 * buf16 = 0;
 842
 843     if (!chain || !type || !rule)
 844         return 0;
 845
 846     // assign utf16 src buffers as needed
 847     if (chain->steps && chain->steps->buf16)
 848         src16 = chain->steps->buf16;
 849     else if (chain->src16)
 850         src16 = chain->src16;
 851     else
 852         return 0;
 853
 854
 855     // assign utf16 destination buffers as needed, or
 856     // re-use previous uft18 buffer if this step does not touch it
 857     switch(type) {
 858     case ICU_chain_step_type_display:
 859         buf16 = src16;
 860         break;
 861     case ICU_chain_step_type_norm:
 862         buf16 = src16;
 863         break;
 864     case ICU_chain_step_type_sort:
 865         buf16 = src16;
 866         break;
 867     case ICU_chain_step_type_charmap:
 868         buf16 = icu_buf_utf16_create(0);
 869         break;
 870     case ICU_chain_step_type_normalize:
 871         buf16 = icu_buf_utf16_create(0);
 872         break;
 873     case ICU_chain_step_type_tokenize:
 874         buf16 = icu_buf_utf16_create(0);
 875         break;
 876     default:
 877         break;
 878     }
 879
 880     // create actual chain step with this buffer
 881     step = icu_chain_step_create(chain, type, rule, buf16, status);
 882
 883     step->previous = chain->steps;
 884     chain->steps = step;
 885
 886     return step;
 887 };
 888
 889
 890 int icu_chain_step_next_token(struct icu_chain * chain,
 891                               struct icu_chain_step * step,
 892                               UErrorCode *status)
 893 {
 894     struct icu_buf_utf16 * src16 = 0;
 895
 896     printf("icu_chain_step_next_token %d\n", (int) step);
 897
 898     if (!chain || !chain->src16 || !step || !step->more_tokens)
 899         return 0;
 900
 901     // assign utf16 src buffers as neeed, advance in previous steps
 902     // tokens, and setting stop condition
 903     if (step->previous){
 904         src16 = step->previous->buf16;
 905         step->more_tokens
 906             = icu_chain_step_next_token(chain, step->previous, status);
 907     }
 908     else { // first step can only work once on chain->src16 input buffer
 909         src16 = chain->src16;
 910         step->more_tokens = 1;
 911     }
 912
 913     // stop if nothing to process
 914     // i.e new token source was not properly assigned
 915     if (!step->more_tokens || !src16 || !src16->utf16_len) //
 916         return 0;
 917
 918     printf("icu_chain_step_next_token %d working\n", (int) step);
 919
 920
 921     // perform the work, eventually put this steps output in
 922     // step->buf16 or the chains UTF8 output buffers
 923     switch(step->type) {
 924     case ICU_chain_step_type_display:
 925         icu_utf16_to_utf8(chain->display8, src16, status);
 926         break;
 927     case ICU_chain_step_type_norm:
 928         icu_utf16_to_utf8(chain->norm8, src16, status);
 929         break;
 930     case ICU_chain_step_type_sort:
 931         icu_utf16_to_utf8(chain->sort8, src16, status);
 932         break;
 933     case ICU_chain_step_type_charmap:
 934         break;
 935     case ICU_chain_step_type_normalize:
 936         icu_normalizer_normalize(step->u.normalizer,
 937                                  step->buf16, src16, status);
 938         break;
 939     case ICU_chain_step_type_tokenize:
 940         // step->more_tokens
 941         //       = icu_tokenizer_next_token(step->u.tokenizer,
 942         //                               step->buf16, status);
 943         break;
 944     default:
 945         return 0;
 946         break;
 947     }
 948
 949
 950     // stop further token processing if last step
 951     if (!step->previous)
 952         step->more_tokens = 0;
 953
 954
 955     if (U_FAILURE(*status))
 956         return 0;
 957
 958     return 1;
 959 };
 960
 961
 962
 963 int icu_chain_assign_cstr(struct icu_chain * chain,
 964                           const char * src8cstr,
 965                           UErrorCode *status)
 966 {
 967     struct icu_chain_step * stp = chain->steps;
 968
 969     if (!chain || !src8cstr)
 970         return 0;
 971
 972     // clear token count
 973     chain->token_count = 0;
 974
 975     // clear all steps stop states
 976
 977     while (stp){
 978         stp->more_tokens = 1;
 979         stp = stp->previous;
 980     }
 981
 982     // finally convert UTF8 to UTF16 string
 983     icu_utf16_from_utf8_cstr(chain->src16, src8cstr, status);
 984
 985     if (U_FAILURE(*status))
 986         return 0;
 987
 988     return 1;
 989 };
 990
 991
 992
 993 int icu_chain_next_token(struct icu_chain * chain,
 994                          UErrorCode *status)
 995 {
 996     int success = 0;
 997
 998     if (!chain || !chain->steps)
 999         return 0;
1000
1001     success = icu_chain_step_next_token(chain, chain->steps, status);
1002
1003     if (success){
1004         chain->token_count++;
1005         return chain->token_count;
1006     }
1007
1008     return 0;
1009 };
1010
1011 int icu_chain_get_token_count(struct icu_chain * chain)
1012 {
1013     if (!chain)
1014         return 0;
1015
1016     return chain->token_count;
1017 };
1018
1019
1020
1021 const char * icu_chain_get_display(struct icu_chain * chain)
1022 {
1023     if (chain->display8)
1024         return (const char *) chain->display8->utf8;
1025
1026     return 0;
1027 };
1028
1029 const char * icu_chain_get_norm(struct icu_chain * chain)
1030 {
1031     if (chain->norm8)
1032         return (const char *) chain->norm8->utf8;
1033
1034     return 0;
1035 };
1036
1037 const char * icu_chain_get_sort(struct icu_chain * chain)
1038 {
1039     if (chain->sort8)
1040         return (const char *) chain->sort8->utf8;
1041
1042     return 0;
1043 };
1044
1045
1046
1047
1048 #endif // HAVE_ICU
1049
1050
1051
1052
1053 /*
1054  * Local variables:
1055  * c-basic-offset: 4
1056  * indent-tabs-mode: nil
1057  * End:
1058  * vim: shiftwidth=4 tabstop=8 expandtab
1059  */