src/test_icu_I18N.c

   1 /* $Id: test_icu_I18N.c,v 1.19 2007-05-16 12:39:49 marc Exp $
   2    Copyright (c) 2006-2007, Index Data.
   3
   4    This file is part of Pazpar2.
   5
   6    Pazpar2 is free software; you can redistribute it and/or modify it under
   7    the terms of the GNU General Public License as published by the Free
   8    Software Foundation; either version 2, or (at your option) any later
   9    version.
  10
  11    Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  12    WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14    for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Pazpar2; see the file LICENSE.  If not, write to the
  18    Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  19    02111-1307, USA.
  20 */
  21
  22 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  23
  24
  25 #if HAVE_CONFIG_H
  26 #include "cconfig.h"
  27 #endif
  28
  29 #define USE_TIMING 0
  30 #if USE_TIMING
  31 #include <yaz/timing.h>
  32 #endif
  33
  34 #include <yaz/test.h>
  35
  36
  37
  38 #ifdef HAVE_ICU
  39 #include "icu_I18N.h"
  40
  41 #include <string.h>
  42 #include <stdlib.h>
  43
  44 //#include <unicode/ustring.h>
  45 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  46
  47
  48 #define MAX_KEY_SIZE 256
  49 struct icu_termmap
  50 {
  51     uint8_t sort_key[MAX_KEY_SIZE]; // standard C string '\0' terminated
  52     char disp_term[MAX_KEY_SIZE];  // standard C utf-8 string
  53 };
  54
  55
  56
  57 int icu_termmap_cmp(const void *vp1, const void *vp2)
  58 {
  59     struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1;
  60     struct icu_termmap *itmp2 = *(struct icu_termmap **) vp2;
  61
  62     int cmp = 0;
  63
  64     cmp = strcmp((const char *)itmp1->sort_key,
  65                  (const char *)itmp2->sort_key);
  66     return cmp;
  67 };
  68
  69
  70
  71
  72 int test_icu_casemap(const char * locale, char action,
  73                      const char * src8cstr, const char * chk8cstr)
  74 {
  75     int success = 0;
  76     UErrorCode status = U_ZERO_ERROR;
  77
  78     struct icu_buf_utf8 * src8 = icu_buf_utf8_create(0);
  79     struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0);
  80     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
  81     struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0);
  82
  83
  84     int src8cstr_len = strlen(src8cstr);
  85     int chk8cstr_len = strlen(chk8cstr);
  86
  87     // converting to UTF16
  88     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
  89
  90     // perform case mapping
  91     icu_utf16_casemap(dest16, src16, locale, action, &status);
  92
  93     // converting to UTF8
  94     icu_utf16_to_utf8(dest8, dest16, &status);
  95
  96
  97
  98     // determine success
  99     if (dest8->utf8
 100         && (dest8->utf8_len == strlen(chk8cstr))
 101         && !strcmp(chk8cstr, (const char *) dest8->utf8))
 102         success = 1;
 103     else
 104         success = 0;
 105
 106     // report failures
 107     if (!success){
 108         printf("\nERROR\n");
 109         printf("original string:   '%s' (%d)\n", src8cstr, src8cstr_len);
 110         printf("icu_casemap '%s:%c' '%s' (%d)\n",
 111                locale, action, dest8->utf8, dest8->utf8_len);
 112         printf("expected string:   '%s' (%d)\n", chk8cstr, chk8cstr_len);
 113     }
 114
 115     // clean the buffers
 116     icu_buf_utf8_destroy(src8);
 117     icu_buf_utf8_destroy(dest8);
 118     icu_buf_utf16_destroy(src16);
 119     icu_buf_utf16_destroy(dest16);
 120
 121
 122     return success;
 123 }
 124
 125
 126
 127 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 128
  /* Case-mapping checks: passes mixed-case English, Danish and German
   * sentences to test_icu_casemap() with the actions 'l', 'u', 'f' and 't',
   * each call wrapped in YAZ_CHECK.  argc and argv are not used. */
  129 void test_icu_I18N_casemap(int argc, char **argv)
  130 {
  131
  132     // Locale 'en'
  133
  134     // successful tests
  135     YAZ_CHECK(test_icu_casemap("en", 'l',
  136                                "A ReD fOx hunTS sQUirriLs",
  137                                "a red fox hunts squirrils"));
  138
  139     YAZ_CHECK(test_icu_casemap("en", 'u',
  140                                "A ReD fOx hunTS sQUirriLs",
  141                                "A RED FOX HUNTS SQUIRRILS"));
  142
  143     YAZ_CHECK(test_icu_casemap("en", 'f',
  144                                "A ReD fOx hunTS sQUirriLs",
  145                                "a red fox hunts squirrils"));
  146
  147     YAZ_CHECK(test_icu_casemap("en", 't',
  148                                "A ReD fOx hunTS sQUirriLs",
  149                                "A Red Fox Hunts Squirrils"));
  150
  151
  152     // Locale 'da'
  153
  154     // success expected
  155     YAZ_CHECK(test_icu_casemap("da", 'l',
  156                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
  157                                "åh æble, øs fløde i åen efter blåbærgrøden"));
  158
  159     YAZ_CHECK(test_icu_casemap("da", 'u',
  160                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
  161                                "ÅH ÆBLE, ØS FLØDE I ÅEN EFTER BLÅBÆRGRØDEN"));
  162
  163     YAZ_CHECK(test_icu_casemap("da", 'f',
  164                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
  165                                "åh æble, øs fløde i åen efter blåbærgrøden"));
  166
  167     YAZ_CHECK(test_icu_casemap("da", 't',
  168                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
  169                                "Åh Æble, Øs Fløde I Åen Efter Blåbærgrøden"));
  170
  171     // Locale 'de'
  172
  173     // success expected
  174     YAZ_CHECK(test_icu_casemap("de", 'l',
  175                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
  176                                "zwölf ärgerliche würste rollen über die straße"));
  177
            // the expected upper-case form spells the sharp s as "SS"
  178     YAZ_CHECK(test_icu_casemap("de", 'u',
  179                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
  180                                "ZWÖLF ÄRGERLICHE WÜRSTE ROLLEN ÜBER DIE STRASSE"));
  181
            // the expected 'f' form spells the sharp s as "ss", unlike 'l' above
  182     YAZ_CHECK(test_icu_casemap("de", 'f',
  183                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
  184                                "zwölf ärgerliche würste rollen über die strasse"));
  185
  186     YAZ_CHECK(test_icu_casemap("de", 't',
  187                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
  188                                "Zwölf Ärgerliche Würste Rollen Über Die Straße"));
  189
  190 }
 191
 192
 193 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 194
 195 int test_icu_sortmap(const char * locale, int src_list_len,
 196                      const char ** src_list, const char ** chk_list)
 197 {
 198     int success = 1;
 199
 200     UErrorCode status = U_ZERO_ERROR;
 201
 202     struct icu_buf_utf8 * buf8 = icu_buf_utf8_create(0);
 203     struct icu_buf_utf16 * buf16 = icu_buf_utf16_create(0);
 204
 205     int i;
 206
 207     struct icu_termmap * list[src_list_len];
 208
 209     UCollator *coll = ucol_open(locale, &status);
 210     icu_check_status(status);
 211
 212     if(U_FAILURE(status))
 213         return 0;
 214
 215     // assigning display terms and sort keys using buf 8 and buf16
 216     for( i = 0; i < src_list_len; i++)
 217         {
 218
 219             list[i] = (struct icu_termmap *) malloc(sizeof(struct icu_termmap));
 220
 221             // copy display term
 222             strcpy(list[i]->disp_term, src_list[i]);
 223
 224             // transforming to UTF16
 225             icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status);
 226             icu_check_status(status);
 227
 228             // computing sortkeys
 229             icu_sortkey8_from_utf16(coll, buf8, buf16, &status);
 230             icu_check_status(status);
 231
 232             // assigning sortkeys
 233             memcpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
 234             //strncpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
 235             //strcpy((char *) list[i]->sort_key, (const char *) buf8->utf8);
 236         }
 237
 238
 239     // do the sorting
 240     qsort(list, src_list_len,
 241           sizeof(struct icu_termmap *), icu_termmap_cmp);
 242
 243     // checking correct sorting
 244     for (i = 0; i < src_list_len; i++){
 245         if (0 != strcmp(list[i]->disp_term, chk_list[i])){
 246             success = 0;
 247         }
 248     }
 249
 250     if(!success){
 251         printf("\nERROR\n");
 252         printf("Input str: '%s' : ", locale);
 253         for (i = 0; i < src_list_len; i++) {
 254             printf(" '%s'", list[i]->disp_term);
 255         }
 256         printf("\n");
 257         printf("ICU sort:  '%s' : ", locale);
 258         for (i = 0; i < src_list_len; i++) {
 259             printf(" '%s'", list[i]->disp_term);
 260             //printf("(%d|%d)", list[i]->sort_key[0],list[i]->sort_key[1]);
 261         }
 262         printf("\n");
 263         printf("Expected:  '%s' : ", locale);
 264         for (i = 0; i < src_list_len; i++) {
 265             printf(" '%s'", chk_list[i]);
 266         }
 267         printf("\n");
 268     }
 269
 270
 271
 272     for( i = 0; i < src_list_len; i++)
 273         free(list[i]);
 274
 275
 276     ucol_close(coll);
 277
 278     icu_buf_utf8_destroy(buf8);
 279     icu_buf_utf16_destroy(buf16);
 280
 281     return success;
 282 }
 283
 284
 285 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 286
  /* Collation checks: each *_src array is passed to test_icu_sortmap()
   * together with the expected order in the matching *_cck array, once per
   * listed locale, each call wrapped in YAZ_CHECK.  argc and argv are not
   * used. */
  287 void test_icu_I18N_sortmap(int argc, char **argv)
  288 {
  289
  290     // successful tests
            // NOTE(review): the *_len variables are size_t, but
            // test_icu_sortmap() takes an int length - implicit conversion
  291     size_t en_1_len = 6;
  292     const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"};
  293     const char * en_1_cck[6] = {"a", "A", "k", "K", "z", "Z"};
  294     YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck));
  295     YAZ_CHECK(test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck));
  296     YAZ_CHECK(test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck));
  297     YAZ_CHECK(test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck));
  298     YAZ_CHECK(test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck));
  299
  300     // successful tests
            // the expected Danish order puts the three non-ASCII letters after "z"
  301     size_t da_1_len = 6;
  302     const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"};
  303     const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"};
  304     YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
  305     YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));
  306
  307     // successful tests
            // the expected German order puts each umlaut directly after its
            // base letter, and the sharp s directly after "s"
  308     size_t de_1_len = 9;
  309     const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"};
  310     const char * de_1_cck[9] = {"a","ä", "o", "ö", "s", "ß", "t", "u", "ü"};
  311     YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck));
  312     YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
  313     YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));
  314
  315 }
 316
 317
 318 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 319
 320
 321
 322
 323 int test_icu_transliterator(const char * rules8cstr,
 324                             const char * src8cstr,
 325                             const char * chk8cstr)
 326 {
 327     int success = 0;
 328
 329     UErrorCode status = U_ZERO_ERROR;
 330     UParseError parse_error[256];
 331
 332
 333     struct icu_buf_utf16 * rules16 = icu_buf_utf16_create(0);
 334     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
 335     struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0);
 336     struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0);
 337
 338     icu_utf16_from_utf8_cstr(rules16, rules8cstr, &status);
 339     icu_check_status(status);
 340
 341     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
 342     icu_check_status(status);
 343
 344
 345     struct icu_normalizer * normalizer
 346         = icu_normalizer_create((const char *) rules16, 'f', &status);
 347     icu_check_status(status);
 348
 349     icu_normalizer_normalize(normalizer, dest16, src16, &status);
 350
 351
 352     icu_utf16_to_utf8(dest8, src16, &status);
 353     icu_check_status(status);
 354
 355
 356     if(!strcmp((const char *) dest8->utf8,
 357                (const char *) chk8cstr))
 358         success = 1;
 359     else {
 360         success = 0;
 361         printf("Normaliozation;");
 362         printf("Rules:      '%s'\n", rules8cstr);
 363         printf("Input:      '%s'\n", src8cstr);
 364         printf("Normalized: '%s'\n", dest8->utf8);
 365         printf("Expected:   '%s'\n", chk8cstr);
 366     }
 367
 368
 369     icu_normalizer_destroy(normalizer);
 370     icu_buf_utf16_destroy(rules16);
 371     icu_buf_utf16_destroy(src16);
 372     icu_buf_utf16_destroy(dest16);
 373     icu_buf_utf8_destroy(dest8);
 374
 375     return success;
 376 }
 377
 378
  379 #if 0
      /*
       * Everything up to the matching #endif is excluded from compilation.
       * It holds an earlier variant of test_icu_transliterator() that calls
       * utrans_openU() / utrans_transUChars() directly, followed by a
       * printf() of UnicodeSet pattern and transform-rule examples.
       * That printf() sits outside any function body, so this region cannot
       * simply be re-enabled as it stands.
       */
  380
  381 int test_icu_transliterator(const char * rules8cstr,
  382                             const char * src8cstr,
  383                             const char * chk8cstr)
  384 {
  385     int success = 0;
  386
  387     UErrorCode status = U_ZERO_ERROR;
  388     UParseError parse_error[256];
  389
  390
  391     struct icu_buf_utf16 * rules16 = icu_buf_utf16_create(0);
  392     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
  393     struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0);
  394     struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0);
  395
  396     icu_utf16_from_utf8_cstr(rules16, rules8cstr, &status);
  397     icu_check_status(status);
  398
  399     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
  400     icu_check_status(status);
  401
            // the rule text is passed in the second/third argument here; the
            // commented-out call below passes it in the fifth/sixth instead
  402     UTransliterator * trans
  403         = utrans_openU(rules16->utf16, rules16->utf16_len,
  404                        UTRANS_FORWARD,
  405                        0, 0,
  406                        parse_error, &status);
  407
  408     //= utrans_openU(0, 0, UTRANS_FORWARD,
  409     //                   rules16->utf16, rules16->utf16_len,
  410     //                   parse_error, &status);
  411
  412     icu_check_status(status);
  413     if(U_FAILURE(status)) {
  414       printf("Parse Error: \n line %d offset %d \n '%s'\n",
  415               parse_error->line, parse_error->offset,
  416              rules8cstr);
  417     }
  418
            // transliterates src16 in place; dest16 is never written
            // NOTE(review): the same &(src16->utf16_len) is passed for two
            // different pointer arguments - check against the ICU signature
  419     utrans_transUChars(trans, src16->utf16, &(src16->utf16_len),
  420                         src16->utf16_cap,
  421                         0, &(src16->utf16_len), &status);
  422
  423     icu_utf16_to_utf8(dest8, src16, &status);
  424     icu_check_status(status);
  425
  426
  427     if(!strcmp((const char *) dest8->utf8,
  428                (const char *) chk8cstr))
  429         success = 1;
  430     else {
  431         success = 0;
  432         printf("Normaliozation;");
  433         printf("Rules:      '%s'\n", rules8cstr);
  434         printf("Input:      '%s'\n", src8cstr);
  435         printf("Normalized: '%s'\n", dest8->utf8);
  436         printf("Expected:   '%s'\n", chk8cstr);
  437     }
  438
  439
  440     utrans_close (trans);
  441     icu_buf_utf16_destroy(rules16);
  442     icu_buf_utf16_destroy(src16);
  443     icu_buf_utf16_destroy(dest16);
  444     icu_buf_utf8_destroy(dest8);
  445
  446     return success;
  447 }
  448
            // reference text only: this statement is not inside any function
  449     printf("\n\nUnicode Set Patterns:\n"
  450              "   Pattern         Description\n"
  451              "   Ranges          [a-z]  The lower case letters a through z\n"
  452              "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
  453              "   String          [abc{def}] chars a, b and c, and string 'def'\n"
  454              "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
  455              "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
  456              "\n"
  457              "   Combination     Example\n"
  458              "   Union           [[:Greek:] [:letter:]]\n"
  459              "   Intersection    [[:Greek:] & [:letter:]]\n"
  460              "   Set Complement  [[:Greek:] - [:letter:]]\n"
  461              "   Complement      [^[:Greek:] [:letter:]]\n"
  462              "\n"
  463              "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
  464              "\n"
  465              "Examples:\n"
  466              "   [:Punctuation:] Any-Remove\n"
  467              "   [:Cased-Letter:] Any-Upper\n"
  468              "   [:Control:] Any-Remove\n"
  469              "   [:Decimal_Number:] Any-Remove\n"
  470              "   [:Final_Punctuation:] Any-Remove\n"
  471              "   [:Georgian:] Any-Upper\n"
  472              "   [:Katakana:] Any-Remove\n"
  473              "   [:Arabic:] Any-Remove\n"
  474              "   [:Punctuation:] Remove\n"
  475              "   [[:Punctuation:]-[.,]] Remove\n"
  476              "   [:Line_Separator:] Any-Remove\n"
  477              "   [:Math_Symbol:] Any-Remove\n"
  478              "   Lower; [:^Letter:] Remove (word tokenization)\n"
  479              "   [:^Number:] Remove (numeric tokenization)\n"
  480              "   [:^Katagana:] Remove (remove everything except Katagana)\n"
  481              "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
  482              "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
  483              "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
  484              "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
  485              "\n"
  486              "see http://icu.sourceforge.net/userguide/Transform.html\n"
  487              "    http://www.unicode.org/Public/UNIDATA/UCD.html\n"
  488              "    http://icu.sourceforge.net/userguide/Transform.html\n"
  489              "    http://icu.sourceforge.net/userguide/TransformRule.html\n"
  490              );
  491 #endif
 492
 493
 494 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 495
  /* Transliteration checks: each call hands a rule string, an input and the
   * expected output to test_icu_transliterator(), wrapped in YAZ_CHECK.
   * argc and argv are not used. */
  496 void test_icu_I18N_transliterator(int argc, char **argv)
  497 {
  498
            // expected: apostrophe and exclamation mark removed
  499     YAZ_CHECK(test_icu_transliterator("[:Punctuation:] Any-Remove",
  500                                       "Don't shoot!",
  501                                       "Dont shoot"));
  502
            // expected: the embedded newline removed
  503     YAZ_CHECK(test_icu_transliterator("[:Control:] Any-Remove",
  504                                       "Don't\n shoot!",
  505                                       "Don't shoot!"));
  506
            // expected: the digit removed, both surrounding spaces kept
  507     YAZ_CHECK(test_icu_transliterator("[:Decimal_Number:] Any-Remove",
  508                                       "This is 4 you!",
  509                                       "This is  you!"));
  510
            // expected: lower-cased, only letters kept
  511     YAZ_CHECK(test_icu_transliterator("Lower; [:^Letter:] Remove",
  512                                       "Don't shoot!",
  513                                       "dontshoot"));
  514
            // expected: only the digits kept
  515     YAZ_CHECK(test_icu_transliterator("[:^Number:] Remove",
  516                                       "Monday 15th of April",
  517                                       "15"));
  518
            // the two adjacent literals form one rule string
  519     YAZ_CHECK(test_icu_transliterator("Lower;"
  520                                       "[[:WhiteSpace:][:Punctuation:]] Remove",
  521                                       " word4you? ",
  522                                       "word4you"));
  523
  524
            // expected: accents stripped, base letters kept
  525     YAZ_CHECK(test_icu_transliterator("NFD; [:Nonspacing Mark:] Remove; NFC",
  526                                       "à côté de l'alcôve ovoïde",
  527                                       "a cote de l'alcove ovoide"));
  528
  529 }
 530
 531
 532 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 533
 534 int test_icu_tokenizer(const char * locale, char action,
 535                      const char * src8cstr, int count)
 536 {
 537     int success = 1;
 538
 539     UErrorCode status = U_ZERO_ERROR;
 540     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
 541     struct icu_buf_utf16 * tkn16 = icu_buf_utf16_create(0);
 542     struct icu_buf_utf8 * tkn8 = icu_buf_utf8_create(0);
 543
 544     //printf("Input:  '%s'\n", src8cstr);
 545
 546     // transforming to UTF16
 547     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
 548     icu_check_status(status);
 549
 550     // set up tokenizer
 551     struct icu_tokenizer * tokenizer
 552         = icu_tokenizer_create(locale, action, &status);
 553     icu_check_status(status);
 554     YAZ_CHECK(tokenizer);
 555
 556     // attach text buffer to tokenizer
 557     icu_tokenizer_attach(tokenizer, src16, &status);
 558     icu_check_status(status);
 559     YAZ_CHECK(tokenizer->bi);
 560
 561     // perform work on tokens
 562     //printf("Tokens: ");
 563     while(icu_tokenizer_next_token(tokenizer, tkn16, &status)){
 564         icu_check_status(status);
 565
 566         // converting to UTF8
 567         icu_utf16_to_utf8(tkn8, tkn16, &status);
 568
 569         //printf("(%d)'%s' ", icu_tokenizer_token_id(tokenizer), tkn8->utf8);
 570
 571         //printf("token %d %d %d %d '%s'\n",
 572         //
 573         //       icu_tokenizer_token_start(tokenizer),
 574         //       icu_tokenizer_token_end(tokenizer),
 575         //       icu_tokenizer_token_length(tokenizer),
 576         //       tkn8->utf8);
 577     }
 578     //printf("\nTokens: %d\n", icu_tokenizer_token_count(tokenizer));
 579
 580
 581     if (count != icu_tokenizer_token_count(tokenizer)){
 582         success = 0;
 583         printf("\nTokenizer '%s:%c' Error: \n", locale, action);
 584         printf("Input:  '%s'\n", src8cstr);
 585         printf("Tokens: %d", icu_tokenizer_token_count(tokenizer));
 586         printf(", expected: %d\n", count);
 587     }
 588
 589     icu_tokenizer_destroy(tokenizer);
 590     icu_buf_utf16_destroy(src16);
 591     icu_buf_utf16_destroy(tkn16);
 592     icu_buf_utf8_destroy(tkn8);
 593
 594     return success;
 595 }
 596
 597
 598 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 599
  /* Tokenizer checks: one English and one Danish text are passed to
   * test_icu_tokenizer() with the actions 's', 'l', 'w' and 'c' and the
   * token count expected for each, wrapped in YAZ_CHECK.  argc and argv are
   * not used. */
  600 void test_icu_I18N_tokenizer(int argc, char **argv)
  601 {
  602
  603
  604     const char * en_str
  605         = "O Romeo, Romeo! wherefore art thou Romeo?";
  606
  607     YAZ_CHECK(test_icu_tokenizer("en", 's', en_str, 2));
  608     YAZ_CHECK(test_icu_tokenizer("en", 'l', en_str, 7));
  609     YAZ_CHECK(test_icu_tokenizer("en", 'w', en_str, 16));
  610     YAZ_CHECK(test_icu_tokenizer("en", 'c', en_str, 41));
  611
  612
  613
            // two adjacent literals forming one string
  614     const char * da_str
  615         = "Blåbærtærte. Denne kage stammer fra Finland. "
  616         "Den er med blåbær, men alle sommerens forskellige bær kan bruges.";
  617
  618     YAZ_CHECK(test_icu_tokenizer("da", 's', da_str, 3));
            // NOTE(review): locale "dar" differs from the "da" used by the
            // sibling checks - possibly a typo; confirm before changing
  619     YAZ_CHECK(test_icu_tokenizer("dar", 'l', da_str, 17));
  620     YAZ_CHECK(test_icu_tokenizer("da", 'w', da_str, 37));
  621     YAZ_CHECK(test_icu_tokenizer("da", 'c', da_str, 110));
  622
  623 }
 624
 625
 626 void test_icu_I18N_chain(int argc, char **argv)
 627 {
 628     const char * en_str
 629         = "O Romeo, Romeo! wherefore art thou Romeo?";
 630
 631     UErrorCode status = U_ZERO_ERROR;
 632     struct icu_chain_step * step = 0;
 633     struct icu_chain * chain
 634         = icu_chain_create((uint8_t *) "en:sentence", (uint8_t *) "en");
 635 /*     step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, */
 636 /*                                  (const uint8_t *) "[:Control:] Any-Remove", */
 637 /*                                  &status); */
 638 /*     step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize, */
 639 /*                                  (const uint8_t *) "w", */
 640 /*                                  &status); */
 641 /*     step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, */
 642 /*                                  (const uint8_t *) */
 643 /*                                  "[[:WhiteSpace:][:Punctuation:]] Any-Remove", */
 644 /*                                  &status); */
 645     step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
 646                                  (const uint8_t *)"",
 647                                  &status);
 648     step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
 649                                  (const uint8_t *) "Lower",
 650                                  &status);
 651     step = icu_chain_insert_step(chain, ICU_chain_step_type_norm,
 652                                  (const uint8_t *)"",
 653                                  &status);
 654 /*     step = icu_chain_insert_step(chain, ICU_chain_step_type_sort, */
 655 /*                                  (const uint8_t *)"", */
 656 /*                                  &status); */
 657
 658
 659
 660
 661     YAZ_CHECK(icu_chain_assign_cstr(chain, en_str, &status));
 662
 663     while (icu_chain_next_token(chain, &status)){
 664         printf("token %d norm: '%s' display: '%s'\n",
 665                icu_chain_get_token_count(chain),
 666                icu_chain_get_norm(chain),
 667                icu_chain_get_display(chain));
 668     }
 669
 670     icu_chain_destroy(chain);
 671 }
 672
 673
 674
 675 #endif // HAVE_ICU
 676
 677 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 678
  /* Test program entry point.  Initialises the YAZ check framework, then,
   * when HAVE_ICU is defined, runs the casemap, sortmap and tokenizer test
   * groups; the calls for the casemap_failures, transliterator and chain
   * groups are commented out.  Without HAVE_ICU it prints a notice and
   * records a single trivially true check. */
  679 int main(int argc, char **argv)
  680 {
  681
  682     YAZ_CHECK_INIT(argc, argv);
  683     YAZ_CHECK_LOG();
  684
  685 #ifdef HAVE_ICU
  686
  687     //test_icu_I18N_casemap_failures(argc, argv);
  688     test_icu_I18N_casemap(argc, argv);
  689     test_icu_I18N_sortmap(argc, argv);
  690     //test_icu_I18N_transliterator(argc, argv);
  691     test_icu_I18N_tokenizer(argc, argv);
  692     //test_icu_I18N_chain(argc, argv);
  693
  694 #else // HAVE_ICU
  695
  696     printf("ICU unit tests omitted.\n"
  697            "Please install libicu36-dev and icu-doc or similar\n");
  698     YAZ_CHECK(0 == 0);
  699
  700 #endif // HAVE_ICU
  701
            // no explicit return statement follows; presumably this macro
            // supplies main's return value - confirm in <yaz/test.h>
  702     YAZ_CHECK_TERM;
  703 }
 704
 705
 706 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 707
 708
 709
 710 /*
 711  * Local variables:
 712  * c-basic-offset: 4
 713  * indent-tabs-mode: nil
 714  * End:
 715  * vim: shiftwidth=4 tabstop=8 expandtab
 716  */