src/test_icu_I18N.c

   1 /* $Id: test_icu_I18N.c,v 1.12 2007-05-10 10:29:58 marc Exp $
   2    Copyright (c) 2006-2007, Index Data.
   3
   4    This file is part of Pazpar2.
   5
   6    Pazpar2 is free software; you can redistribute it and/or modify it under
   7    the terms of the GNU General Public License as published by the Free
   8    Software Foundation; either version 2, or (at your option) any later
   9    version.
  10
  11    Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  12    WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14    for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Pazpar2; see the file LICENSE.  If not, write to the
  18    Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  19    02111-1307, USA.
  20 */
  21
  22 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  23
  24
  25 #if HAVE_CONFIG_H
  26 #include "cconfig.h"
  27 #endif
  28
  29 #define USE_TIMING 0
  30 #if USE_TIMING
  31 #include <yaz/timing.h>
  32 #endif
  33
  34 #include <yaz/test.h>
  35
  36
  37
  38 #ifdef HAVE_ICU
  39 #include "icu_I18N.h"
  40
  41 #include <string.h>
  42 #include <stdlib.h>
  43
  44 //#include <unicode/ustring.h>
  45 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  46
  47
  48 #define MAX_KEY_SIZE 256
  49 struct icu_termmap
  50 {
  51     uint8_t sort_key[MAX_KEY_SIZE]; // standard C string '\0' terminated
  52     char disp_term[MAX_KEY_SIZE];  // standard C utf-8 string
  53 };
  54
  55
  56
  57 int icu_termmap_cmp(const void *vp1, const void *vp2)
  58 {
  59     struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1;
  60     struct icu_termmap *itmp2 = *(struct icu_termmap **) vp2;
  61
  62     int cmp = 0;
  63
  64     cmp = strcmp((const char *)itmp1->sort_key,
  65                  (const char *)itmp2->sort_key);
  66     return cmp;
  67 };
  68
  69
  70
  71
  72 int test_icu_casemap(const char * locale, char action,
  73                      const char * src8cstr, const char * chk8cstr)
  74 {
  75     int success = 0;
  76     UErrorCode status = U_ZERO_ERROR;
  77
  78     struct icu_buf_utf8 * src8 = icu_buf_utf8_create(0);
  79     struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0);
  80     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
  81     struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0);
  82
  83
  84     int src8cstr_len = strlen(src8cstr);
  85     int chk8cstr_len = strlen(chk8cstr);
  86
  87     // converting to UTF16
  88     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
  89
  90     // perform case mapping
  91     icu_utf16_casemap(dest16, src16, locale, action, &status);
  92
  93     // converting to UTF8
  94     icu_utf16_to_utf8(dest8, dest16, &status);
  95
  96
  97
  98     // determine success
  99     if (dest8->utf8
 100         && (dest8->utf8_len == strlen(chk8cstr))
 101         && !strcmp(chk8cstr, (const char *) dest8->utf8))
 102         success = 1;
 103     else
 104         success = 0;
 105
 106     // report failures
 107     if (!success){
 108         printf("\nERROR\n");
 109         printf("original string:   '%s' (%d)\n", src8cstr, src8cstr_len);
 110         printf("icu_casemap '%s:%c' '%s' (%d)\n",
 111                locale, action, dest8->utf8, dest8->utf8_len);
 112         printf("expected string:   '%s' (%d)\n", chk8cstr, chk8cstr_len);
 113     }
 114
 115     // clean the buffers
 116     icu_buf_utf8_destroy(src8);
 117     icu_buf_utf8_destroy(dest8);
 118     icu_buf_utf16_destroy(src16);
 119     icu_buf_utf16_destroy(dest16);
 120
 121
 122     return success;
 123 }
 124
 125
 126
 127 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 128
 129 void test_icu_I18N_casemap(int argc, char **argv)
 130 {
 131
 132     // Locale 'en'
 133
 134     // sucessful tests
 135     YAZ_CHECK(test_icu_casemap("en", 'l',
 136                                "A ReD fOx hunTS sQUirriLs",
 137                                "a red fox hunts squirrils"));
 138
 139     YAZ_CHECK(test_icu_casemap("en", 'u',
 140                                "A ReD fOx hunTS sQUirriLs",
 141                                "A RED FOX HUNTS SQUIRRILS"));
 142
 143     YAZ_CHECK(test_icu_casemap("en", 'f',
 144                                "A ReD fOx hunTS sQUirriLs",
 145                                "a red fox hunts squirrils"));
 146
 147     YAZ_CHECK(test_icu_casemap("en", 't',
 148                                "A ReD fOx hunTS sQUirriLs",
 149                                "A Red Fox Hunts Squirrils"));
 150
 151
 152     // Locale 'da'
 153
 154     // sucess expected
 155     YAZ_CHECK(test_icu_casemap("da", 'l',
 156                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
 157                                "åh æble, øs fløde i åen efter blåbærgrøden"));
 158
 159     YAZ_CHECK(test_icu_casemap("da", 'u',
 160                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
 161                                "ÅH ÆBLE, ØS FLØDE I ÅEN EFTER BLÅBÆRGRØDEN"));
 162
 163     YAZ_CHECK(test_icu_casemap("da", 'f',
 164                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
 165                                "åh æble, øs fløde i åen efter blåbærgrøden"));
 166
 167     YAZ_CHECK(test_icu_casemap("da", 't',
 168                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
 169                                "Åh Æble, Øs Fløde I Åen Efter Blåbærgrøden"));
 170
 171     // Locale 'de'
 172
 173     // sucess expected
 174     YAZ_CHECK(test_icu_casemap("de", 'l',
 175                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
 176                                "zwölf ärgerliche würste rollen über die straße"));
 177
 178     YAZ_CHECK(test_icu_casemap("de", 'u',
 179                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
 180                                "ZWÖLF ÄRGERLICHE WÜRSTE ROLLEN ÜBER DIE STRASSE"));
 181
 182     YAZ_CHECK(test_icu_casemap("de", 'f',
 183                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
 184                                "zwölf ärgerliche würste rollen über die strasse"));
 185
 186     YAZ_CHECK(test_icu_casemap("de", 't',
 187                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
 188                                "Zwölf Ärgerliche Würste Rollen Über Die Straße"));
 189
 190 }
 191
 192
 193 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 194
 195 int test_icu_sortmap(const char * locale, int src_list_len,
 196                      const char ** src_list, const char ** chk_list)
 197 {
 198     int success = 1;
 199
 200     UErrorCode status = U_ZERO_ERROR;
 201
 202     struct icu_buf_utf8 * buf8 = icu_buf_utf8_create(0);
 203     struct icu_buf_utf16 * buf16 = icu_buf_utf16_create(0);
 204
 205     int i;
 206
 207     struct icu_termmap * list[src_list_len];
 208
 209     UCollator *coll = ucol_open(locale, &status);
 210     icu_check_status(status);
 211
 212     if(U_FAILURE(status))
 213         return 0;
 214
 215     // assigning display terms and sort keys using buf 8 and buf16
 216     for( i = 0; i < src_list_len; i++)
 217         {
 218
 219             list[i] = (struct icu_termmap *) malloc(sizeof(struct icu_termmap));
 220
 221             // copy display term
 222             strcpy(list[i]->disp_term, src_list[i]);
 223
 224             // transforming to UTF16
 225             icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status);
 226             icu_check_status(status);
 227
 228             // computing sortkeys
 229             icu_sortkey8_from_utf16(coll, buf8, buf16, &status);
 230             icu_check_status(status);
 231
 232             // assigning sortkeys
 233             memcpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
 234             //strncpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
 235             //strcpy((char *) list[i]->sort_key, (const char *) buf8->utf8);
 236         }
 237
 238
 239     // do the sorting
 240     qsort(list, src_list_len,
 241           sizeof(struct icu_termmap *), icu_termmap_cmp);
 242
 243     // checking correct sorting
 244     for (i = 0; i < src_list_len; i++){
 245         if (0 != strcmp(list[i]->disp_term, chk_list[i])){
 246             success = 0;
 247         }
 248     }
 249
 250     if(!success){
 251         printf("\nERROR\n");
 252         printf("Input str: '%s' : ", locale);
 253         for (i = 0; i < src_list_len; i++) {
 254             printf(" '%s'", list[i]->disp_term);
 255         }
 256         printf("\n");
 257         printf("ICU sort:  '%s' : ", locale);
 258         for (i = 0; i < src_list_len; i++) {
 259             printf(" '%s'", list[i]->disp_term);
 260             //printf("(%d|%d)", list[i]->sort_key[0],list[i]->sort_key[1]);
 261         }
 262         printf("\n");
 263         printf("Expected:  '%s' : ", locale);
 264         for (i = 0; i < src_list_len; i++) {
 265             printf(" '%s'", chk_list[i]);
 266         }
 267         printf("\n");
 268     }
 269
 270
 271     ucol_close(coll);
 272
 273     icu_buf_utf8_destroy(buf8);
 274     icu_buf_utf16_destroy(buf16);
 275
 276
 277
 278     return success;
 279 }
 280
 281
 282 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 283
 284 void test_icu_I18N_sortmap(int argc, char **argv)
 285 {
 286
 287     // sucessful tests
 288     size_t en_1_len = 6;
 289     const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"};
 290     const char * en_1_cck[6] = {"a", "A", "k", "K", "z", "Z"};
 291     YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck));
 292     YAZ_CHECK(test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck));
 293     YAZ_CHECK(test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck));
 294     YAZ_CHECK(test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck));
 295     YAZ_CHECK(test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck));
 296
 297     // sucessful tests
 298     size_t da_1_len = 6;
 299     const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"};
 300     const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"};
 301     YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
 302     YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));
 303
 304     // sucessful tests
 305     size_t de_1_len = 9;
 306     const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"};
 307     const char * de_1_cck[9] = {"a","ä", "o", "ö", "s", "ß", "t", "u", "ü"};
 308     YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck));
 309     YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
 310     YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));
 311
 312 }
 313
 314
 315 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 316
 317 void test_icu_I18N_normmap(int argc, char **argv)
 318 {
 319
 320
 321 }
 322
 323 int test_icu_tokenizer(const char * locale, char action,
 324                      const char * src8cstr, int count)
 325 {
 326     int success = 1;
 327
 328     UErrorCode status = U_ZERO_ERROR;
 329     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
 330     struct icu_buf_utf16 * tkn16 = icu_buf_utf16_create(0);
 331     struct icu_buf_utf8 * tkn8 = icu_buf_utf8_create(0);
 332
 333     //printf("Input:  '%s'\n", src8cstr);
 334
 335     // transforming to UTF16
 336     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
 337     icu_check_status(status);
 338
 339     // set up tokenizer
 340     struct icu_tokenizer * tokenizer
 341         = icu_tokenizer_create(locale, action, &status);
 342     icu_check_status(status);
 343     YAZ_CHECK(tokenizer);
 344
 345     // attach text buffer to tokenizer
 346     icu_tokenizer_attach(tokenizer, src16, &status);
 347     icu_check_status(status);
 348     YAZ_CHECK(tokenizer->bi);
 349
 350     // perform work on tokens
 351     //printf("Tokens: ");
 352     while(icu_tokenizer_next_token(tokenizer, tkn16, &status)){
 353         icu_check_status(status);
 354
 355         // converting to UTF8
 356         icu_utf16_to_utf8(tkn8, tkn16, &status);
 357
 358         //printf("(%d)'%s' ", icu_tokenizer_token_id(tokenizer), tkn8->utf8);
 359
 360         //printf("token %d %d %d %d '%s'\n",
 361         //
 362         //       icu_tokenizer_token_start(tokenizer),
 363         //       icu_tokenizer_token_end(tokenizer),
 364         //       icu_tokenizer_token_length(tokenizer),
 365         //       tkn8->utf8);
 366     }
 367     //printf("\nTokens: %d\n", icu_tokenizer_token_count(tokenizer));
 368
 369
 370     if (count != icu_tokenizer_token_count(tokenizer)){
 371         success = 0;
 372         printf("\nTokenizer '%s:%c' Error: \n", locale, action);
 373         printf("Input:  '%s'\n", src8cstr);
 374         printf("Tokens: %d", icu_tokenizer_token_count(tokenizer));
 375         printf(", expected: %d\n", count);
 376     }
 377
 378     icu_tokenizer_destroy(tokenizer);
 379     icu_buf_utf16_destroy(src16);
 380     icu_buf_utf16_destroy(tkn16);
 381     icu_buf_utf8_destroy(tkn8);
 382
 383     return success;
 384 }
 385
 386
 387 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 388
 389 void test_icu_I18N_tokenizer(int argc, char **argv)
 390 {
 391
 392
 393     const char * en_str
 394         = "O Romeo, Romeo! wherefore art thou Romeo?";
 395
 396     YAZ_CHECK(test_icu_tokenizer("en", 's', en_str, 2));
 397     YAZ_CHECK(test_icu_tokenizer("en", 'l', en_str, 7));
 398     YAZ_CHECK(test_icu_tokenizer("en", 'w', en_str, 16));
 399     YAZ_CHECK(test_icu_tokenizer("en", 'c', en_str, 41));
 400
 401
 402
 403     const char * fr_str
 404         = "O Romeo, Romeo! wherefore art thou Romeo?";
 405
 406     YAZ_CHECK(test_icu_tokenizer("fr", 's', fr_str, 2));
 407     YAZ_CHECK(test_icu_tokenizer("fr", 'l', fr_str, 7));
 408     YAZ_CHECK(test_icu_tokenizer("fr", 'w', fr_str, 16));
 409     YAZ_CHECK(test_icu_tokenizer("fr", 'c', fr_str, 41));
 410
 411 }
 412
 413
 414
 415
 416
 417 #endif // HAVE_ICU
 418
 419 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 420
 421 int main(int argc, char **argv)
 422 {
 423
 424     YAZ_CHECK_INIT(argc, argv);
 425     YAZ_CHECK_LOG();
 426
 427 #ifdef HAVE_ICU
 428
 429     //test_icu_I18N_casemap_failures(argc, argv);
 430     test_icu_I18N_casemap(argc, argv);
 431     test_icu_I18N_sortmap(argc, argv);
 432     test_icu_I18N_normmap(argc, argv);
 433     test_icu_I18N_tokenizer(argc, argv);
 434
 435 #else // HAVE_ICU
 436
 437     printf("ICU unit tests omitted.\n"
 438            "Please install libicu36-dev and icu-doc or similar\n");
 439     YAZ_CHECK(0 == 0);
 440
 441 #endif // HAVE_ICU
 442
 443     YAZ_CHECK_TERM;
 444 }
 445
 446
 447 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 448
 449
 450
// Disabled code follows (compiled out by the #if 0 below)
 452 #if 0
 453
 454 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 455
 456 void test_icu_I18N_casemap_failures(int argc, char **argv)
 457 {
 458
 459     size_t buf_cap = 128;
 460     char buf[buf_cap];
 461     size_t dest8_len = 0;
 462     NMEM nmem = nmem_create();
 463     char * dest8 = 0;
 464
 465     const char * src8 =  "A ReD fOx hunTS sQUirriLs";
 466     //size_t src8_len = strlen(src8);
 467
 468     //printf("original string:   '%s' (%d)\n", src8, (int) src8_len);
 469
 470     // some calling error needs investigation
 471     dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len,
 472                         src8, "en", 't');
 473     YAZ_CHECK(0 == dest8_len);
 474     //printf("icu_casemap 'en:t' '%s' (%d)\n", dest8, (int) dest8_len);
 475
 476
 477     // attention: does not fail even if no locale 'xy_zz' defined
 478     // it seems to default to english locale
 479     dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len,
 480                         src8, "zz_abc", 'l');
 481     YAZ_CHECK(dest8_len);
 482     //printf("icu_casemap 'zz:l' '%s' (%d)\n", dest8, (int) dest8_len);
 483
 484
 485     // shall fail - no buf buffer defined
 486     dest8 = icu_casemap(nmem, 0, buf_cap, &dest8_len,
 487                         src8, "en", 'l');
 488     YAZ_CHECK(0 == dest8_len);
 489     //printf("icu_casemap 'en:l' '%s' (%d)\n", dest8, (int) dest8_len);
 490
 491     // shall fail - no buf_cap  defined
 492     dest8 = icu_casemap(nmem, buf, 0, &dest8_len,
 493                         src8, "en", 'l');
 494     YAZ_CHECK(0 == dest8_len);
 495     //printf("icu_casemap 'en:l' '%s' (%d)\n", dest8, (int) dest8_len);
 496
 497     // shall fail - no action 'x' defined
 498     dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len,
 499                         src8, "en", 'x');
 500     YAZ_CHECK(0 == dest8_len);
 501     //printf("icu_casemap 'en:x' '%s' (%d)\n", dest8, (int) dest8_len);
 502
 503     nmem_destroy(nmem);
 504 }
 505
 506
 507
 508 #endif
 509
 510
 511
 512 /*
 513  * Local variables:
 514  * c-basic-offset: 4
 515  * indent-tabs-mode: nil
 516  * End:
 517  * vim: shiftwidth=4 tabstop=8 expandtab
 518  */