1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) Index Data
3 * See the file LICENSE for details.
16 #include <yaz/options.h>
20 #include <unicode/ucnv.h>
21 #include <unicode/ustring.h>
22 #include <unicode/ucol.h>
23 #include <unicode/ubrk.h>
24 #include <unicode/utrans.h>
25 #include <unicode/uclean.h>
28 #include <yaz/wrbuf.h>
29 #include <yaz/backtrace.h>
31 /* command line and config parameters */
38 yaz_icu_chain_t chain;
/*
 * Print the yaz-icu usage text to stderr: the option summary, example
 * invocations and a sample ICU chain XML configuration.
 *
 * p_config is not referenced in the visible part of the body.
 *
 * NOTE(review): the end of this function lies outside the visible lines.
 * read_params() carries on after calling it as if it does not return;
 * confirm that it terminates the program.
 * NOTE(review): the -x help line spells "instead" as "instread".
 */
43 void print_option_error(const struct config_t *p_config)
45 fprintf(stderr, "yaz-icu [options] [infile]\n"
47 " -c file XML configuration\n"
48 " -p a|c|l|t Print ICU info \n"
49 " -s Show sort normalization key\n"
50 " -o Show org positions\n"
51 " -x XML output instread of text\n"
54 "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
59 "Example ICU chain XML configuration file:\n"
60 "<icu_chain locale=\"en\">\n"
61 " <transform rule=\"[:Control:] Any-Remove\"/>\n"
62 " <tokenize rule=\"l\"/>\n"
63 " <transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
64 " <casemap rule=\"l\"/>\n"
/*
 * Fill *p_config from the command line.
 *
 * The fields are first reset to their defaults (output goes to stdout).
 * Arguments are then consumed with options() using the spec "c:op:sx"
 * until it returns -2.  The visible actions are: copy the argument into
 * conffile or print, raise the sortoutput / xmloutput / org_output flags,
 * and open a single input file for reading.  A second input file or an
 * invalid option is reported on stderr followed by the usage text.
 *
 * After parsing, infile falls back to stdin when no file was given, and
 * the usage text is printed when neither conffile nor print was set.
 *
 * NOTE(review): the switch/case labels are outside the visible lines, so
 * which option letter triggers which action is inferred from the usage
 * text only.
 */
70 void read_params(int argc, char **argv, struct config_t *p_config)
75 /* set default parameters */
76 p_config->conffile[0] = 0;
77 p_config->print[0] = 0;
78 p_config->xmloutput = 0;
79 p_config->sortoutput = 0;
82 p_config->outfile = stdout;
83 p_config->org_output = 0;
85 /* set up command line parameters */
87 while ((ret = options("c:op:sx", argv, argc, &arg)) != -2)
/* NOTE(review): both strcpy() calls copy a user-supplied argument with no
 * length check; an argument longer than the destination overflows it.
 * Consider a bounded copy once the field sizes are confirmed. */
92 strcpy(p_config->conffile, arg);
95 strcpy(p_config->print, arg);
98 p_config->sortoutput = 1;
101 p_config->xmloutput = 1;
104 p_config->org_output = 1;
/* a non-null infile here means an input file was already opened */
107 if (p_config->infile)
109 fprintf(stderr, "yaz-icu: only one input file may be given\n");
110 print_option_error(p_config);
112 p_config->infile = fopen(arg, "r");
113 if (!p_config->infile)
115 fprintf(stderr, "yaz-icu: cannot open %s : %s\n",
116 arg, strerror(errno));
121 fprintf(stderr, "yaz_icu: invalid option: %s\n", arg);
122 print_option_error(p_config);
/* no input file named on the command line: read from standard input */
126 if (p_config->infile == 0)
127 p_config->infile = stdin;
/* at least one of the config file and the print selector is required */
129 if (!strlen(p_config->conffile) && !strlen(p_config->print))
130 print_option_error(p_config);
/*
 * List the ICU converters on p_config->outfile.
 *
 * The number of converters comes from ucnv_countAvailable() and the default
 * converter name from ucnv_getDefaultName().  Depending on
 * p_config->xmloutput the listing is written either as a <converters>
 * element holding one <converter id="..."/> per entry, or as plain text
 * with one converter name per line.
 */
133 static void print_icu_converters(const struct config_t *p_config)
138 count = ucnv_countAvailable();
139 if (p_config->xmloutput)
140 fprintf(p_config->outfile, "<converters count=\"%d\" default=\"%s\">\n",
141 count, ucnv_getDefaultName());
144 fprintf(p_config->outfile, "Available ICU converters: %d\n", count);
145 fprintf(p_config->outfile, "Default ICU Converter is: '%s'\n",
146 ucnv_getDefaultName());
/* one output line per converter index in [0, count) */
149 for (i = 0; i < count; i++)
151 if (p_config->xmloutput)
152 fprintf(p_config->outfile, "<converter id=\"%s\"/>\n",
153 ucnv_getAvailableName(i));
155 fprintf(p_config->outfile, "%s\n", ucnv_getAvailableName(i));
158 if (p_config->xmloutput)
159 fprintf(p_config->outfile, "</converters>\n");
161 fprintf(p_config->outfile, "\n");
/*
 * List the ICU transliterator IDs on p_config->outfile.
 *
 * The IDs are enumerated with utrans_openIDs() / uenum_next() and counted
 * with uenum_count().  In XML mode they are written as
 * <transliterator id="..."/> elements inside <transliterators>; otherwise
 * one ID per line.  A block of help text on Unicode set patterns and
 * example transform rules is also written to the output file.
 *
 * NOTE(review): IDs are written into the id="..." attribute with a plain
 * "%s", i.e. without XML escaping.
 * NOTE(review): the release of the enumeration `en` is not in the visible
 * lines - confirm that it is closed.
 * NOTE(review): the help text spells "Katakana" as "Katagana" in several
 * places, including inside the [:^Katagana:] example rule.
 */
164 static void print_icu_transliterators(const struct config_t *p_config)
167 UEnumeration *en = utrans_openIDs(&status);
168 int32_t count = uenum_count(en, &status);
172 if (p_config->xmloutput)
173 fprintf(p_config->outfile, "<transliterators count=\"%d\">\n", count);
175 fprintf(p_config->outfile, "Available ICU transliterators: %d\n", count);
/* uenum_next() yields a null pointer once the enumeration is exhausted */
177 while ((name = uenum_next(en, &length, &status)))
179 if (p_config->xmloutput)
180 fprintf(p_config->outfile, "<transliterator id=\"%s\"/>\n", name);
182 fprintf(p_config->outfile, "%s\n", name);
185 if (p_config->xmloutput)
186 fprintf(p_config->outfile, "</transliterators>\n");
189 fprintf(p_config->outfile, "\n\nUnicode Set Patterns:\n"
190 " Pattern Description\n"
191 " Ranges [a-z] The lower case letters a through z\n"
192 " Named Chars [abc123] The six characters a,b,c,1,2 and 3\n"
193 " String [abc{def}] chars a, b and c, and string 'def'\n"
194 " Categories [\\p{Letter}] Perl General Category 'Letter'.\n"
195 " Categories [:Letter:] Posix General Category 'Letter'.\n"
197 " Combination Example\n"
198 " Union [[:Greek:] [:letter:]]\n"
199 " Intersection [[:Greek:] & [:letter:]]\n"
200 " Set Complement [[:Greek:] - [:letter:]]\n"
201 " Complement [^[:Greek:] [:letter:]]\n"
203 "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
206 " [:Punctuation:] Any-Remove\n"
207 " [:Cased-Letter:] Any-Upper\n"
208 " [:Control:] Any-Remove\n"
209 " [:Decimal_Number:] Any-Remove\n"
210 " [:Final_Punctuation:] Any-Remove\n"
211 " [:Georgian:] Any-Upper\n"
212 " [:Katakana:] Any-Remove\n"
213 " [:Arabic:] Any-Remove\n"
214 " [:Punctuation:] Remove\n"
215 " [[:Punctuation:]-[.,]] Remove\n"
216 " [:Line_Separator:] Any-Remove\n"
217 " [:Math_Symbol:] Any-Remove\n"
218 " Lower; [:^Letter:] Remove (word tokenization)\n"
219 " [:^Number:] Remove (numeric tokenization)\n"
220 " [:^Katagana:] Remove (remove everything except Katagana)\n"
221 " Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
222 " NFD; [:Nonspacing Mark:] Remove; NFC (removes accents from characters)\n"
223 " [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
224 " [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
226 "see http://userguide.icu-project.org/transforms/general\n"
227 " http://www.unicode.org/reports/tr44/\n"
231 fprintf(p_config->outfile, "\n\n");
/*
 * List the ICU locales on p_config->outfile.
 *
 * The header reports uloc_countAvailable(), uloc_getDefault() and, in XML
 * mode, ucol_countAvailable().  For every available locale the English
 * ("en") display keyword, language, script, country, variant and name are
 * fetched, plus the display name expressed in the locale itself; each is
 * converted to UTF-8 with u_strToUTF8() using a capacity of 128.
 *
 * XML mode writes one <locale> element per locale, with an attribute for
 * each non-empty string and the local name as element content.
 *
 * NOTE(review): the `else if (1 == p_config->xmloutput)` branch follows an
 * `if (p_config->xmloutput)` test, so it appears to be unreachable; in
 * non-XML mode only the bare locale id line is printed.
 * NOTE(review): `status` is examined only once, after the loop, and is not
 * reset in the visible lines - confirm how a failing locale affects the
 * strings printed for later ones.
 * NOTE(review): attribute values are written with a plain "%s", i.e.
 * without XML escaping.
 */
236 static void print_icu_xml_locales(const struct config_t *p_config)
240 UErrorCode status = U_ZERO_ERROR;
243 int32_t keyword_len = 0;
244 char keyword_str[128];
245 int32_t keyword_str_len = 0;
248 int32_t language_len = 0;
250 int32_t lang_str_len = 0;
253 int32_t script_len = 0;
254 char script_str[128];
255 int32_t script_str_len = 0;
258 int32_t location_len = 0;
259 char location_str[128];
260 int32_t location_str_len = 0;
263 int32_t variant_len = 0;
264 char variant_str[128];
265 int32_t variant_str_len = 0;
268 int32_t name_len = 0;
270 int32_t name_str_len = 0;
273 int32_t localname_len = 0;
274 char localname_str[128];
275 int32_t localname_str_len = 0;
277 count = uloc_countAvailable() ;
279 if (p_config->xmloutput)
281 fprintf(p_config->outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n",
282 count, uloc_getDefault(), ucol_countAvailable());
286 fprintf(p_config->outfile, "Available ICU locales: %d\n", count);
287 fprintf(p_config->outfile, "Default locale is: %s\n", uloc_getDefault());
/* per locale: fetch each display string, then convert it to UTF-8 */
290 for (i = 0; i < count; i++)
294 = uloc_getDisplayKeyword(uloc_getAvailable(i), "en",
298 u_strToUTF8(keyword_str, 128, &keyword_str_len,
299 keyword, keyword_len,
304 = uloc_getDisplayLanguage(uloc_getAvailable(i), "en",
308 u_strToUTF8(lang_str, 128, &lang_str_len,
309 language, language_len,
314 = uloc_getDisplayScript(uloc_getAvailable(i), "en",
318 u_strToUTF8(script_str, 128, &script_str_len,
323 = uloc_getDisplayCountry(uloc_getAvailable(i), "en",
327 u_strToUTF8(location_str, 128, &location_str_len,
328 location, location_len,
332 = uloc_getDisplayVariant(uloc_getAvailable(i), "en",
336 u_strToUTF8(variant_str, 128, &variant_str_len,
337 variant, variant_len,
341 = uloc_getDisplayName(uloc_getAvailable(i), "en",
345 u_strToUTF8(name_str, 128, &name_str_len,
350 = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i),
354 u_strToUTF8(localname_str, 128, &localname_str_len,
355 localname, localname_len,
/* XML record: an attribute is emitted only when its string is non-empty */
359 if (p_config->xmloutput)
361 fprintf(p_config->outfile, "<locale id=\"%s\"", uloc_getAvailable(i));
362 if (strlen(lang_str))
363 fprintf(p_config->outfile, " language=\"%s\"", lang_str);
364 if (strlen(script_str))
365 fprintf(p_config->outfile, " script=\"%s\"", script_str);
366 if (strlen(location_str))
367 fprintf(p_config->outfile, " location=\"%s\"", location_str);
368 if (strlen(variant_str))
369 fprintf(p_config->outfile, " variant=\"%s\"", variant_str);
370 if (strlen(name_str))
371 fprintf(p_config->outfile, " name=\"%s\"", name_str);
372 if (strlen(localname_str))
373 fprintf(p_config->outfile, " localname=\"%s\"", localname_str);
374 fprintf(p_config->outfile, ">");
375 if (strlen(localname_str))
376 fprintf(p_config->outfile, "%s", localname_str);
377 fprintf(p_config->outfile, "</locale>\n");
379 else if (1 == p_config->xmloutput)
381 fprintf(p_config->outfile, "%s", uloc_getAvailable(i));
382 fprintf(p_config->outfile, " | ");
383 if (strlen(name_str))
384 fprintf(p_config->outfile, "%s", name_str);
385 fprintf(p_config->outfile, " | ");
386 if (strlen(localname_str))
387 fprintf(p_config->outfile, "%s", localname_str);
388 fprintf(p_config->outfile, "\n");
391 fprintf(p_config->outfile, "%s\n", uloc_getAvailable(i));
393 if (p_config->xmloutput)
394 fprintf(p_config->outfile, "</locales>\n");
396 fprintf(p_config->outfile, "\n");
/* report whatever ICU error code is left in status after all locales */
398 if (U_FAILURE(status))
400 fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
/*
 * Print the ICU information selected by the first character of
 * p_config->print: 'c' for converters, 'l' for locales, 't' for
 * transliterators.  The three calls at the end print all of them for the
 * remaining case (the usage text lists 'a' as the fourth selector).
 *
 * In XML mode the listing is preceded by an XML declaration and followed
 * by a closing </icu> tag.
 */
406 static void print_info(const struct config_t *p_config)
408 if (p_config->xmloutput)
409 fprintf(p_config->outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
412 if ('c' == p_config->print[0])
413 print_icu_converters(p_config);
414 else if ('l' == p_config->print[0])
415 print_icu_xml_locales(p_config);
416 else if ('t' == p_config->print[0])
417 print_icu_transliterators(p_config);
419 print_icu_converters(p_config);
420 print_icu_xml_locales(p_config);
421 print_icu_transliterators(p_config);
424 if (p_config->xmloutput)
425 fprintf(p_config->outfile, "</icu>\n");
/*
 * Run the ICU chain described by p_config->conffile over every line read
 * from p_config->infile and write one record per token to
 * p_config->outfile.
 *
 * The configuration is parsed with xmlParseFile() and its root element is
 * handed to icu_chain_xml_config().  Each input line is assigned to the
 * chain with icu_chain_assign_cstr() and tokens are pulled with
 * icu_chain_next_token().  A token record carries the token and line
 * counters, the normalized and display forms, optionally the escaped sort
 * key (sortoutput) and, in text mode, the original position (org_output).
 * The chain is destroyed before returning.
 *
 * NOTE(review): the two "Could not ..." diagnostics go to stdout via
 * printf(), whereas the other error messages in this file use stderr.
 * NOTE(review): release of `doc` and of the `sw` buffer is not in the
 * visible lines - confirm both are freed.
 */
432 static void process_text_file(struct config_t *p_config)
437 xmlDoc *doc = xmlParseFile(p_config->conffile);
438 xmlNode *xml_node = xmlDocGetRootElement(doc);
440 long unsigned int token_count = 0;
441 long unsigned int line_count = 0;
443 UErrorCode status = U_ZERO_ERROR;
447 printf("Could not parse XML config file '%s' \n",
452 p_config->chain = icu_chain_xml_config(xml_node, 1, &status);
454 if (!p_config->chain || !U_SUCCESS(status))
456 printf("Could not set up ICU chain from config file '%s' \n",
461 if (p_config->xmloutput)
462 fprintf(p_config->outfile,
463 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
467 /* read input lines for processing */
/* NOTE(review): fgets() already reserves room for the terminating NUL, so
 * the "-1" only shortens the usable buffer by one byte. */
468 while ((line=fgets(linebuf, sizeof(linebuf)-1, p_config->infile)))
470 WRBUF sw = wrbuf_alloc();
471 WRBUF cdata = wrbuf_alloc();
472 int success = icu_chain_assign_cstr(p_config->chain, line, &status);
/* pull tokens until the chain reports no more (or the assign failed) */
475 while (success && icu_chain_next_token(p_config->chain, &status))
477 if (U_FAILURE(status))
482 const char *org_string = 0;
483 const char *sortkey = icu_chain_token_sortkey(p_config->chain);
485 icu_chain_get_org_info2(p_config->chain, &start, &len,
488 wrbuf_puts_escaped(sw, sortkey);
490 if (p_config->xmloutput)
492 fprintf(p_config->outfile,
493 "<token id=\"%lu\" line=\"%lu\"",
494 token_count, line_count);
/* cdata holds the XML-escaped form of each attribute value */
497 wrbuf_xmlputs(cdata, icu_chain_token_norm(p_config->chain));
498 fprintf(p_config->outfile, " norm=\"%s\"",
502 wrbuf_xmlputs(cdata, icu_chain_token_display(p_config->chain));
503 fprintf(p_config->outfile, " display=\"%s\"",
506 if (p_config->sortoutput)
509 wrbuf_xmlputs(cdata, wrbuf_cstr(sw));
510 fprintf(p_config->outfile, " sortkey=\"%s\"",
513 fprintf(p_config->outfile, "/>\n");
517 fprintf(p_config->outfile, "%lu %lu '%s' '%s'",
520 icu_chain_token_norm(p_config->chain),
521 icu_chain_token_display(p_config->chain));
522 if (p_config->sortoutput)
524 fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw));
/* echo the original string with the token span [start, start+len)
 * wrapped in '*' markers */
526 if (p_config->org_output)
528 fprintf(p_config->outfile, " %ld+%ld",
529 (long) start, (long) len);
530 fputc(' ', p_config->outfile);
531 fwrite(org_string, 1, start, p_config->outfile);
532 fputc('*', p_config->outfile);
533 fwrite(org_string + start, 1, len, p_config->outfile);
534 fputc('*', p_config->outfile);
535 fputs(org_string + start + len, p_config->outfile);
537 fprintf(p_config->outfile, "\n");
542 wrbuf_destroy(cdata);
545 if (p_config->xmloutput)
546 fprintf(p_config->outfile,
550 icu_chain_destroy(p_config->chain);
556 #endif /* YAZ_HAVE_ICU */
/*
 * Program entry point.
 *
 * With ICU support: enables the panic backtrace handler, parses the
 * command line into a local config_t, and runs process_text_file() when a
 * configuration file name was given.  A second test checks whether a print
 * selector was given; the statement it guards is outside the visible lines.
 *
 * Without ICU support (the #else branch): prints a notice explaining that
 * ICU is not available and how to get it.
 *
 * NOTE(review): if conffile and print are arrays inside config_t, the
 * `config.conffile &&` / `config.print &&` tests are always true and only
 * the strlen() part matters - confirm against the struct definition.
 */
559 int main(int argc, char **argv)
562 struct config_t config;
564 yaz_enable_panic_backtrace(*argv);
565 read_params(argc, argv, &config);
567 if (config.conffile && strlen(config.conffile))
568 process_text_file(&config);
570 if (config.print && strlen(config.print))
574 #else /* YAZ_HAVE_ICU */
576 printf("ICU not available on your system.\n"
577 "Please install libicu-dev and icu-doc or similar, "
578 "re-configure and re-compile\n");
582 #endif /* YAZ_HAVE_ICU */
591 * c-file-style: "Stroustrup"
592 * indent-tabs-mode: nil
594 * vim: shiftwidth=4 tabstop=8 expandtab