From 5d6bbb98e223ed34bd25f7c6f7ca1ac44e9c2684 Mon Sep 17 00:00:00 2001 From: Marc Cromme Date: Thu, 10 May 2007 10:29:58 +0000 Subject: [PATCH 1/1] fixed tokenization counting error, added more English tokenization unit tests --- src/test_icu_I18N.c | 63 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/src/test_icu_I18N.c b/src/test_icu_I18N.c index 992922a..71c4da1 100644 --- a/src/test_icu_I18N.c +++ b/src/test_icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: test_icu_I18N.c,v 1.11 2007-05-09 14:01:21 marc Exp $ +/* $Id: test_icu_I18N.c,v 1.12 2007-05-10 10:29:58 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -320,21 +320,17 @@ void test_icu_I18N_normmap(int argc, char **argv) } - -// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 - -void test_icu_I18N_tokenizer(int argc, char **argv) +int test_icu_tokenizer(const char * locale, char action, + const char * src8cstr, int count) { - - const char * src8cstr - = "Though I am not naturally honest, I am so sometimes by chance."; + int success = 1; UErrorCode status = U_ZERO_ERROR; struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0); struct icu_buf_utf16 * tkn16 = icu_buf_utf16_create(0); struct icu_buf_utf8 * tkn8 = icu_buf_utf8_create(0); - printf("Input: '%s'\n", src8cstr); + //printf("Input: '%s'\n", src8cstr); // transforming to UTF16 icu_utf16_from_utf8_cstr(src16, src8cstr, &status); @@ -342,7 +338,7 @@ void test_icu_I18N_tokenizer(int argc, char **argv) // set up tokenizer struct icu_tokenizer * tokenizer - = icu_tokenizer_create("en", 's', &status); + = icu_tokenizer_create(locale, action, &status); icu_check_status(status); YAZ_CHECK(tokenizer); @@ -352,29 +348,66 @@ void test_icu_I18N_tokenizer(int argc, char **argv) YAZ_CHECK(tokenizer->bi); // perform work on tokens - printf("Tokens: "); + //printf("Tokens: "); while(icu_tokenizer_next_token(tokenizer, tkn16, &status)){ icu_check_status(status); // 
converting to UTF8 icu_utf16_to_utf8(tkn8, tkn16, &status); - printf("'%s' ", tkn8->utf8); + //printf("(%d)'%s' ", icu_tokenizer_token_id(tokenizer), tkn8->utf8); //printf("token %d %d %d %d '%s'\n", - // icu_tokenizer_token_id(tokenizer), + // // icu_tokenizer_token_start(tokenizer), // icu_tokenizer_token_end(tokenizer), // icu_tokenizer_token_length(tokenizer), // tkn8->utf8); } - printf(" (%d)(%d)\n", icu_tokenizer_token_id(tokenizer), - icu_tokenizer_token_count(tokenizer)); + //printf("\nTokens: %d\n", icu_tokenizer_token_count(tokenizer)); + + + if (count != icu_tokenizer_token_count(tokenizer)){ + success = 0; + printf("\nTokenizer '%s:%c' Error: \n", locale, action); + printf("Input: '%s'\n", src8cstr); + printf("Tokens: %d", icu_tokenizer_token_count(tokenizer)); + printf(", expected: %d\n", count); + } icu_tokenizer_destroy(tokenizer); icu_buf_utf16_destroy(src16); icu_buf_utf16_destroy(tkn16); icu_buf_utf8_destroy(tkn8); + + return success; +} + + +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + +void test_icu_I18N_tokenizer(int argc, char **argv) +{ + + + const char * en_str + = "O Romeo, Romeo! wherefore art thou Romeo?"; + + YAZ_CHECK(test_icu_tokenizer("en", 's', en_str, 2)); + YAZ_CHECK(test_icu_tokenizer("en", 'l', en_str, 7)); + YAZ_CHECK(test_icu_tokenizer("en", 'w', en_str, 16)); + YAZ_CHECK(test_icu_tokenizer("en", 'c', en_str, 41)); + + + + const char * fr_str + = "O Romeo, Romeo! wherefore art thou Romeo?"; + + YAZ_CHECK(test_icu_tokenizer("fr", 's', fr_str, 2)); + YAZ_CHECK(test_icu_tokenizer("fr", 'l', fr_str, 7)); + YAZ_CHECK(test_icu_tokenizer("fr", 'w', fr_str, 16)); + YAZ_CHECK(test_icu_tokenizer("fr", 'c', fr_str, 41)); + } -- 1.7.10.4