From: Marc Cromme Date: Thu, 10 May 2007 11:53:47 +0000 (+0000) Subject: danish tokenization unit test added, counting error in tokenizer corrected X-Git-Tag: PAZPAR2.1.0.0~161 X-Git-Url: http://lists.indexdata.dk/?a=commitdiff_plain;h=2a2cceb0749876c6a9f41a69376efed77dbe31af;p=pazpar2-moved-to-github.git danish tokenization unit test added, counting error in tokenizer corrected --- diff --git a/src/icu_I18N.c b/src/icu_I18N.c index fa9bd82..af0ba0e 100644 --- a/src/icu_I18N.c +++ b/src/icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: icu_I18N.c,v 1.8 2007-05-09 14:01:21 marc Exp $ +/* $Id: icu_I18N.c,v 1.9 2007-05-10 11:53:47 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -418,6 +418,7 @@ struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action, tokenizer->action = action; tokenizer->bi = 0; tokenizer->buf16 = 0; + tokenizer->token_count = 0; tokenizer->token_id = 0; tokenizer->token_start = 0; tokenizer->token_end = 0; @@ -486,7 +487,12 @@ int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, if (!tokenizer || !tokenizer->bi || !src16) return 0; + tokenizer->buf16 = src16; + tokenizer->token_count = 0; + tokenizer->token_id = 0; + tokenizer->token_start = 0; + tokenizer->token_end = 0; ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status); @@ -503,6 +509,7 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, { int32_t tkn_start = 0; int32_t tkn_end = 0; + int32_t tkn_len = 0; if (!tokenizer || !tokenizer->bi @@ -530,23 +537,32 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, // copy out if everything is well if(U_FAILURE(*status)) return 0; - - tokenizer->token_id++; + + // everything OK, now update internal state + tkn_len = tkn_end - tkn_start; + + if (0 < tkn_len){ + tokenizer->token_count++; + tokenizer->token_id++; + } else { + tokenizer->token_id = 0; + } tokenizer->token_start = tkn_start; tokenizer->token_end = tkn_end; + // copying into token buffer if it exists if (tkn16){ - if (tkn16->utf16_cap < (tkn_end - tkn_start)) - icu_buf_utf16_resize(tkn16, (size_t) (tkn_end - tkn_start) * 2); + if (tkn16->utf16_cap < tkn_len) + icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2); u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start], - (tkn_end - tkn_start)); + tkn_len); - tkn16->utf16_len = (tkn_end - tkn_start); + tkn16->utf16_len = tkn_len; } - return (tokenizer->token_end - tokenizer->token_start); + return tkn_len; } diff --git a/src/test_icu_I18N.c b/src/test_icu_I18N.c index 71c4da1..151ae09 100644 --- a/src/test_icu_I18N.c +++ b/src/test_icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: test_icu_I18N.c,v 1.12 2007-05-10 10:29:58 marc Exp $ +/* $Id: test_icu_I18N.c,v 1.13 2007-05-10 11:53:47 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -400,13 +400,14 @@ void test_icu_I18N_tokenizer(int argc, char **argv) - const char * fr_str - = "O Romeo, Romeo! wherefore art thou Romeo?"; + const char * da_str + = "Blåbærtærte. Denne kage stammer fra Finland. " + "Den er med blåbær, men alle sommerens forskellige bær kan bruges."; - YAZ_CHECK(test_icu_tokenizer("fr", 's', fr_str, 2)); - YAZ_CHECK(test_icu_tokenizer("fr", 'l', fr_str, 7)); - YAZ_CHECK(test_icu_tokenizer("fr", 'w', fr_str, 16)); - YAZ_CHECK(test_icu_tokenizer("fr", 'c', fr_str, 41)); + YAZ_CHECK(test_icu_tokenizer("da", 's', da_str, 3)); + YAZ_CHECK(test_icu_tokenizer("dar", 'l', da_str, 17)); + YAZ_CHECK(test_icu_tokenizer("da", 'w', da_str, 37)); + YAZ_CHECK(test_icu_tokenizer("da", 'c', da_str, 110)); }