1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) Index Data
3 * See the file LICENSE for details.
8 * \brief ICU tokenization - using ubrk_-functions from ICU
16 #include <yaz/xmalloc.h>
18 #include <yaz/icu_I18N.h>
27 #include <unicode/ustring.h> /* some more string fcns*/
28 #include <unicode/uchar.h> /* char names */
34 struct icu_buf_utf16 * buf16;
45 0 <= token_id <= token_count
/*
 * Put a tokenizer into its initial state: store the requested action,
 * give it a new UTF-16 buffer obtained from icu_buf_utf16_create(0), and
 * zero the token counter, the token id and the token start/end offsets.
 *
 * NOTE(review): buf16 is overwritten without releasing a previous value,
 * so this looks meant for freshly allocated tokenizers only (both callers
 * in this file invoke it right after xmalloc) - confirm before reusing it
 * on a live tokenizer.
 */
49 static void icu_tokenizer_reset(struct icu_tokenizer *tokenizer,
52 tokenizer->action = action;
54 tokenizer->buf16 = icu_buf_utf16_create(0);
55 tokenizer->token_count = 0;
56 tokenizer->token_id = 0;
57 tokenizer->token_start = 0;
58 tokenizer->token_end = 0;
/*
 * Build a new tokenizer from an existing one.
 *
 * A new struct is allocated with xmalloc, reset with the action of `old`,
 * and given a clone of old's break iterator. The ICU status produced by
 * the clone call is what the success check below tests.
 *
 * The new tokenizer starts from the reset state (empty buffer, zeroed
 * token positions); the statements here copy only the action and the
 * break iterator from `old`.
 *
 * NOTE(review): ubrk_safeClone is marked deprecated in recent ICU releases
 * in favour of ubrk_clone - confirm against the ICU versions this code
 * must build with.
 */
62 struct icu_tokenizer *icu_tokenizer_clone(struct icu_tokenizer *old)
64 int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
65 UErrorCode status = U_ZERO_ERROR;
66 struct icu_tokenizer * tokenizer
67 = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
70 icu_tokenizer_reset(tokenizer, old->action);
/* no caller-supplied stack buffer (NULL): ICU provides the clone's storage */
72 tokenizer->bi = ubrk_safeClone(old->bi, NULL, &bufferSize, &status);
73 if (U_SUCCESS(status))
/*
 * Allocate a tokenizer for `locale` and open the ICU break iterator that
 * matches `action`.
 *
 * The action selects one of five iterator types: line, sentence, word,
 * character or title. Any other action stores U_UNSUPPORTED_ERROR in
 * *status. The ICU outcome is reported through *status; when it is not a
 * success the partially built tokenizer is released again with
 * icu_tokenizer_destroy.
 *
 * NOTE(review): presumably returns the tokenizer on success and a null
 * pointer on failure - confirm against the return statements.
 */
78 struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
81 struct icu_tokenizer *tokenizer
82 = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
84 icu_tokenizer_reset(tokenizer, action);
85 switch (tokenizer->action)
/* every ubrk_open passes text 0 / length 0: no text is bound at creation,
   icu_tokenizer_attach supplies it later through ubrk_setText */
89 tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
93 tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status);
97 tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status);
101 tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status);
105 tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status);
108 *status = U_UNSUPPORTED_ERROR;
113 /* ICU error stuff is a very funny business */
114 if (U_SUCCESS(*status))
117 /* freeing if failed */
118 icu_tokenizer_destroy(tokenizer);
/*
 * Release the resources held by a tokenizer: its private UTF-16 buffer
 * (icu_buf_utf16_destroy) and its ICU break iterator (ubrk_close).
 *
 * NOTE(review): icu_tokenizer_create calls this on a tokenizer whose
 * break iterator may never have been opened (unsupported action), so this
 * function has to tolerate that state - confirm the guards around the
 * calls below.
 */
122 void icu_tokenizer_destroy(struct icu_tokenizer *tokenizer)
126 icu_buf_utf16_destroy(tokenizer->buf16);
128 ubrk_close(tokenizer->bi);
/*
 * Attach a new text to the tokenizer and restart tokenization.
 *
 * The call is rejected when the tokenizer, its break iterator or src16 is
 * null. Otherwise src16 is copied into the tokenizer's own buffer, the
 * token counter, id and start/end offsets are zeroed, and the copied text
 * is handed to the break iterator with ubrk_setText. ICU reports problems
 * through *status, which is checked after the call.
 *
 * NOTE(review): presumably returns 1 on success and 0 on rejection or ICU
 * failure - confirm against the return statements.
 */
133 int icu_tokenizer_attach(struct icu_tokenizer *tokenizer,
134 struct icu_buf_utf16 *src16,
137 if (!tokenizer || !tokenizer->bi || !src16)
/* work on a private copy so the iterator's text does not depend on what
   the caller does with src16 afterwards */
140 icu_buf_utf16_copy(tokenizer->buf16, src16);
142 tokenizer->token_count = 0;
143 tokenizer->token_id = 0;
144 tokenizer->token_start = 0;
145 tokenizer->token_end = 0;
147 ubrk_setText(tokenizer->bi,
148 tokenizer->buf16->utf16, tokenizer->buf16->utf16_len, status);
150 if (U_FAILURE(*status))
/*
 * Advance the tokenizer by one break-iterator step and report the token
 * found.
 *
 * The token spans from the previous break position (or ubrk_first on the
 * first call) to the position returned by ubrk_next; when ubrk_next
 * reports UBRK_DONE the end of the attached text is used instead. The
 * tokenizer's token_start/token_end are updated to that span, *len
 * receives its length in UTF-16 code units, and the token text is copied
 * into tkn16, which is grown when its capacity is too small.
 *
 * Nothing is tokenized when the tokenizer, its break iterator or its
 * buffer is missing, or when the attached text is empty.
 *
 * NOTE(review): a first call is detected by token_end == 0, so the logic
 * relies on icu_tokenizer_attach having zeroed the offsets.
 * NOTE(review): *start presumably receives the token's start offset, and
 * the failure paths presumably return 0 - confirm both.
 */
156 int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer,
157 struct icu_buf_utf16 *tkn16,
159 size_t *start, size_t *len)
161 int32_t tkn_start = 0;
165 if (!tokenizer || !tokenizer->bi
166 || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
169 never change tokenizer->buf16 and keep always invariant
170 0 <= tokenizer->token_start
171 <= tokenizer->token_end
172 <= tokenizer->buf16->utf16_len
173 returns length of token
176 if (0 == tokenizer->token_end) /* first call */
177 tkn_start = ubrk_first(tokenizer->bi);
178 else /* successive calls */
179 tkn_start = tokenizer->token_end;
181 /* get next position */
182 tkn_end = ubrk_next(tokenizer->bi);
184 /* repairing invariant at end of ubrk, which is UBRK_DONE = -1 */
185 if (UBRK_DONE == tkn_end)
186 tkn_end = tokenizer->buf16->utf16_len;
188 /* copy out if everything is well */
189 if (U_FAILURE(*status))
192 /* everything OK, now update internal state */
193 tkn_len = tkn_end - tkn_start;
197 tokenizer->token_count++;
198 tokenizer->token_id++;
201 tokenizer->token_id = 0;
203 tokenizer->token_start = tkn_start;
204 tokenizer->token_end = tkn_end;
207 *len = tkn_end - tkn_start;
209 /* copying into token buffer if it exists */
/* grow the destination to twice the token length when it cannot hold it */
212 if (tkn16->utf16_cap < tkn_len)
213 icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
215 u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
218 tkn16->utf16_len = tkn_len;
/*
 * Return the tokenizer's token counter. The counter is zeroed by
 * icu_tokenizer_attach and incremented by icu_tokenizer_next_token.
 * The pointer is dereferenced without a null check.
 */
224 int32_t icu_tokenizer_token_count(struct icu_tokenizer *tokenizer)
226 return tokenizer->token_count;
229 #endif /* YAZ_HAVE_ICU */
234 * c-file-style: "Stroustrup"
235 * indent-tabs-mode: nil
237 * vim: shiftwidth=4 tabstop=8 expandtab