1 /* $Id: icu_I18N.h,v 1.17 2007-05-25 13:27:21 marc Exp $
2 Copyright (c) 2006-2007, Index Data.
4 This file is part of Pazpar2.
6 Pazpar2 is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with Pazpar2; see the file LICENSE. If not, write to the
18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
29 #include <libxml/parser.h>
30 #include <libxml/tree.h>
32 #include <unicode/utypes.h> /* Basic ICU data types */
33 #include <unicode/uchar.h> /* char names */
35 //#include <unicode/ustdio.h>
36 #include <unicode/ucol.h>
37 //#include <unicode/ucnv.h> /* C Converter API */
38 //#include <unicode/ustring.h> /* some more string fcns*/
39 //#include <unicode/uloc.h>
40 #include <unicode/ubrk.h>
41 //#include <unicode/unistr.h>
42 #include <unicode/utrans.h>
46 // declared structs and functions
// Declaration only; the definition is not part of this header.
// Takes an ICU UErrorCode by value and returns an int.
// NOTE(review): the meaning of the returned int (which value signals an
// error) is not visible here -- confirm against the implementation.
48 int icu_check_status (UErrorCode status);
// Declares a function that takes a requested capacity and returns a pointer
// to struct icu_buf_utf16.
// NOTE(review): presumably allocates the buffer and pairs with
// icu_buf_utf16_destroy(); the unit of capacity (UChars vs. bytes) and the
// failure return value are not visible here -- confirm.
57 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity);
58 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
// Declares a function taking a destination and a source UTF-16 buffer and
// returning a pointer to struct icu_buf_utf16.
// NOTE(review): presumably copies the contents of src16 into dest16 and
// returns dest16; whether dest16 is grown to fit is not visible here --
// confirm against the implementation.
60 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
61 struct icu_buf_utf16 * src16);
// Declares the destroy function for struct icu_buf_utf16; returns nothing.
// NOTE(review): presumably releases a buffer obtained from
// icu_buf_utf16_create(); NULL handling is not visible here -- confirm.
62 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
// Declares a function that takes a requested capacity and returns a pointer
// to struct icu_buf_utf8.
// NOTE(review): presumably allocates the buffer and pairs with
// icu_buf_utf8_destroy(); the failure return value is not visible here --
// confirm.
73 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity);
74 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
// Declares the destroy function for struct icu_buf_utf8; returns nothing.
// NOTE(review): presumably releases a buffer obtained from
// icu_buf_utf8_create(); NULL handling is not visible here -- confirm.
76 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8);
79 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
80 struct icu_buf_utf8 * src8,
83 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
84 const char * src8cstr,
88 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
89 struct icu_buf_utf16 * src16,
98 struct icu_casemap * icu_casemap_create(const char *locale, char action,
// Declares the destroy function for struct icu_casemap; returns nothing.
// NOTE(review): presumably releases an object obtained from
// icu_casemap_create(); NULL handling is not visible here -- confirm.
101 void icu_casemap_destroy(struct icu_casemap * casemap);
103 int icu_casemap_casemap(struct icu_casemap * casemap,
104 struct icu_buf_utf16 * dest16,
105 struct icu_buf_utf16 * src16,
108 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
109 struct icu_buf_utf16 * src16,
110 const char *locale, char action,
// Declares a function that takes an ICU collator, a UTF-8 destination buffer,
// a UTF-16 source buffer and a UErrorCode out-pointer, and returns a
// UErrorCode.
// NOTE(review): presumably writes the collation sort key of src16 into dest8;
// how the returned UErrorCode relates to *status is not visible here --
// confirm against the implementation.
113 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
114 struct icu_buf_utf8 * dest8,
115 struct icu_buf_utf16 * src16,
116 UErrorCode * status);
123 struct icu_buf_utf16 * buf16;
128 // invariants that must always hold:
131 // <= buf16->utf16_len
133 // 0 <= token_id <= token_count
136 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
// Declares the destroy function for struct icu_tokenizer; returns nothing.
// NOTE(review): presumably releases an object obtained from
// icu_tokenizer_create(); NULL handling is not visible here -- confirm.
139 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer);
// Declares a function that takes a tokenizer, a UTF-16 source buffer and a
// UErrorCode out-pointer, and returns an int.
// NOTE(review): presumably sets src16 as the text the tokenizer iterates
// over; whether src16 is copied or merely referenced, and the meaning of the
// returned int, are not visible here -- confirm.
141 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
142 struct icu_buf_utf16 * src16, UErrorCode *status);
144 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
145 struct icu_buf_utf16 * tkn16,
// Accessor declarations: each takes a tokenizer pointer and returns an
// int32_t.
// NOTE(review): presumably these report the current token's id, start
// offset, end offset and length, and the number of tokens seen so far; the
// unit of the offsets (UTF-16 code units?) and the behaviour for a NULL
// tokenizer are not visible here -- confirm.
148 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer);
149 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer);
150 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer);
151 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer);
152 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
156 struct icu_normalizer
159 struct icu_buf_utf16 * rules16;
160 UParseError parse_error[256];
161 UTransliterator * trans;
164 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
// Declares the destroy function for struct icu_normalizer; returns nothing.
// NOTE(review): presumably releases an object obtained from
// icu_normalizer_create(), including its rules16 buffer and transliterator;
// not visible here -- confirm.
168 void icu_normalizer_destroy(struct icu_normalizer * normalizer);
170 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
171 struct icu_buf_utf16 * dest16,
172 struct icu_buf_utf16 * src16,
187 enum icu_chain_step_type {
188 ICU_chain_step_type_none, //
189 ICU_chain_step_type_display, // convert to utf8 display format
190 ICU_chain_step_type_index, // convert to utf8 index format
191 ICU_chain_step_type_sortkey, // convert to utf8 sortkey format
192 ICU_chain_step_type_casemap, // apply utf16 charmap
193 ICU_chain_step_type_normalize, // apply utf16 normalization
194 ICU_chain_step_type_tokenize // apply utf16 tokenization
199 struct icu_chain_step
201 // type and action object
202 enum icu_chain_step_type type;
204 struct icu_casemap * casemap;
205 struct icu_normalizer * normalizer;
206 struct icu_tokenizer * tokenizer;
208 // temporary post-action utf16 buffer
209 struct icu_buf_utf16 * buf16;
210 struct icu_chain_step * previous;
218 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
219 enum icu_chain_step_type type,
220 const uint8_t * rule,
221 struct icu_buf_utf16 * buf16,
// Declares the destroy function for struct icu_chain_step; returns nothing.
// NOTE(review): presumably releases a step obtained from
// icu_chain_step_create(); whether linked steps reachable through
// `previous` are released as well is not visible here -- confirm.
225 void icu_chain_step_destroy(struct icu_chain_step * step);
230 uint8_t identifier[128];
233 // number of tokens returned so far
236 // utf8 output buffers
237 struct icu_buf_utf8 * display8;
238 struct icu_buf_utf8 * norm8;
239 struct icu_buf_utf8 * sort8;
241 // utf16 source buffer
242 struct icu_buf_utf16 * src16;
244 // linked list of chain steps
245 struct icu_chain_step * steps;
// Declares a function that takes an identifier and a locale, both as
// const uint8_t pointers, and returns a pointer to struct icu_chain.
// NOTE(review): presumably allocates a new chain that pairs with
// icu_chain_destroy(); whether the strings are copied and what is returned
// on failure are not visible here -- confirm.
248 struct icu_chain * icu_chain_create(const uint8_t * identifier,
249 const uint8_t * locale);
// Declares the destroy function for struct icu_chain; returns nothing.
// NOTE(review): presumably releases a chain obtained from icu_chain_create()
// or icu_chain_xml_config(), together with its buffers and steps; not
// visible here -- confirm.
251 void icu_chain_destroy(struct icu_chain * chain);
// Declares a function that takes a libxml2 xmlNode pointer and a UErrorCode
// out-pointer, and returns a pointer to struct icu_chain.
// NOTE(review): presumably builds a chain from the XML configuration rooted
// at xml_node; the expected element/attribute names and the return value on
// error are not visible here -- confirm against the implementation.
253 struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
254 UErrorCode * status);
257 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
258 enum icu_chain_step_type type,
259 const uint8_t * rule,
263 int icu_chain_step_next_token(struct icu_chain * chain,
264 struct icu_chain_step * step,
267 int icu_chain_assign_cstr(struct icu_chain * chain,
268 const char * src8cstr,
271 int icu_chain_next_token(struct icu_chain * chain,
// Declares an accessor that takes a chain pointer and returns an int.
// NOTE(review): presumably returns the number of tokens the chain has
// produced so far; not visible here -- confirm.
274 int icu_chain_get_token_count(struct icu_chain * chain);
// Accessor declarations: each takes a chain pointer and returns a
// const char pointer.
// NOTE(review): presumably these expose the chain's display8, norm8 and
// sort8 UTF-8 output buffers for the current token; the lifetime of the
// returned strings and the NULL-return conditions are not visible here --
// confirm before storing the pointers.
276 const char * icu_chain_get_display(struct icu_chain * chain);
278 const char * icu_chain_get_norm(struct icu_chain * chain);
280 const char * icu_chain_get_sort(struct icu_chain * chain);
287 #endif // ICU_I18NL_H