int icu_transform_trans(struct icu_transform * transform,
struct icu_buf_utf16 * dest16,
- struct icu_buf_utf16 * src16,
+ const struct icu_buf_utf16 * src16,
UErrorCode *status);
struct icu_chain_step;
UErrorCode * status);
+/* Opaque iterator that hands out, one at a time, the results of running
+   an icu_chain over a string. */
+struct icu_iter;
+/* Creates an iterator that runs chain over the UTF-8 C string src8cstr.
+   Returns 0 when src8cstr is NULL. The iterator keeps a pointer to chain
+   and does not free it; release the iterator with icu_iter_destroy. */
+struct icu_iter *icu_iter_create(struct icu_chain *chain,
+ const char *src8cstr);
+/* Releases an iterator; a NULL iter is ignored. */
+void icu_iter_destroy(struct icu_iter *iter);
+/* Converts the next result to UTF-8 into result and returns 1, or returns 0
+   without touching result when nothing is left. */
+int icu_iter_next(struct icu_iter *iter, struct icu_buf_utf8 *result);
+/* Returns the iterator's sort key buffer as a C string. */
+const char *icu_iter_get_sortkey(struct icu_iter *iter);
+/* Returns the iterator's display buffer as a C string. */
+const char *icu_iter_get_display(struct icu_iter *iter);
+
#endif /* ICU_I18NL_H */
/*
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
+#include <assert.h>
#include <unicode/ustring.h> /* some more string fcns*/
#include <unicode/uchar.h> /* char names */
return 1;
}
+/* State of one iteration over the output of an icu_chain. */
+struct icu_iter {
+ /* chain supplying the steps, locale, sort flag and collator; it is not
+    freed by icu_iter_destroy */
+ struct icu_chain *chain;
+ /* look-ahead result that the next icu_iter_next call hands out;
+    0 once the chain produces nothing more */
+ struct icu_buf_utf16 *next;
+ /* ICU status passed to every step; reset to U_ZERO_ERROR before each
+    tokenizer fetch */
+ UErrorCode status;
+ /* UTF-8 text written by display steps */
+ struct icu_buf_utf8 *display;
+ /* sort key written by icu_iter_next when chain->sort is set */
+ struct icu_buf_utf8 *sort8;
+};
+
+/* Debug helper: converts src16 to UTF-8 and prints it to stdout as
+   "utf8:<text>", or prints "utf8:failure" when the conversion reports an
+   ICU error. The temporary UTF-8 buffer is released before returning. */
+static void utf16_print(struct icu_buf_utf16 *src16)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    struct icu_buf_utf8 *dst8 = icu_buf_utf8_create(0);
+
+    icu_utf16_to_utf8(dst8, src16, &status);
+    assert(status != 1234);
+
+    if (!U_FAILURE(status))
+    {
+        const char *text = icu_buf_utf8_to_cstr(dst8);
+        printf("utf8:%s\n", text);
+    }
+    else
+    {
+        printf("utf8:failure\n");
+    }
+
+    icu_buf_utf8_destroy(dst8);
+}
+
+/* Runs one chain step on behalf of an iterator.
+
+   The step reached through step->previous is run first (recursively), and
+   its output becomes this step's input. A NULL step ends the recursion and
+   yields src unchanged.
+
+   iter  iterator providing the shared status, display buffer and chain
+   step  step to run, or NULL
+   src   input buffer, or 0 to ask for the next pending token
+
+   Returns the buffer produced by this step, or 0 when there is no output.
+   Casemap, tokenize and transform steps destroy their input buffer and
+   return a newly created one; a display step returns its input unchanged
+   after copying it, as UTF-8, into iter->display.
+
+   The tokenizer is stored in the step itself, so its position is kept
+   between calls: with no input it simply continues with the text attached
+   earlier. */
+struct icu_buf_utf16 *icu_iter_invoke(struct icu_iter *iter,
+                                      struct icu_chain_step *step,
+                                      struct icu_buf_utf16 *src)
+{
+    struct icu_buf_utf16 *upstream;
+    struct icu_buf_utf16 *result;
+
+    if (!step)
+        return src;
+
+    upstream = icu_iter_invoke(iter, step->previous, src);
+    result = upstream;
+
+    switch (step->type)
+    {
+    case ICU_chain_step_type_casemap:
+        if (upstream)
+        {
+            result = icu_buf_utf16_create(0);
+            icu_casemap_casemap(step->u.casemap, result, upstream,
+                                &iter->status, iter->chain->locale);
+            icu_buf_utf16_destroy(upstream);
+        }
+        break;
+    case ICU_chain_step_type_tokenize:
+        /* new input: hand it to the tokenizer, then drop our copy */
+        if (upstream)
+        {
+            icu_tokenizer_attach(step->u.tokenizer, upstream, &iter->status);
+            icu_buf_utf16_destroy(upstream);
+        }
+        /* fetch the next token, whether or not new input arrived */
+        result = icu_buf_utf16_create(0);
+        iter->status = U_ZERO_ERROR;
+        if (!icu_tokenizer_next_token(step->u.tokenizer, result,
+                                      &iter->status))
+        {
+            icu_buf_utf16_destroy(result);
+            result = 0;
+        }
+        break;
+    case ICU_chain_step_type_transform:
+    case ICU_chain_step_type_transliterate:
+        if (upstream)
+        {
+            result = icu_buf_utf16_create(0);
+            icu_transform_trans(step->u.transform, result, upstream,
+                                &iter->status);
+            icu_buf_utf16_destroy(upstream);
+        }
+        break;
+    case ICU_chain_step_type_display:
+        if (upstream)
+            icu_utf16_to_utf8(iter->display, upstream, &iter->status);
+        break;
+    default:
+        assert(0);
+    }
+    return result;
+}
+
+/* Creates an iterator that runs chain over the UTF-8 C string src8cstr.
+
+   The input is converted to UTF-16 and the chain is run once immediately,
+   so the first result is already pending when this function returns.
+
+   chain     chain to run; the iterator keeps the pointer and never frees it
+   src8cstr  NUL-terminated UTF-8 input
+
+   Returns the new iterator, or 0 when chain or src8cstr is NULL. Release
+   it with icu_iter_destroy. */
+struct icu_iter *icu_iter_create(struct icu_chain *chain,
+                                 const char *src8cstr)
+{
+    struct icu_buf_utf16 *src16;
+    struct icu_iter *iter;
+
+    /* chain->steps is dereferenced below, so a NULL chain must be rejected
+       just like a NULL input string */
+    if (!chain || !src8cstr)
+        return 0;
+
+    src16 = icu_buf_utf16_create(0);
+    iter = xmalloc(sizeof(*iter));
+    iter->chain = chain;
+    iter->status = U_ZERO_ERROR;
+    iter->display = icu_buf_utf8_create(0);
+    iter->sort8 = icu_buf_utf8_create(0);
+
+    icu_utf16_from_utf8_cstr(src16, src8cstr, &iter->status);
+    /* ownership of src16 passes to the chain run: it is either destroyed
+       by a step or returned and stored as the pending result */
+    iter->next = icu_iter_invoke(iter, chain->steps, src16);
+    return iter;
+}
+
+/* Releases an iterator and the buffers it owns. A NULL iter is ignored.
+   The chain the iterator was created with is not freed. */
+void icu_iter_destroy(struct icu_iter *iter)
+{
+    if (!iter)
+        return;
+
+    /* A look-ahead result is still pending whenever the caller stops
+       before icu_iter_next has returned 0; free it so it does not leak. */
+    if (iter->next)
+        icu_buf_utf16_destroy(iter->next);
+    icu_buf_utf8_destroy(iter->display);
+    icu_buf_utf8_destroy(iter->sort8);
+    xfree(iter);
+}
+
+/* Hands out the pending result of the iterator.
+
+   iter    iterator created by icu_iter_create
+   result  receives the pending result converted to UTF-8
+
+   Returns 1 after filling result, or 0 (result untouched) when nothing is
+   pending. When chain->sort is set, the sort key of the returned result is
+   written to iter->sort8 first. The chain is then run again to prefetch
+   the following result.
+
+   NOTE(review): when a display step follows the tokenizer, the prefetch
+   rewrites iter->display for the following result before the caller can
+   read it through icu_iter_get_display - confirm this is intended. */
+int icu_iter_next(struct icu_iter *iter, struct icu_buf_utf8 *result)
+{
+    struct icu_buf_utf16 *current = iter->next;
+
+    if (!current)
+        return 0;
+
+    if (iter->chain->sort)
+        icu_sortkey8_from_utf16(iter->chain->coll, iter->sort8, current,
+                                &iter->status);
+
+    icu_utf16_to_utf8(result, current, &iter->status);
+
+    /* prefetch with no new input: the tokenizer continues where it was */
+    iter->next = icu_iter_invoke(iter, iter->chain->steps, 0);
+    icu_buf_utf16_destroy(current);
+    return 1;
+}
+
+/* Returns the iterator's sort key buffer as a C string. The buffer is
+   written by icu_iter_next only when the chain has sorting enabled. */
+const char *icu_iter_get_sortkey(struct icu_iter *iter)
+{
+    struct icu_buf_utf8 *key = iter->sort8;
+
+    return icu_buf_utf8_to_cstr(key);
+}
+
+/* Returns the iterator's display buffer as a C string. The buffer is
+   written whenever a display step of the chain receives input. */
+const char *icu_iter_get_display(struct icu_iter *iter)
+{
+    struct icu_buf_utf8 *shown = iter->display;
+
+    return icu_buf_utf8_to_cstr(shown);
+}
+
int icu_chain_assign_cstr(struct icu_chain * chain, const char * src8cstr,
UErrorCode *status)
{
tokenizer->action = action;
tokenizer->bi = 0;
- tokenizer->buf16 = 0;
+ tokenizer->buf16 = icu_buf_utf16_create(0);
tokenizer->token_count = 0;
tokenizer->token_id = 0;
tokenizer->token_start = 0;
{
if (tokenizer)
{
+ icu_buf_utf16_destroy(tokenizer->buf16);
if (tokenizer->bi)
ubrk_close(tokenizer->bi);
xfree(tokenizer);
if (!tokenizer || !tokenizer->bi || !src16)
return 0;
- tokenizer->buf16 = src16;
+ icu_buf_utf16_copy(tokenizer->buf16, src16);
+
tokenizer->token_count = 0;
tokenizer->token_id = 0;
tokenizer->token_start = 0;
tokenizer->token_end = 0;
- ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
+ ubrk_setText(tokenizer->bi,
+ tokenizer->buf16->utf16, tokenizer->buf16->utf16_len, status);
if (U_FAILURE(*status))
return 0;
int icu_transform_trans(struct icu_transform * transform,
struct icu_buf_utf16 * dest16,
- struct icu_buf_utf16 * src16,
+ const struct icu_buf_utf16 * src16,
UErrorCode *status)
{
if (!transform || !transform->trans
utrans_transUChars (transform->trans,
dest16->utf16, &(dest16->utf16_len),
dest16->utf16_cap,
- 0, &(src16->utf16_len), status);
+ 0, &(dest16->utf16_len), status);
if (U_FAILURE(*status))
icu_buf_utf16_clear(dest16);
icu_chain_destroy(chain);
}
+/* Iterator smoke test: builds a chain (word tokenizer followed by a
+   transform removing white space and punctuation) from XML, iterates over
+   a sample string and prints each result as "[token]".
+   Every failure path releases what has been allocated so far. */
+void test_icu_iter1(void)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    struct icu_chain * chain = 0;
+    xmlNode *xml_node;
+    struct icu_iter *iter;
+    struct icu_buf_utf8 *token;
+
+    const char * xml_str = "<icu locale=\"en\">"
+        "<tokenize rule=\"w\"/>"
+        "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
+        "</icu>";
+
+    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
+    YAZ_CHECK(doc);
+    if (!doc)
+        return;
+    xml_node = xmlDocGetRootElement(doc);
+    YAZ_CHECK(xml_node);
+    if (!xml_node)
+    {
+        xmlFreeDoc(doc);
+        return;
+    }
+
+    chain = icu_chain_xml_config(xml_node, 0, &status);
+
+    xmlFreeDoc(doc);
+    YAZ_CHECK(chain);
+    /* icu_iter_create reads chain->steps, so stop here without a chain */
+    if (!chain)
+        return;
+
+    iter = icu_iter_create(chain, "a string with 15 tokens and 8 displays");
+    YAZ_CHECK(iter);
+    if (!iter)
+    {
+        icu_chain_destroy(chain);
+        return;
+    }
+#if 1
+    token = icu_buf_utf8_create(0);
+    while (icu_iter_next(iter, token))
+    {
+        printf("[%.*s]", (int) token->utf8_len, token->utf8);
+    }
+    icu_buf_utf8_destroy(token);
+#endif
+
+    icu_iter_destroy(iter);
+    icu_chain_destroy(chain);
+}
+
+
+/* Iterator test with a longer chain: control-character removal, line and
+   word tokenizers, white space/punctuation removal, a display step and
+   lower-casing. Iterates over a sample string and prints each result as
+   "[token]". Every failure path releases what has been allocated so far. */
+void test_icu_iter2(void)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    struct icu_chain * chain = 0;
+    xmlNode *xml_node;
+    struct icu_iter *iter;
+    struct icu_buf_utf8 *token;
+
+    const char * xml_str = "<icu locale=\"en\">"
+        "<transform rule=\"[:Control:] Any-Remove\"/>"
+        "<tokenize rule=\"l\"/>"
+        "<tokenize rule=\"w\"/>"
+        "<transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
+        "<display/>"
+        "<casemap rule=\"l\"/>"
+        "</icu>";
+
+    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
+    YAZ_CHECK(doc);
+    if (!doc)
+        return;
+    xml_node = xmlDocGetRootElement(doc);
+    YAZ_CHECK(xml_node);
+    if (!xml_node)
+    {
+        xmlFreeDoc(doc);
+        return;
+    }
+
+    chain = icu_chain_xml_config(xml_node, 0, &status);
+
+    xmlFreeDoc(doc);
+    YAZ_CHECK(chain);
+    /* icu_iter_create reads chain->steps, so stop here without a chain */
+    if (!chain)
+        return;
+
+    iter = icu_iter_create(chain, "Adobe Acrobat Reader, 1991-1999.");
+    YAZ_CHECK(iter);
+    if (!iter)
+    {
+        icu_chain_destroy(chain);
+        return;
+    }
+    token = icu_buf_utf8_create(0);
+    while (icu_iter_next(iter, token))
+    {
+        printf("[%.*s]", (int) token->utf8_len, token->utf8);
+    }
+    icu_buf_utf8_destroy(token);
+
+    icu_iter_destroy(iter);
+    icu_chain_destroy(chain);
+}
+
#endif /* YAZ_HAVE_ICU */
int main(int argc, char **argv)
test_icu_I18N_chain(argc, argv);
test_chain_empty_token();
test_chain_empty_chain();
+ test_icu_iter1();
+ test_icu_iter2();
+
test_bug_1140();
#else /* YAZ_HAVE_ICU */