ICU_chain_step_type_transliterate /* apply utf16 transliteration */
};
-#define USE_ITER 1
-
struct icu_chain_step
{
/* type and action object */
struct icu_tokenizer * tokenizer;
} u;
struct icu_chain_step * previous;
-#if USE_ITER
-#else
- /* temprary post-action utf16 buffer */
- struct icu_buf_utf16 * buf16;
- int more_tokens;
- int need_new_token;
-#endif
};
-
struct icu_chain
{
-#if USE_ITER
struct icu_iter *iter;
-#endif
-
char *locale;
int sort;
UCollator * coll;
-#if USE_ITER
-#else
- const char * src8cstr;
-
- /* number of tokens returned so far */
- int32_t token_count;
-#endif
-
/* utf8 output buffers */
struct icu_buf_utf8 * norm8;
-#if USE_ITER
-#else
- struct icu_buf_utf8 * display8;
- struct icu_buf_utf8 * sort8;
-
- /* utf16 source buffer */
- struct icu_buf_utf16 * src16;
-#endif
/* linked list of chain steps */
struct icu_chain_step * steps;
static struct icu_chain_step *icu_chain_step_create(
struct icu_chain * chain, enum icu_chain_step_type type,
const uint8_t * rule,
-#if USE_ITER
-#else
- struct icu_buf_utf16 * buf16,
-#endif
UErrorCode *status)
{
struct icu_chain_step * step = 0;
step = (struct icu_chain_step *) xmalloc(sizeof(struct icu_chain_step));
step->type = type;
-#if USE_ITER
-#else
- step->buf16 = buf16;
-#endif
/* create auxiliary objects */
switch (step->type)
{
break;
case ICU_chain_step_type_casemap:
icu_casemap_destroy(step->u.casemap);
-#if USE_ITER
-#else
- icu_buf_utf16_destroy(step->buf16);
-#endif
break;
case ICU_chain_step_type_transform:
case ICU_chain_step_type_transliterate:
icu_transform_destroy(step->u.transform);
-#if USE_ITER
-#else
- icu_buf_utf16_destroy(step->buf16);
-#endif
break;
case ICU_chain_step_type_tokenize:
icu_tokenizer_destroy(step->u.tokenizer);
-#if USE_ITER
-#else
- icu_buf_utf16_destroy(step->buf16);
-#endif
break;
default:
break;
*status = U_ZERO_ERROR;
-#if USE_ITER
chain->iter = 0;
-#endif
chain->locale = xstrdup(locale);
chain->sort = sort;
if (U_FAILURE(*status))
return 0;
-#if USE_ITER
-#else
- chain->token_count = 0;
- chain->src8cstr = 0;
-#endif
chain->norm8 = icu_buf_utf8_create(0);
-#if USE_ITER
-#else
- chain->display8 = icu_buf_utf8_create(0);
- chain->sort8 = icu_buf_utf8_create(0);
- chain->src16 = icu_buf_utf16_create(0);
-#endif
-
chain->steps = 0;
return chain;
ucol_close(chain->coll);
icu_buf_utf8_destroy(chain->norm8);
-#if USE_ITER
if (chain->iter)
icu_iter_destroy(chain->iter);
-#else
- icu_buf_utf8_destroy(chain->display8);
- icu_buf_utf8_destroy(chain->sort8);
- icu_buf_utf16_destroy(chain->src16);
-#endif
-
icu_chain_step_destroy(chain->steps);
xfree(chain->locale);
xfree(chain);
const uint8_t * rule, UErrorCode *status)
{
struct icu_chain_step * step = 0;
-#if USE_ITER
-#else
- struct icu_buf_utf16 * src16 = 0;
- struct icu_buf_utf16 * buf16 = 0;
-#endif
if (!chain || !type || !rule)
return 0;
-#if USE_ITER
-#else
- /* assign utf16 src buffers as needed */
- if (chain->steps && chain->steps->buf16)
- src16 = chain->steps->buf16;
- else if (chain->src16)
- src16 = chain->src16;
- else
- return 0;
-
- /* create utf16 destination buffers as needed, or */
- switch (type)
- {
- case ICU_chain_step_type_display:
- buf16 = src16;
- break;
- case ICU_chain_step_type_casemap:
- buf16 = icu_buf_utf16_create(0);
- break;
- case ICU_chain_step_type_transform:
- case ICU_chain_step_type_transliterate:
- buf16 = icu_buf_utf16_create(0);
- break;
- case ICU_chain_step_type_tokenize:
- buf16 = icu_buf_utf16_create(0);
- break;
- break;
- default:
- break;
- }
-#endif
/* create actual chain step with this buffer */
step = icu_chain_step_create(chain, type, rule,
-#if USE_ITER
-#else
- buf16,
-#endif
status);
step->previous = chain->steps;
return step;
}
-#if USE_ITER
-#else
-static int icu_chain_step_next_token(struct icu_chain * chain,
- struct icu_chain_step * step,
- UErrorCode *status)
-{
- struct icu_buf_utf16 * src16 = 0;
- int got_new_token = 0;
-
- if (!chain || !chain->src16 || !step || !step->more_tokens)
- return 0;
-
- /* assign utf16 src buffers as needed, advance in previous steps
- tokens until non-zero token met, and setting stop condition */
-
- if (step->previous)
- {
- src16 = step->previous->buf16;
- /* tokens might be killed in previous steps, therefore looping */
-
- while (step->need_new_token
- && step->previous->more_tokens
- && !got_new_token)
- got_new_token
- = icu_chain_step_next_token(chain, step->previous, status);
- }
- else
- { /* first step can only work once on chain->src16 input buffer */
- src16 = chain->src16;
- step->more_tokens = 0;
- got_new_token = 1;
- }
-
- if (!src16)
- return 0;
-
- /* stop if nothing to process */
- if (step->need_new_token && !got_new_token)
- {
- step->more_tokens = 0;
- return 0;
- }
-
- /* either an old token not finished yet, or a new token, thus
- perform the work, eventually put this steps output in
- step->buf16 or the chains UTF8 output buffers */
-
- switch (step->type)
- {
- case ICU_chain_step_type_display:
- icu_utf16_to_utf8(chain->display8, src16, status);
- break;
- case ICU_chain_step_type_casemap:
- icu_casemap_casemap(step->u.casemap,
- step->buf16, src16, status,
- chain->locale);
- break;
- case ICU_chain_step_type_transform:
- case ICU_chain_step_type_transliterate:
- icu_transform_trans(step->u.transform,
- step->buf16, src16, status);
- break;
- case ICU_chain_step_type_tokenize:
- /* attach to new src16 token only first time during splitting */
- if (step->need_new_token)
- {
- icu_tokenizer_attach(step->u.tokenizer, src16, status);
- step->need_new_token = 0;
- }
-
- /* splitting one src16 token into multiple buf16 tokens */
- step->more_tokens
- = icu_tokenizer_next_token(step->u.tokenizer,
- step->buf16, status);
-
- /* make sure to get new previous token if this one had been used up
- by recursive call to _same_ step */
-
- if (!step->more_tokens)
- {
- step->more_tokens = icu_chain_step_next_token(chain, step, status);
- return step->more_tokens; /* avoid one token count too much! */
- }
- break;
- default:
- return 0;
- break;
- }
-
- if (U_FAILURE(*status))
- return 0;
-
- /* if token disappered into thin air, tell caller */
- /* if (!step->buf16->utf16_len && !step->more_tokens) */
- /* return 0; */
-
- return 1;
-}
-#endif
-
struct icu_iter {
struct icu_chain *chain;
struct icu_buf_utf16 *last;
/**
 * Attach a new UTF-8 input string to the chain by replacing its iterator.
 *
 * Any iterator already stored in chain->iter is destroyed first, then a
 * fresh one is created from src8cstr and stored in its place.
 *
 * chain     chain to (re)initialise; a NULL chain is rejected
 * src8cstr  input string, handed to icu_iter_create() unchanged
 * status    not read or written on the surviving code path
 *
 * Returns 0 when chain is NULL, 1 otherwise.
 * NOTE(review): the result of icu_iter_create() is not checked here, so 1 is
 * returned even if it yields NULL -- confirm that is intended.
 */
int icu_chain_assign_cstr(struct icu_chain * chain, const char * src8cstr,
                          UErrorCode *status)
{
-#if USE_ITER
    /* the removed code path rejected a NULL chain; keep that contract
       instead of dereferencing it below */
    if (!chain)
        return 0;
    /* release the iterator left over from a previous assignment, if any */
    if (chain->iter)
        icu_iter_destroy(chain->iter);
    chain->iter = icu_iter_create(chain, src8cstr);
    return 1;
-#else
- struct icu_chain_step * stp = 0;
-
- if (!chain || !src8cstr)
- return 0;
-
- chain->src8cstr = src8cstr;
-
- stp = chain->steps;
-
- /* clear token count */
- chain->token_count = 0;
-
- /* clear all steps stop states */
- while (stp)
- {
- stp->more_tokens = 1;
- stp->need_new_token = 1;
- stp = stp->previous;
- }
-
- /* finally convert UTF8 to UTF16 string if needed */
- if (chain->steps || chain->sort)
- icu_utf16_from_utf8_cstr(chain->src16, chain->src8cstr, status);
-
- if (U_FAILURE(*status))
- return 0;
-
- return 1;
-#endif
}
/**
 * Advance the chain to its next token.
 *
 * *status is reset to U_ZERO_ERROR unconditionally, so status must not be
 * NULL. The work itself is delegated to icu_iter_next(), which receives
 * chain->iter and chain->norm8 (presumably the buffer the normalized token
 * is written to -- confirm against icu_iter_next).
 *
 * Returns 0 when chain is NULL or no iterator has been attached yet,
 * otherwise whatever icu_iter_next() returns.
 */
int icu_chain_next_token(struct icu_chain * chain, UErrorCode *status)
{
-#if USE_ITER
    *status = U_ZERO_ERROR;
    /* no chain, or no input assigned yet: report "no token" rather than
       dereferencing NULL; the other accessors guard chain->iter the same way */
    if (!chain || !chain->iter)
        return 0;
    return icu_iter_next(chain->iter, chain->norm8);
-#else
- int got_token = 0;
-
- *status = U_ZERO_ERROR;
-
- if (!chain)
- return 0;
-
- /* special case with no steps - same as index type binary */
- if (!chain->steps)
- {
- if (chain->token_count)
- return 0;
- else
- {
- chain->token_count++;
-
- if (chain->sort)
- icu_sortkey8_from_utf16(chain->coll,
- chain->sort8, chain->steps->buf16,
- status);
- return chain->token_count;
- }
- }
- /* usual case, one or more icu chain steps existing */
- else
- {
- while (!got_token && chain->steps && chain->steps->more_tokens)
- got_token = icu_chain_step_next_token(chain, chain->steps, status);
-
- if (got_token)
- {
- chain->token_count++;
-
- icu_utf16_to_utf8(chain->norm8, chain->steps->buf16, status);
-
- if (chain->sort)
- icu_sortkey8_from_utf16(chain->coll,
- chain->sort8, chain->steps->buf16,
- status);
- return chain->token_count;
- }
- }
-
- return 0;
-#endif
}
int icu_chain_token_number(struct icu_chain * chain)
{
-#if USE_ITER
if (chain && chain->iter)
return chain->iter->token_count;
return 0;
-#else
- if (!chain)
- return 0;
-
- return chain->token_count;
-#endif
}
/**
 * Fetch the display form of the current token.
 *
 * Returns the result of icu_iter_get_display() on the chain's iterator, or
 * 0 when chain is NULL or no iterator is attached.
 */
const char * icu_chain_token_display(struct icu_chain * chain)
{
-#if USE_ITER
    /* check chain itself before reading chain->iter, as
       icu_chain_token_number() does */
    if (chain && chain->iter)
        return icu_iter_get_display(chain->iter);
-#else
- if (chain->display8)
- return icu_buf_utf8_to_cstr(chain->display8);
-#endif
    return 0;
}
/**
 * Fetch the normalized form of the current token.
 *
 * Returns chain->norm8 converted with icu_buf_utf8_to_cstr(), or 0 when
 * chain is NULL or has no norm8 buffer.
 */
const char * icu_chain_token_norm(struct icu_chain * chain)
{
-#if USE_ITER
- if (chain->norm8)
- return icu_buf_utf8_to_cstr(chain->norm8);
-#else
- if (!chain->steps)
- return chain->src8cstr;
-
    /* a NULL chain yields "no token" instead of a NULL dereference */
    if (chain && chain->norm8)
        return icu_buf_utf8_to_cstr(chain->norm8);
-#endif
    return 0;
}
/**
 * Fetch the sort key of the current token.
 *
 * Returns the result of icu_iter_get_sortkey() on the chain's iterator, or
 * 0 when chain is NULL or no iterator is attached.
 */
const char * icu_chain_token_sortkey(struct icu_chain * chain)
{
-#if USE_ITER
    /* check chain itself before reading chain->iter, as
       icu_chain_token_number() does */
    if (chain && chain->iter)
        return icu_iter_get_sortkey(chain->iter);
-#else
- if (chain->sort8)
- return icu_buf_utf8_to_cstr(chain->sort8);
-#endif
    return 0;
}