Kouhei Sutou 2018-11-05 18:36:05 +0900 (Mon, 05 Nov 2018) Revision: f271a882824af64e03d0fbcd9ea0568ef8cda806 https://github.com/groonga/groonga/commit/f271a882824af64e03d0fbcd9ea0568ef8cda806 Message: NormalizerNFKC100: add unify_to_romaji option Added files: test/command/suite/normalizers/nfkc100/unify_to_romaji.expected test/command/suite/normalizers/nfkc100/unify_to_romaji.test Modified files: lib/grn_nfkc.h lib/nfkc.c lib/normalizer.c Modified: lib/grn_nfkc.h (+1 -0) =================================================================== --- lib/grn_nfkc.h 2018-11-05 17:13:19 +0900 (f4af36985) +++ lib/grn_nfkc.h 2018-11-05 18:36:05 +0900 (b4001a82a) @@ -45,6 +45,7 @@ typedef struct { grn_bool unify_middle_dot; grn_bool unify_katakana_v_sounds; grn_bool unify_katakana_bu_sound; + grn_bool unify_to_romaji; } grn_nfkc_normalize_options; const char *grn_nfkc_decompose(const unsigned char *utf8); Modified: lib/nfkc.c (+7 -0) =================================================================== --- lib/nfkc.c 2018-11-05 17:13:19 +0900 (29c56082a) +++ lib/nfkc.c 2018-11-05 18:36:05 +0900 (fff5a3bb8) @@ -63,6 +63,7 @@ grn_nfkc_normalize_options_init(grn_ctx *ctx, options->unify_middle_dot = GRN_FALSE; options->unify_katakana_v_sounds = GRN_FALSE; options->unify_katakana_bu_sound = GRN_FALSE; + options->unify_to_romaji = GRN_FALSE; } void @@ -154,6 +155,12 @@ grn_nfkc_normalize_options_apply(grn_ctx *ctx, raw_options, i, options->unify_katakana_bu_sound); + } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_to_romaji")) { + options->unify_to_romaji = + grn_vector_get_element_bool(ctx, + raw_options, + i, + options->unify_to_romaji); } } GRN_OPTION_VALUES_EACH_END(); Modified: lib/normalizer.c (+297 -25) =================================================================== --- lib/normalizer.c 2018-11-05 17:13:19 +0900 (9ff2b7306) +++ lib/normalizer.c 2018-11-05 18:36:05 +0900 (d1428a181) @@ -616,6 +616,7 @@ grn_nfkc_normalize_context_init(grn_ctx *ctx, "[normalize][nfkc] failed to allocate checks space"); return; } + context->checks[0] = 0; } context->c = context->checks; @@ -1184,6 +1185,227 @@ grn_nfkc_normalize_unify_katakana_bu_sound(const unsigned char *utf8_char, return GRN_FALSE; } +grn_inline static grn_bool +grn_nfkc_normalize_unify_to_romaji(grn_ctx *ctx, + const unsigned char *unifying, + grn_char_type char_type, + char *romaji) +{ + static char aiueo[] = "aiueo"; + static char auo[] = "auo"; + static char aaieo[] = "aaieo"; + size_t n_romajis = 0; + + if (!(char_type == GRN_CHAR_HIRAGANA || + char_type == GRN_CHAR_KATAKANA)) { + return n_romajis; + } + + switch (unifying[0]) { + case 0xe3 : + switch (unifying[1]) { + case 0x81 : + if (0x81 <= unifying[2] && unifying[2] <= 0x8a) { + /* U+3042 HIRAGANA LETTER SMALL A .. + * U+304A HIRAGANA LETTER O */ + if ((unifying[2] % 2) == 1) { /* SMALL */ + romaji[n_romajis++] = 'x'; + } + romaji[n_romajis++] = aiueo[(unifying[2] - 0x81) / 2]; + } else if (0x8b <= unifying[2] && unifying[2] <= 0x94) { + /* U+304B HIRAGANA LETTER KA .. + * U+3054 HIRAGANA LETTER GO */ + const char *gk = "gk"; + romaji[n_romajis++] = gk[unifying[2] % 2]; + romaji[n_romajis++] = aiueo[(unifying[2] - 0x8b) / 2]; + } else if (0x95 <= unifying[2] && unifying[2] <= 0x9e) { + /* U+3055 HIRAGANA LETTER SA .. + * U+305E HIRAGANA LETTER ZO */ + const char *zs = "zs"; + romaji[n_romajis++] = zs[unifying[2] % 2]; + romaji[n_romajis++] = aiueo[(unifying[2] - 0x95) / 2]; + } else if (0x9f <= unifying[2] && unifying[2] <= 0xa9) { + /* U+305F HIRAGANA LETTER TA .. + * U+3069 HIRAGANA LETTER DO */ + const char *tdtdttdtdtd = "tdtdttdtdtd"; + const char *aaiiuuueeoo = "aaiiuuueeoo"; + if (unifying[2] == 0xa3) { /* SMALL */ + romaji[n_romajis++] = 'x'; + } + romaji[n_romajis++] = tdtdttdtdtd[unifying[2] - 0x9f]; + romaji[n_romajis++] = aaiiuuueeoo[unifying[2] - 0x9f]; + } else if (0xaa <= unifying[2] && unifying[2] <= 0xae) { + /* U+306A HIRAGANA LETTER NA .. + * U+306E HIRAGANA LETTER NO */ + romaji[n_romajis++] = 'n'; + romaji[n_romajis++] = aiueo[(unifying[2] - 0xaa)]; + } else if (0xaf <= unifying[2] && unifying[2] <= 0xbd) { + /* U+306F HIRAGANA LETTER HA .. + * U+307D HIRAGANA LETTER PO */ + const char *phb = "phb"; + romaji[n_romajis++] = phb[unifying[2] % 3]; + romaji[n_romajis++] = aiueo[(unifying[2] - 0xaf) / 3]; + } else if (0xbe <= unifying[2] && unifying[2] <= 0xbf) { + /* U+307E HIRAGANA LETTER MA .. + * U+307F HIRAGANA LETTER MI */ + romaji[n_romajis++] = 'm'; + romaji[n_romajis++] = aiueo[(unifying[2] - 0xbe)]; + } + break; + case 0x82 : + if (0x80 <= unifying[2] && unifying[2] <= 0x82) { + /* U+3080 HIRAGANA LETTER MU .. + * U+3082 HIRAGANA LETTER MO */ + romaji[n_romajis++] = 'm'; + romaji[n_romajis++] = aiueo[(unifying[2] - 0x80) + 2]; + } else if (0x83 <= unifying[2] && unifying[2] <= 0x88) { + /* U+3083 HIRAGANA LETTER SMALL YA .. + * U+3088 HIRAGANA LETTER YO */ + if ((unifying[2] % 2) == 1) { /* SMALL */ + romaji[n_romajis++] = 'x'; + } + romaji[n_romajis++] = 'y'; + romaji[n_romajis++] = auo[(unifying[2] - 0x83) / 2]; + } else if (0x89 <= unifying[2] && unifying[2] <= 0x8d) { + /* U+3089 HIRAGANA LETTER RA .. + * U+308D HIRAGANA LETTER RO */ + romaji[n_romajis++] = 'r'; + romaji[n_romajis++] = aiueo[unifying[2] - 0x89]; + } else if (0x8e <= unifying[2] && unifying[2] <= 0x92) { + /* U+308E HIRAGANA LETTER SMALL WA .. + * U+3092 HIRAGANA LETTER WO */ + if (unifying[2] == 0x8e) { /* SMALL */ + romaji[n_romajis++] = 'x'; + } + romaji[n_romajis++] = 'w'; + romaji[n_romajis++] = aaieo[unifying[2] - 0x8e]; + } else if (unifying[2] == 0x93) { + /* U+3093 HIRAGANA LETTER N */ + romaji[n_romajis++] = 'n'; + romaji[n_romajis++] = 'n'; + } else if (unifying[2] == 0x94) { + /* U+3094 HIRAGANA LETTER VU */ + romaji[n_romajis++] = 'v'; + romaji[n_romajis++] = 'u'; + } else if (unifying[2] == 0x95) { + /* U+3095 HIRAGANA LETTER SMALL KA */ + romaji[n_romajis++] = 'x'; + romaji[n_romajis++] = 'k'; + romaji[n_romajis++] = 'a'; + } else if (unifying[2] == 0x96) { + /* U+3096 HIRAGANA LETTER SMALL KE */ + romaji[n_romajis++] = 'x'; + romaji[n_romajis++] = 'k'; + romaji[n_romajis++] = 'e'; + } else if (0xa1 <= unifying[2] && unifying[2] <= 0xaa) { + /* U+30A1 KATAKANA LETTER SMALL A .. + * U+30AA KATAKANA LETTER O */ + if ((unifying[2] % 2) == 1) { /* SMALL */ + romaji[n_romajis++] = 'x'; + } + romaji[n_romajis++] = aiueo[(unifying[2] - 0xa1) / 2]; + } else if (0xab <= unifying[2] && unifying[2] <= 0xb4) { + /* U+30AB KATAKANA LETTER KA .. + * U+30B4 KATAKANA LETTER GO */ + const char *gk = "gk"; + romaji[n_romajis++] = gk[unifying[2] % 2]; + romaji[n_romajis++] = aiueo[(unifying[2] - 0xab) / 2]; + } else if (0xb5 <= unifying[2] && unifying[2] <= 0xbe) { + /* U+30B5 KATAKANA LETTER SA .. + * U+30BE KATAKANA LETTER ZO */ + const char *zs = "zs"; + romaji[n_romajis++] = zs[unifying[2] % 2]; + romaji[n_romajis++] = aiueo[(unifying[2] - 0xb5) / 2]; + } else if (unifying[2] == 0xbf) { + /* U+30BF KATAKANA LETTER TA */ + romaji[n_romajis++] = 't'; + romaji[n_romajis++] = 'a'; + } + break; + case 0x83 : + if (0x80 <= unifying[2] && unifying[2] <= 0x89) { + /* U+30C0 KATAKANA LETTER DA .. + * U+30C9 KATAKANA LETTER DO */ + const char *aiiuuueeoo = "aiiuuueeoo"; + if (unifying[2] == 0x83) { /* SMALL */ + romaji[n_romajis++] = 'x'; + } + romaji[n_romajis++] = 't'; + romaji[n_romajis++] = aiiuuueeoo[unifying[2] - 0x80]; + } else if (0x8a <= unifying[2] && unifying[2] <= 0x8e) { + /* U+30CA KATAKANA LETTER NA .. + * U+30CE KATAKANA LETTER NO */ + romaji[n_romajis++] = 'n'; + romaji[n_romajis++] = aiueo[unifying[2] - 0x8a]; + } else if (0x8f <= unifying[2] && unifying[2] <= 0x9d) { + /* U+30CF KATAKANA LETTER HA .. + * U+30DD KATAKANA LETTER PO */ + const char *bph = "bph"; + romaji[n_romajis++] = bph[unifying[2] % 3]; + romaji[n_romajis++] = aiueo[(unifying[2] - 0x8f) / 3]; + } else if (0x9e <= unifying[2] && unifying[2] <= 0xa2) { + /* U+30DE KATAKANA LETTER MA .. + * U+30E2 KATAKANA LETTER MO */ + romaji[n_romajis++] = 'm'; + romaji[n_romajis++] = aiueo[unifying[2] - 0x9e]; + } else if (0xa3 <= unifying[2] && unifying[2] <= 0xa8) { + /* U+30E3 KATAKANA LETTER SMALL YA .. + * U+30E8 KATAKANA LETTER YO */ + if ((unifying[2] % 2) == 1) { /* SMALL */ + romaji[n_romajis++] = 'x'; + } + romaji[n_romajis++] = 'y'; + romaji[n_romajis++] = auo[(unifying[2] - 0xa3) / 2]; + } else if (0xa9 <= unifying[2] && unifying[2] <= 0xad) { + /* U+30E9 KATAKANA LETTER RA .. + * U+30ED KATAKANA LETTER RO */ + romaji[n_romajis++] = 'r'; + romaji[n_romajis++] = aiueo[unifying[2] - 0xa9]; + } else if (0xae <= unifying[2] && unifying[2] <= 0xb2) { + /* U+30EE KATAKANA LETTER SMALL WA .. + * U+30F2 KATAKANA LETTER WO */ + if (unifying[2] == 0xae) { /* SMALL */ + romaji[n_romajis++] = 'x'; + } + romaji[n_romajis++] = 'w'; + romaji[n_romajis++] = aaieo[unifying[2] - 0xae]; + } else if (unifying[2] == 0xb3) { + /* U+30F3 KATAKANA LETTER N */ + romaji[n_romajis++] = 'n'; + romaji[n_romajis++] = 'n'; + } else if (unifying[2] == 0xb4) { + /* U+30F4 KATAKANA LETTER VU */ + romaji[n_romajis++] = 'v'; + romaji[n_romajis++] = 'u'; + } else if (unifying[2] == 0xb5) { + /* U+30F5 KATAKANA LETTER SMALL KA */ + romaji[n_romajis++] = 'x'; + romaji[n_romajis++] = 'k'; + romaji[n_romajis++] = 'a'; + } else if (unifying[2] == 0xb6) { + /* U+30F6 KATAKANA LETTER SMALL KE */ + romaji[n_romajis++] = 'x'; + romaji[n_romajis++] = 'k'; + romaji[n_romajis++] = 'e'; + } else if (0xb7 <= unifying[2] && unifying[2] <= 0xba) { + /* U+30F7 KATAKANA LETTER VA .. + * U+30FA KATAKANA LETTER VO */ + static char aieo[] = "aieo"; + romaji[n_romajis++] = 'v'; + romaji[n_romajis++] = aieo[unifying[2] - 0xb7]; + } + break; + default : + break; + } + break; + default : + break; + } + + return n_romajis; +} + static void grn_nfkc_normalize_unify(grn_ctx *ctx, grn_nfkc_normalize_data *data) @@ -1203,7 +1425,8 @@ grn_nfkc_normalize_unify(grn_ctx *ctx, data->options->unify_middle_dot || data->options->unify_katakana_v_sounds || data->options->unify_katakana_bu_sound || - data->options->unify_hyphen)) { + data->options->unify_hyphen || + data->options->unify_to_romaji)) { return; } @@ -1343,32 +1566,81 @@ grn_nfkc_normalize_unify(grn_ctx *ctx, unify.c[0] += data->context.checks[i_byte]; } } else { - if (unify.d + unified_char_length >= unify.dest_end) { - grn_nfkc_normalize_context_expand(ctx, - &unify, - unified_char_length, - "[unify]"); - if (ctx->rc != GRN_SUCCESS) { - goto exit; - } + char romaji[3]; + size_t n_romajis = 0; + + if (data->options->unify_to_romaji) { + n_romajis = grn_nfkc_normalize_unify_to_romaji(ctx, + unifying, + char_type, + romaji); } - grn_memcpy(unify.d, unifying, unified_char_length); - unify.d_ = unify.d; - unify.d += unified_char_length; - unify.n_characters++; - if (unify.t) { - *(unify.t++) = char_type; - } - if (unify.c) { - size_t i; - *(unify.c++) += data->context.checks[i_byte]; - for (i = 1; i < unified_char_length; i++) { - *(unify.c++) = 0; + + if (n_romajis == 0) { + if (unify.d + unified_char_length >= unify.dest_end) { + grn_nfkc_normalize_context_expand(ctx, + &unify, + unified_char_length, + "[unify]"); + if (ctx->rc != GRN_SUCCESS) { + goto exit; + } + } + + grn_memcpy(unify.d, unifying, unified_char_length); + unify.d_ = unify.d; + unify.d += unified_char_length; + unify.n_characters++; + if (unify.t) { + *(unify.t++) = char_type; + } + if (unify.c) { + size_t i; + *(unify.c++) += data->context.checks[i_byte]; + for (i = 1; i < unified_char_length; i++) { + *(unify.c++) = 0; + } + unify.c[0] = 0; + } + if (unify.o) { + *(unify.o++) = data->context.offsets[i_character]; + } + } else { + if (unify.d + n_romajis >= unify.dest_end) { + grn_nfkc_normalize_context_expand(ctx, + &unify, + n_romajis, + "[unify][romaji]"); + if (ctx->rc != GRN_SUCCESS) { + goto exit; + } + } + + grn_memcpy(unify.d, romaji, n_romajis); + + unify.d += n_romajis; + unify.d_ = unify.d - 1; + unify.n_characters += n_romajis; + if (unify.t) { + size_t i; + for (i = 0; i < n_romajis; i++) { + *(unify.t++) = GRN_CHAR_ALPHA; + } + } + if (unify.c) { + size_t i; + *(unify.c++) += data->context.checks[i_byte]; + for (i = 1; i < n_romajis; i++) { + *(unify.c++) = -1; + } + unify.c[0] = 0; + } + if (unify.o) { + size_t i; + for (i = 0; i < n_romajis; i++) { + *(unify.o++) = data->context.offsets[i_character]; + } } - unify.c[0] = 0; - } - if (unify.o) { - *(unify.o++) = data->context.offsets[i_character]; } } Added: test/command/suite/normalizers/nfkc100/unify_to_romaji.expected (+115 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_to_romaji.expected 2018-11-05 18:36:05 +0900 (ae8fb3664) @@ -0,0 +1,115 @@ +normalize 'NormalizerNFKC100("unify_to_romaji", true, "report_source_offset", true)' "あイウェおざジたチなニぱピまミヽヾ漢字" WITH_CHECKS|WITH_TYPES +[ + [ + 0, + 0.0, + 0.0 + ], + { + "normalized": "aiuxeozazitatinanipapimamiヽヾ漢字", + "types": [ + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "alpha", + "katakana", + "katakana", + "kanji", + "kanji" + ], + "checks": [ + 3, + 3, + 3, + 3, + -1, + 3, + 3, + -1, + 3, + -1, + 3, + -1, + 3, + -1, + 3, + -1, + 3, + -1, + 3, + -1, + 3, + -1, + 3, + -1, + 3, + -1, + 3, + 0, + 0, + 3, + 0, + 0, + 3, + 0, + 0, + 3, + 0, + 0 + ], + "offsets": [ + 0, + 3, + 6, + 9, + 9, + 12, + 15, + 15, + 18, + 18, + 21, + 21, + 24, + 24, + 27, + 27, + 30, + 30, + 33, + 33, + 36, + 36, + 39, + 39, + 42, + 42, + 45, + 48, + 51, + 54 + ] + } +] Added: test/command/suite/normalizers/nfkc100/unify_to_romaji.test (+5 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/normalizers/nfkc100/unify_to_romaji.test 2018-11-05 18:36:05 +0900 (d1236b355) @@ -0,0 +1,5 @@ +normalize \ + 'NormalizerNFKC100("unify_to_romaji", true, \ + "report_source_offset", true)' \ + "あイウェおざジたチなニぱピまミヽヾ漢字" \ + WITH_CHECKS|WITH_TYPES -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181105/31b4a42e/attachment-0001.html>