Yasuhiro Horimoto 2019-03-20 18:00:43 +0900 (Wed, 20 Mar 2019) Revision: 8d4e44787dc4887fde1a15e664abb41483814c0d https://github.com/groonga/groonga/commit/8d4e44787dc4887fde1a15e664abb41483814c0d Message: normalizer: use GRN_CHAR_TYPE (#913) * normalizer: use GRN_CHAR_TYPE char_type is sometimes ORed with GRN_CHAR_BLANK, so comparing it directly against a character type can fail. As a result, when loaded data has whitespace after KATAKANA, that KATAKANA is not normalized: the KATAKANA character before the whitespace has GRN_CHAR_BLANK ORed into its char_type, so it cannot be recognized as KATAKANA. Fixes issue #912 * normalizer: add a missing flag of character type * test: add test for unify_kana option Added files: test/command/suite/select/filter/index/match/normalizer_nfkc100_unify_kana.expected test/command/suite/select/filter/index/match/normalizer_nfkc100_unify_kana.test Modified files: lib/normalizer.c Modified: lib/normalizer.c (+8 -8) =================================================================== --- lib/normalizer.c 2019-03-20 15:13:18 +0900 (2d8b2473f) +++ lib/normalizer.c 2019-03-20 18:00:43 +0900 (c2b43452e) @@ -1141,16 +1141,16 @@ grn_nfkc_normalize_unify_stateless(grn_ctx *ctx, } if (data->options->unify_kana && - char_type == GRN_CHAR_KATAKANA && + GRN_CHAR_TYPE(char_type) == GRN_CHAR_KATAKANA && unified_char_length == 3) { unifying = grn_nfkc_normalize_unify_kana(unifying, unified_kana); if (unifying == unified_kana) { - char_type = GRN_CHAR_HIRAGANA; + char_type = GRN_CHAR_HIRAGANA | (char_type & GRN_CHAR_BLANK); } } if (data->options->unify_kana_case) { - switch (char_type) { + switch (GRN_CHAR_TYPE(char_type)) { case GRN_CHAR_HIRAGANA : if (unified_char_length == 3) { unifying = grn_nfkc_normalize_unify_hiragana_case(unifying, @@ -1169,7 +1169,7 @@ grn_nfkc_normalize_unify_stateless(grn_ctx *ctx, } if (data->options->unify_kana_voiced_sound_mark) { - switch (char_type) { + switch (GRN_CHAR_TYPE(char_type)) { case GRN_CHAR_HIRAGANA : if (unified_char_length == 3) { unifying = 
grn_nfkc_normalize_unify_hiragana_voiced_sound_mark( @@ -1191,7 +1191,7 @@ grn_nfkc_normalize_unify_stateless(grn_ctx *ctx, if (grn_nfkc_normalize_is_hyphen_famity(unifying, unified_char_length)) { unifying = unified_hyphen; unified_char_length = sizeof(unified_hyphen); - char_type = GRN_CHAR_SYMBOL; + char_type = GRN_CHAR_SYMBOL | (char_type & GRN_CHAR_BLANK); } } @@ -1200,7 +1200,7 @@ grn_nfkc_normalize_unify_stateless(grn_ctx *ctx, unified_char_length)) { unifying = unified_prolonged_sound_mark; unified_char_length = sizeof(unified_prolonged_sound_mark); - char_type = GRN_CHAR_KATAKANA; + char_type = GRN_CHAR_KATAKANA | (char_type & GRN_CHAR_BLANK); } } @@ -1210,7 +1210,7 @@ grn_nfkc_normalize_unify_stateless(grn_ctx *ctx, unified_char_length)) { unifying = unified_hyphen; unified_char_length = sizeof(unified_hyphen); - char_type = GRN_CHAR_SYMBOL; + char_type = GRN_CHAR_SYMBOL | (char_type & GRN_CHAR_BLANK); } } @@ -1219,7 +1219,7 @@ grn_nfkc_normalize_unify_stateless(grn_ctx *ctx, unified_char_length)) { unifying = unified_middle_dot; unified_char_length = sizeof(unified_middle_dot); - char_type = GRN_CHAR_SYMBOL; + char_type = GRN_CHAR_SYMBOL | (char_type & GRN_CHAR_BLANK); } } Added: test/command/suite/select/filter/index/match/normalizer_nfkc100_unify_kana.expected (+45 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/match/normalizer_nfkc100_unify_kana.expected 2019-03-20 18:00:43 +0900 (022d7c9a2) @@ -0,0 +1,45 @@ +table_create Memos --flags TABLE_HASH_KEY --key_type ShortText +[[0,0.0,0.0],true] +column_create --table Memos --name content --type LongText +[[0,0.0,0.0],true] +table_create --name Term --flags TABLE_PAT_KEY --key_type ShortText --default_tokenizer TokenBigram --normalizer "NormalizerNFKC100(\"unify_kana\", true)" +[[0,0.0,0.0],true] +column_create --table Term --name content --flags COLUMN_INDEX|WITH_POSITION --type Memos --source content 
+[[0,0.0,0.0],true] +load --table Memos +{"_key":"1", "content":"ヤマダ です"} +[[0,0.0,0.0],1] +select Memos --filter 'content @ "やまだ"' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 1 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "_key", + "ShortText" + ], + [ + "content", + "LongText" + ] + ], + [ + 1, + "1", + "ヤマダ です" + ] + ] + ] +] Added: test/command/suite/select/filter/index/match/normalizer_nfkc100_unify_kana.test (+20 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/match/normalizer_nfkc100_unify_kana.test 2019-03-20 18:00:43 +0900 (fad547466) @@ -0,0 +1,20 @@ +table_create Memos --flags TABLE_HASH_KEY --key_type ShortText +column_create --table Memos --name content --type LongText + +table_create \ + --name Term \ + --flags TABLE_PAT_KEY \ + --key_type ShortText \ + --default_tokenizer TokenBigram \ + --normalizer "NormalizerNFKC100(\"unify_kana\", true)" +column_create \ + --table Term \ + --name content \ + --flags COLUMN_INDEX|WITH_POSITION \ + --type Memos \ + --source content + +load --table Memos +{"_key":"1", "content":"ヤマダ です"} + +select Memos --filter 'content @ "やまだ"' -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190320/4fd45129/attachment-0001.html>