groonga/groonga at 8d4e447 [master] normalizer: use GRN_CHAR_TYPE (#913) (Groonga-commit) - Groonga - fulltext search engine.

Yasuhiro Horimoto	2019-03-20 18:00:43 +0900 (Wed, 20 Mar 2019)

  Revision: 8d4e44787dc4887fde1a15e664abb41483814c0d
  https://github.com/groonga/groonga/commit/8d4e44787dc4887fde1a15e664abb41483814c0d

  Message:
    normalizer: use GRN_CHAR_TYPE (#913)
    
    * normalizer: use GRN_CHAR_TYPE
    
    char_type is sometimes ORed with the GRN_CHAR_BLANK flag, so it must be
    masked with GRN_CHAR_TYPE() before being compared against a character type.
    
    As a result, when the loaded data contains whitespace after KATAKANA,
    the data is not normalized.
    The KATAKANA character before the whitespace has GRN_CHAR_BLANK ORed into
    its char_type, so it is not recognized as KATAKANA.
    
    Fixes issue #912
    
    * normalizer: add a missing flag of character type
    
    * test: add test for unify_kana option

  Added files:
    test/command/suite/select/filter/index/match/normalizer_nfkc100_unify_kana.expected
    test/command/suite/select/filter/index/match/normalizer_nfkc100_unify_kana.test
  Modified files:
    lib/normalizer.c

  Modified: lib/normalizer.c (+8 -8)
===================================================================

--- lib/normalizer.c    2019-03-20 15:13:18 +0900 (2d8b2473f)
+++ lib/normalizer.c    2019-03-20 18:00:43 +0900 (c2b43452e)
@@ -1141,16 +1141,16 @@ grn_nfkc_normalize_unify_stateless(grn_ctx *ctx,
     }
 
     if (data->options->unify_kana &&
-        char_type == GRN_CHAR_KATAKANA &&
+        GRN_CHAR_TYPE(char_type) == GRN_CHAR_KATAKANA &&
         unified_char_length == 3) {
       unifying = grn_nfkc_normalize_unify_kana(unifying, unified_kana);
       if (unifying == unified_kana) {
-        char_type = GRN_CHAR_HIRAGANA;
+        char_type = GRN_CHAR_HIRAGANA | (char_type & GRN_CHAR_BLANK);
       }
     }
 
     if (data->options->unify_kana_case) {
-      switch (char_type) {
+      switch (GRN_CHAR_TYPE(char_type)) {
       case GRN_CHAR_HIRAGANA :
         if (unified_char_length == 3) {
           unifying = grn_nfkc_normalize_unify_hiragana_case(unifying,
@@ -1169,7 +1169,7 @@ grn_nfkc_normalize_unify_stateless(grn_ctx *ctx,
     }
 
     if (data->options->unify_kana_voiced_sound_mark) {
-      switch (char_type) {
+      switch (GRN_CHAR_TYPE(char_type)) {
       case GRN_CHAR_HIRAGANA :
         if (unified_char_length == 3) {
           unifying = grn_nfkc_normalize_unify_hiragana_voiced_sound_mark(
@@ -1191,7 +1191,7 @@ grn_nfkc_normalize_unify_stateless(grn_ctx *ctx,
       if (grn_nfkc_normalize_is_hyphen_famity(unifying, unified_char_length)) {
         unifying = unified_hyphen;
         unified_char_length = sizeof(unified_hyphen);
-        char_type = GRN_CHAR_SYMBOL;
+        char_type = GRN_CHAR_SYMBOL | (char_type & GRN_CHAR_BLANK);
       }
     }
 
@@ -1200,7 +1200,7 @@ grn_nfkc_normalize_unify_stateless(grn_ctx *ctx,
                                                             unified_char_length)) {
         unifying = unified_prolonged_sound_mark;
         unified_char_length = sizeof(unified_prolonged_sound_mark);
-        char_type = GRN_CHAR_KATAKANA;
+        char_type = GRN_CHAR_KATAKANA | (char_type & GRN_CHAR_BLANK);
       }
     }
 
@@ -1210,7 +1210,7 @@ grn_nfkc_normalize_unify_stateless(grn_ctx *ctx,
                                                             unified_char_length)) {
         unifying = unified_hyphen;
         unified_char_length = sizeof(unified_hyphen);
-        char_type = GRN_CHAR_SYMBOL;
+        char_type = GRN_CHAR_SYMBOL | (char_type & GRN_CHAR_BLANK);
       }
     }
 
@@ -1219,7 +1219,7 @@ grn_nfkc_normalize_unify_stateless(grn_ctx *ctx,
                                                   unified_char_length)) {
         unifying = unified_middle_dot;
         unified_char_length = sizeof(unified_middle_dot);
-        char_type = GRN_CHAR_SYMBOL;
+        char_type = GRN_CHAR_SYMBOL | (char_type & GRN_CHAR_BLANK);
       }
     }
 

  Added: test/command/suite/select/filter/index/match/normalizer_nfkc100_unify_kana.expected (+45 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/match/normalizer_nfkc100_unify_kana.expected    2019-03-20 18:00:43 +0900 (022d7c9a2)
@@ -0,0 +1,45 @@
+table_create Memos --flags TABLE_HASH_KEY --key_type ShortText
+[[0,0.0,0.0],true]
+column_create --table Memos --name content --type LongText
+[[0,0.0,0.0],true]
+table_create   --name Term   --flags TABLE_PAT_KEY   --key_type ShortText   --default_tokenizer TokenBigram   --normalizer "NormalizerNFKC100(\"unify_kana\", true)"
+[[0,0.0,0.0],true]
+column_create   --table Term   --name content   --flags COLUMN_INDEX|WITH_POSITION   --type Memos   --source content
+[[0,0.0,0.0],true]
+load --table Memos
+{"_key":"1", "content":"ヤマダ です"}
+[[0,0.0,0.0],1]
+select Memos --filter 'content @ "やまだ"'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "_key",
+          "ShortText"
+        ],
+        [
+          "content",
+          "LongText"
+        ]
+      ],
+      [
+        1,
+        "1",
+        "ヤマダ です"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/filter/index/match/normalizer_nfkc100_unify_kana.test (+20 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/match/normalizer_nfkc100_unify_kana.test    2019-03-20 18:00:43 +0900 (fad547466)
@@ -0,0 +1,20 @@
+table_create Memos --flags TABLE_HASH_KEY --key_type ShortText
+column_create --table Memos --name content --type LongText
+
+table_create \
+  --name Term \
+  --flags TABLE_PAT_KEY \
+  --key_type ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer "NormalizerNFKC100(\"unify_kana\", true)"
+column_create \
+  --table Term \
+  --name content \
+  --flags COLUMN_INDEX|WITH_POSITION \
+  --type Memos \
+  --source content
+
+load --table Memos
+{"_key":"1", "content":"ヤマダ です"}
+
+select Memos --filter 'content @ "やまだ"'
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190320/4fd45129/attachment-0001.html>


Groonga - fulltext search engine.

[Groonga-commit] groonga/groonga at 8d4e447 [master] normalizer: use GRN_CHAR_TYPE (#913)