groonga/groonga at 87805b9 [master] normalizer: support normalizer options (Groonga-commit) - Groonga - fulltext search engine.

Kouhei Sutou	2018-04-11 16:41:39 +0900 (Wed, 11 Apr 2018)

  New Revision: 87805b9bc59dd11341fd25993e2385e8195438b4
  https://github.com/groonga/groonga/commit/87805b9bc59dd11341fd25993e2385e8195438b4

  Message:
    normalizer: support normalizer options
    
    NormalizerNFKC100 supports "unify_kana" option.

  Added files:
    test/command/suite/normalizers/nfkc100/unify_kana.expected
    test/command/suite/normalizers/nfkc100/unify_kana.test
  Modified files:
    lib/normalizer.c
    lib/proc/proc_lexicon.c
    lib/proc/proc_normalize.c

  Modified: lib/normalizer.c (+133 -23)
===================================================================

--- lib/normalizer.c    2018-04-11 16:41:12 +0900 (8ca5aa18e)
+++ lib/normalizer.c    2018-04-11 16:41:39 +0900 (cf2db157f)
@@ -20,6 +20,7 @@
 
 #include "grn_normalizer.h"
 #include "grn_string.h"
+#include "grn_raw_string.h"
 #include "grn_nfkc.h"
 #include <groonga/normalizer.h>
 #include <groonga/tokenizer.h>
@@ -615,12 +616,29 @@ typedef const char *(*grn_nfkc_decompose_func)(const unsigned char *utf8);
 typedef const char *(*grn_nfkc_compose_func)(const unsigned char *prefix_utf8,
                                              const unsigned char *suffix_utf8);
 
+typedef struct {
+  grn_nfkc_char_type_func char_type_func;
+  grn_nfkc_decompose_func decompose_func;
+  grn_nfkc_compose_func compose_func;
+  grn_bool unify_kana;
+} grn_utf8_normalize_options;
+
+static void
+utf8_normalize_options_init(grn_utf8_normalize_options *options,
+                            grn_nfkc_char_type_func char_type_func,
+                            grn_nfkc_decompose_func decompose_func,
+                            grn_nfkc_compose_func compose_func)
+{
+  options->char_type_func = char_type_func;
+  options->decompose_func = decompose_func;
+  options->compose_func = compose_func;
+  options->unify_kana = GRN_FALSE;
+}
+
 grn_inline static grn_obj *
 utf8_normalize(grn_ctx *ctx,
                grn_string *nstr,
-               grn_nfkc_char_type_func char_type_func,
-               grn_nfkc_decompose_func decompose_func,
-               grn_nfkc_compose_func compose_func)
+               grn_utf8_normalize_options *options)
 {
   int16_t *ch;
   const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
@@ -668,13 +686,13 @@ utf8_normalize(grn_ctx *ctx,
                                              GRN_ENC_UTF8)) {
       continue;
     }
-    if ((p = (unsigned char *)decompose_func(s))) {
+    if ((p = (unsigned char *)options->decompose_func(s))) {
       pe = p + strlen((char *)p);
     } else {
       p = s;
       pe = p + ls;
     }
-    if (d_ && (p2 = (unsigned char *)compose_func(d_, p))) {
+    if (d_ && (p2 = (unsigned char *)options->compose_func(d_, p))) {
       p = p2;
       pe = p + strlen((char *)p);
       if (cp) { cp--; }
@@ -694,6 +712,8 @@ utf8_normalize(grn_ctx *ctx,
       if ((*p == ' ' && removeblankp) || *p < 0x20  /* skip unprintable ascii */ ) {
         if (cp > nstr->ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
       } else {
+        grn_char_type char_type;
+
         if (de <= d + lp) {
           unsigned char *normalized;
           ds += (ds >> 1) + lp;
@@ -735,11 +755,32 @@ utf8_normalize(grn_ctx *ctx,
             nstr->ctypes = ctypes;
           }
         }
-        grn_memcpy(d, p, lp);
+        char_type = options->char_type_func(p);
+        if (options->unify_kana && char_type == GRN_CHAR_KATAKANA) {
+          if (lp == 3 &&
+              p[0] == 0xe3 &&
+              /* U+30A1 KATAKANA LETTER SMALL A ..
+               * U+30F6 KATAKANA LETTER SMALL KE
+               *
+               * U+30FD KATAKANA ITERATION MARK ..
+               * U+30F6 KATAKANA LETTER SMALL KE */
+              ((p[1] == 0x82 && 0xa1 <= p[2]) ||
+               (p[1] == 0x83 && p[2] <= 0xb6) ||
+               (p[1] == 0x83 && (0xbd <= p[2] && p[2] <= 0xbe)))) {
+            d[0] = p[0];
+            d[1] = p[1] - 1;
+            d[2] = p[2] ^ 0x20;
+            char_type = GRN_CHAR_HIRAGANA;
+          } else {
+            grn_memcpy(d, p, lp);
+          }
+        } else {
+          grn_memcpy(d, p, lp);
+        }
         d_ = d;
         d += lp;
         length++;
-        if (cp) { *cp++ = char_type_func(p); }
+        if (cp) { *cp++ = char_type; }
         if (ch) {
           size_t i;
           if (s_ == s + ls) {
@@ -1133,11 +1174,14 @@ auto_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     break;
   case GRN_ENC_UTF8 :
 #ifdef GRN_WITH_NFKC
-    utf8_normalize(ctx,
-                   string,
-                   grn_nfkc_char_type,
-                   grn_nfkc_decompose,
-                   grn_nfkc_compose);
+    {
+      grn_utf8_normalize_options options;
+      utf8_normalize_options_init(&options,
+                                  grn_nfkc_char_type,
+                                  grn_nfkc_decompose,
+                                  grn_nfkc_compose);
+      utf8_normalize(ctx, string, &options);
+    }
 #else /* GRN_WITH_NFKC */
     ascii_normalize(ctx, string);
 #endif /* GRN_WITH_NFKC */
@@ -1163,23 +1207,89 @@ static grn_obj *
 nfkc51_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
   grn_string *string = (grn_string *)(args[0]);
-  utf8_normalize(ctx,
-                 string,
-                 grn_nfkc50_char_type,
-                 grn_nfkc50_decompose,
-                 grn_nfkc50_compose);
+  grn_utf8_normalize_options options;
+
+  utf8_normalize_options_init(&options,
+                              grn_nfkc50_char_type,
+                              grn_nfkc50_decompose,
+                              grn_nfkc50_compose);
+  utf8_normalize(ctx, string, &options);
   return NULL;
 }
 
+static void *
+nfkc100_open_options(grn_ctx *ctx,
+                     grn_obj *string,
+                     grn_obj *raw_options,
+                     void *user_data)
+{
+  grn_utf8_normalize_options *options;
+
+  options = GRN_MALLOC(sizeof(grn_utf8_normalize_options));
+  if (!options) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[normalizer][nfkc100] "
+        "failed to allocate memory for options");
+    return NULL;
+  }
+
+  utf8_normalize_options_init(options,
+                              grn_nfkc100_char_type,
+                              grn_nfkc100_decompose,
+                              grn_nfkc100_compose);
+
+  GRN_OPTION_VALUES_EACH_BEGIN(ctx, raw_options, i, name, name_length) {
+    grn_raw_string name_raw;
+    name_raw.value = name;
+    name_raw.length = name_length;
+
+    if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_kana")) {
+      options->unify_kana = grn_vector_get_element_bool(ctx,
+                                                        raw_options,
+                                                        i,
+                                                        options->unify_kana);
+    }
+  } GRN_OPTION_VALUES_EACH_END();
+
+  return options;
+}
+
+static void
+nfkc100_close_options(grn_ctx *ctx, void *data)
+{
+  grn_utf8_normalize_options *options = data;
+  GRN_FREE(options);
+}
+
 static grn_obj *
 nfkc100_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
-  grn_string *string = (grn_string *)(args[0]);
-  utf8_normalize(ctx,
-                 string,
-                 grn_nfkc100_char_type,
-                 grn_nfkc100_decompose,
-                 grn_nfkc100_compose);
+  grn_obj *string = args[0];
+  grn_string *string_ = (grn_string *)string;
+  grn_obj *table;
+  grn_utf8_normalize_options *options;
+  grn_utf8_normalize_options options_raw;
+
+  table = grn_string_get_table(ctx, string);
+  if (table) {
+    options = grn_table_cache_normalizer_options(ctx,
+                                                 table,
+                                                 string,
+                                                 nfkc100_open_options,
+                                                 nfkc100_close_options,
+                                                 NULL);
+    if (ctx->rc != GRN_SUCCESS) {
+      return NULL;
+    }
+  } else {
+    utf8_normalize_options_init(&options_raw,
+                                grn_nfkc100_char_type,
+                                grn_nfkc100_decompose,
+                                grn_nfkc100_compose);
+    options = &options_raw;
+  }
+
+  utf8_normalize(ctx, string_, options);
   return NULL;
 }
 #endif /* GRN_WITH_NFKC */

  Modified: lib/proc/proc_lexicon.c (+27 -35)
===================================================================
--- lib/proc/proc_lexicon.c    2018-04-11 16:41:12 +0900 (ca5bbcba1)
+++ lib/proc/proc_lexicon.c    2018-04-11 16:41:39 +0900 (a74c2aec6)
@@ -29,35 +29,6 @@ grn_proc_lexicon_open(grn_ctx *ctx,
                       const char *context_tag)
 {
   grn_obj *lexicon;
-  grn_obj *normalizer = NULL;
-
-  if (normalizer_raw->length > 0) {
-    normalizer = grn_ctx_get(ctx,
-                             normalizer_raw->value,
-                             normalizer_raw->length);
-    if (!normalizer) {
-      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
-                       "%s nonexistent normalizer: <%.*s>",
-                       context_tag,
-                       (int)normalizer_raw->length,
-                       normalizer_raw->value);
-      return NULL;
-    }
-
-    if (!grn_obj_is_normalizer_proc(ctx, normalizer)) {
-      grn_obj inspected;
-      GRN_TEXT_INIT(&inspected, 0);
-      grn_inspect(ctx, &inspected, normalizer);
-      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
-                       "%s not normalizer: %.*s",
-                       context_tag,
-                       (int)GRN_TEXT_LEN(&inspected),
-                       GRN_TEXT_VALUE(&inspected));
-      GRN_OBJ_FIN(ctx, &inspected);
-      grn_obj_unlink(ctx, normalizer);
-      return NULL;
-    }
-  }
 
   lexicon = grn_table_create(ctx, NULL, 0,
                              NULL,
@@ -67,7 +38,9 @@ grn_proc_lexicon_open(grn_ctx *ctx,
   {
     grn_obj tokenizer;
     GRN_TEXT_INIT(&tokenizer, GRN_OBJ_DO_SHALLOW_COPY);
-    GRN_TEXT_SET(ctx, &tokenizer, tokenizer_raw->value, tokenizer_raw->length);
+    if (tokenizer_raw) {
+      GRN_TEXT_SET(ctx, &tokenizer, tokenizer_raw->value, tokenizer_raw->length);
+    }
     grn_obj_set_info(ctx, lexicon, GRN_INFO_DEFAULT_TOKENIZER, &tokenizer);
     GRN_OBJ_FIN(ctx, &tokenizer);
   }
@@ -81,12 +54,31 @@ grn_proc_lexicon_open(grn_ctx *ctx,
                      ctx->errbuf);
     return NULL;
   }
-  if (normalizer) {
-    grn_obj_set_info(ctx, lexicon,
-                     GRN_INFO_NORMALIZER, normalizer);
-    grn_obj_unlink(ctx, normalizer);
+  {
+    grn_obj normalizer;
+    GRN_TEXT_INIT(&normalizer, GRN_OBJ_DO_SHALLOW_COPY);
+    if (normalizer_raw) {
+      GRN_TEXT_SET(ctx,
+                   &normalizer,
+                   normalizer_raw->value,
+                 normalizer_raw->length);
+    }
+    grn_obj_set_info(ctx, lexicon, GRN_INFO_NORMALIZER, &normalizer);
+    GRN_OBJ_FIN(ctx, &normalizer);
+  }
+  if (ctx->rc != GRN_SUCCESS) {
+    grn_obj_close(ctx, lexicon);
+    GRN_PLUGIN_ERROR(ctx, ctx->rc,
+                     "%s failed to set normalizer: <%.*s>: %s",
+                     context_tag,
+                     (int)(normalizer_raw->length),
+                     normalizer_raw->value,
+                     ctx->errbuf);
+    return NULL;
+  }
+  if (token_filters_raw) {
+    grn_proc_table_set_token_filters(ctx, lexicon, token_filters_raw);
   }
-  grn_proc_table_set_token_filters(ctx, lexicon, token_filters_raw);
 
   return lexicon;
 }

  Modified: lib/proc/proc_normalize.c (+45 -54)
===================================================================
--- lib/proc/proc_normalize.c    2018-04-11 16:41:12 +0900 (998a8030c)
+++ lib/proc/proc_normalize.c    2018-04-11 16:41:39 +0900 (63f061e49)
@@ -18,20 +18,17 @@
 
 #include "../grn_proc.h"
 #include "../grn_ctx.h"
-#include "../grn_token_cursor.h"
 
 #include <groonga/plugin.h>
 
 static int
-parse_normalize_flags(grn_ctx *ctx, grn_obj *flag_names)
+parse_normalize_flags(grn_ctx *ctx, grn_raw_string *flags_raw)
 {
   int flags = 0;
   const char *names, *names_end;
-  int length;
 
-  names = GRN_TEXT_VALUE(flag_names);
-  length = GRN_TEXT_LEN(flag_names);
-  names_end = names + length;
+  names = flags_raw->value;
+  names_end = names + flags_raw->length;
   while (names < names_end) {
     if (*names == '|' || *names == ' ') {
       names += 1;
@@ -64,77 +61,69 @@ parse_normalize_flags(grn_ctx *ctx, grn_obj *flag_names)
   return flags;
 }
 
-static grn_bool
-is_normalizer(grn_ctx *ctx, grn_obj *object)
-{
-  if (object->header.type != GRN_PROC) {
-    return GRN_FALSE;
-  }
-
-  if (grn_proc_get_type(ctx, object) != GRN_PROC_NORMALIZER) {
-    return GRN_FALSE;
-  }
-
-  return GRN_TRUE;
-}
-
 static grn_obj *
 command_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
-  grn_obj *normalizer_name;
-  grn_obj *string;
-  grn_obj *flag_names;
-
-  normalizer_name = grn_plugin_proc_get_var(ctx, user_data, "normalizer", -1);
-  string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
-  flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
-  if (GRN_TEXT_LEN(normalizer_name) == 0) {
-    ERR(GRN_INVALID_ARGUMENT, "normalizer name is missing");
+  const char *context_tag = "[normalize]";
+  grn_raw_string normalizer_raw;
+  grn_raw_string string_raw;
+  grn_raw_string flags_raw;
+
+#define GET_VALUE(name)                                         \
+  name ## _raw.value =                                          \
+    grn_plugin_proc_get_var_string(ctx,                         \
+                                   user_data,                   \
+                                   #name,                       \
+                                   strlen(#name),               \
+                                   &(name ## _raw.length))
+
+  GET_VALUE(normalizer);
+  GET_VALUE(string);
+  GET_VALUE(flags);
+
+#undef GET_VALUE
+
+  if (normalizer_raw.length == 0) {
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_INVALID_ARGUMENT,
+                     "%s normalizer name is missing",
+                     context_tag);
     return NULL;
   }
 
   {
-    grn_obj *normalizer;
-    grn_obj *grn_string;
     int flags;
+    grn_obj *lexicon;
+    grn_obj *grn_string;
     unsigned int normalized_length_in_bytes;
     unsigned int normalized_n_characters;
 
-    flags = parse_normalize_flags(ctx, flag_names);
-    normalizer = grn_ctx_get(ctx,
-                             GRN_TEXT_VALUE(normalizer_name),
-                             GRN_TEXT_LEN(normalizer_name));
-    if (!normalizer) {
-      ERR(GRN_INVALID_ARGUMENT,
-          "[normalize] nonexistent normalizer: <%.*s>",
-          (int)GRN_TEXT_LEN(normalizer_name),
-          GRN_TEXT_VALUE(normalizer_name));
+    flags = parse_normalize_flags(ctx, &flags_raw);
+    if (ctx->rc != GRN_SUCCESS) {
       return NULL;
     }
 
-    if (!is_normalizer(ctx, normalizer)) {
-      grn_obj inspected;
-      GRN_TEXT_INIT(&inspected, 0);
-      grn_inspect(ctx, &inspected, normalizer);
-      ERR(GRN_INVALID_ARGUMENT,
-          "[normalize] not normalizer: %.*s",
-          (int)GRN_TEXT_LEN(&inspected),
-          GRN_TEXT_VALUE(&inspected));
-      GRN_OBJ_FIN(ctx, &inspected);
-      grn_obj_unlink(ctx, normalizer);
+    lexicon = grn_proc_lexicon_open(ctx,
+                                    NULL,
+                                    &normalizer_raw,
+                                    NULL,
+                                    context_tag);
+    if (!lexicon) {
       return NULL;
     }
 
     grn_string = grn_string_open(ctx,
-                                 GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
-                                 normalizer, flags);
-    grn_obj_unlink(ctx, normalizer);
+                                 string_raw.value,
+                                 string_raw.length,
+                                 lexicon,
+                                 flags);
 
     grn_ctx_output_map_open(ctx, "RESULT", 3);
     {
       const char *normalized;
 
-      grn_string_get_normalized(ctx, grn_string,
+      grn_string_get_normalized(ctx,
+                                grn_string,
                                 &normalized,
                                 &normalized_length_in_bytes,
                                 &normalized_n_characters);
@@ -178,6 +167,8 @@ command_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d
     grn_ctx_output_map_close(ctx);
 
     grn_obj_unlink(ctx, grn_string);
+
+    grn_obj_unlink(ctx, lexicon);
   }
 
   return NULL;

  Added: test/command/suite/normalizers/nfkc100/unify_kana.expected (+23 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_kana.expected    2018-04-11 16:41:39 +0900 (d1f315517)
@@ -0,0 +1,23 @@
+normalize   'NormalizerNFKC100("unify_kana", true)'   "あイｳｪおヽヾ"   WITH_TYPES
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  {
+    "normalized": "あいうぇおゝゞ",
+    "types": [
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana",
+      "hiragana"
+    ],
+    "checks": [
+
+    ]
+  }
+]

  Added: test/command/suite/normalizers/nfkc100/unify_kana.test (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_kana.test    2018-04-11 16:41:39 +0900 (818bc56d3)
@@ -0,0 +1,4 @@
+normalize \
+  'NormalizerNFKC100("unify_kana", true)' \
+  "あイｳｪおヽヾ" \
+  WITH_TYPES
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180411/47b26cd0/attachment-0001.htm 


Groonga - fulltext search engine.

[Groonga-commit] groonga/groonga at 87805b9 [master] normalizer: support normalizer options