[Groonga-commit] groonga/groonga at c60cb28 [master] TokenPattern: add a new tokenizer that extracts tokens by regular expression

Kouhei Sutou null+****@clear*****
Fri Feb 1 12:20:12 JST 2019


Kouhei Sutou	2019-02-01 12:20:12 +0900 (Fri, 01 Feb 2019)

  Revision: c60cb28b0cdd9667c420e65bc0ca9c1aa9b0624a
  https://github.com/groonga/groonga/commit/c60cb28b0cdd9667c420e65bc0ca9c1aa9b0624a

  Message:
    TokenPattern: add a new tokenizer that extracts tokens by regular expression
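
    Each substring of the normalized input that matches the regular
    expression given via the "pattern" option becomes one token. When no
    "pattern" option is given, or when nothing matches, an empty token
    list is returned (see the added test cases below).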

  Added files:
    test/command/suite/tokenizers/pattern/match.expected
    test/command/suite/tokenizers/pattern/match.test
    test/command/suite/tokenizers/pattern/no_pattern.expected
    test/command/suite/tokenizers/pattern/no_pattern.test
    test/command/suite/tokenizers/pattern/not_match.expected
    test/command/suite/tokenizers/pattern/not_match.test
  Modified files:
    lib/tokenizers.c

  Modified: lib/tokenizers.c (+221 -1)
===================================================================
--- lib/tokenizers.c    2019-02-01 10:15:09 +0900 (bc252d6df)
+++ lib/tokenizers.c    2019-02-01 12:20:12 +0900 (920141519)
@@ -1,7 +1,7 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
   Copyright(C) 2009-2018 Brazil
-  Copyright(C) 2018 Kouhei Sutou <kou****@clear*****>
+  Copyright(C) 2018-2019 Kouhei Sutou <kou****@clear*****>
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -1542,6 +1542,219 @@ regexp_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   return NULL;
 }
 
+/* pattern tokenizer */
+
+typedef struct {
+#ifdef GRN_SUPPORT_REGEXP
+  OnigRegex regex;
+#else /* GRN_SUPPORT_REGEXP */
+  void *regex;
+#endif /* GRN_SUPPORT_REGEXP */
+} grn_pattern_options;
+
+typedef struct {
+  grn_tokenizer_token token;
+  grn_tokenizer_query *query;
+  grn_pattern_options *options;
+  grn_bool have_tokenized_delimiter;
+  grn_encoding encoding;
+  const unsigned char *start;
+  const unsigned char *next;
+  const unsigned char *end;
+} grn_pattern_tokenizer;
+
+static void
+pattern_options_init(grn_pattern_options *options)
+{
+  options->regex = NULL;
+}
+
+static void *
+pattern_open_options(grn_ctx *ctx,
+                     grn_obj *tokenizer,
+                     grn_obj *raw_options,
+                     void *user_data)
+{
+  grn_pattern_options *options;
+
+  options = GRN_MALLOC(sizeof(grn_pattern_options));
+  if (!options) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[tokenizer][pattern] "
+        "failed to allocate memory for options");
+    return NULL;
+  }
+
+  pattern_options_init(options);
+  GRN_OPTION_VALUES_EACH_BEGIN(ctx, raw_options, i, name, name_length) {
+    grn_raw_string name_raw;
+    name_raw.value = name;
+    name_raw.length = name_length;
+
+    if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "pattern")) {
+#ifdef GRN_SUPPORT_REGEXP
+      const char *pattern;
+      unsigned int pattern_length;
+      grn_id domain;
+
+      pattern_length = grn_vector_get_element(ctx,
+                                              raw_options,
+                                              i,
+                                              &pattern,
+                                              NULL,
+                                              &domain);
+      if (grn_type_id_is_text_family(ctx, domain) && pattern_length > 0) {
+        if (options->regex) {
+          onig_free(options->regex);
+        }
+        options->regex = grn_onigmo_new(ctx,
+                                        pattern,
+                                        pattern_length,
+                                        GRN_ONIGMO_OPTION_DEFAULT,
+                                        GRN_ONIGMO_SYNTAX_DEFAULT,
+                                        "[tokenizer][pattern]");
+      }
+#endif /* GRN_SUPPORT_REGEXP */
+    }
+  } GRN_OPTION_VALUES_EACH_END();
+
+  return options;
+}
+
+static void
+pattern_close_options(grn_ctx *ctx, void *data)
+{
+  grn_pattern_options *options = data;
+
+#ifdef GRN_SUPPORT_REGEXP
+  if (options->regex) {
+    onig_free(options->regex);
+  }
+#endif /* GRN_SUPPORT_REGEXP */
+  GRN_FREE(options);
+}
+
+static void *
+pattern_init(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  grn_obj *lexicon = grn_tokenizer_query_get_lexicon(ctx, query);
+  grn_pattern_options *options;
+  grn_pattern_tokenizer *tokenizer;
+
+  options = grn_table_cache_default_tokenizer_options(ctx,
+                                                      lexicon,
+                                                      pattern_open_options,
+                                                      pattern_close_options,
+                                                      NULL);
+  if (ctx->rc != GRN_SUCCESS) {
+    return NULL;
+  }
+
+  if (!(tokenizer = GRN_MALLOC(sizeof(grn_pattern_tokenizer)))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[tokenizer][pattern] "
+        "memory allocation to grn_pattern_tokenizer failed");
+    return NULL;
+  }
+
+  tokenizer->query = query;
+  tokenizer->options = options;
+
+  {
+    const char *raw_string;
+    size_t raw_string_length;
+    grn_encoding encoding;
+
+    raw_string = grn_tokenizer_query_get_raw_string(ctx,
+                                                    tokenizer->query,
+                                                    &raw_string_length);
+    encoding = grn_tokenizer_query_get_encoding(ctx, tokenizer->query);
+    tokenizer->have_tokenized_delimiter =
+      grn_tokenizer_have_tokenized_delimiter(ctx,
+                                             raw_string,
+                                             raw_string_length,
+                                             encoding);
+    tokenizer->encoding = encoding;
+  }
+  {
+    grn_obj *string;
+    const char *normalized;
+    unsigned int normalized_length_in_bytes;
+
+    string = grn_tokenizer_query_get_normalized_string(ctx, tokenizer->query);
+    grn_string_get_normalized(ctx,
+                              string,
+                              &normalized, &normalized_length_in_bytes,
+                              NULL);
+    tokenizer->start = (const unsigned char *)normalized;
+    tokenizer->next = tokenizer->start;
+    tokenizer->end = tokenizer->start + normalized_length_in_bytes;
+  }
+
+  return tokenizer;
+}
+
+static void
+pattern_next(grn_ctx *ctx,
+             grn_tokenizer_query *query,
+             grn_token *token,
+             void *user_data)
+{
+  grn_pattern_tokenizer *tokenizer = user_data;
+
+  if (tokenizer->have_tokenized_delimiter) {
+    unsigned int rest_length;
+    rest_length = tokenizer->end - tokenizer->next;
+    tokenizer->next =
+      (unsigned char *)grn_tokenizer_next_by_tokenized_delimiter(
+        ctx,
+        token,
+        (const char *)tokenizer->next,
+        rest_length,
+        tokenizer->encoding);
+#ifdef GRN_SUPPORT_REGEXP
+  } else if (tokenizer->options->regex) {
+    OnigPosition position;
+    OnigRegion region;
+
+    onig_region_init(&region);
+    position = onig_search(tokenizer->options->regex,
+                           tokenizer->start,
+                           tokenizer->end,
+                           tokenizer->next,
+                           tokenizer->end,
+                           &region,
+                           ONIG_OPTION_NONE);
+    if (position == ONIG_MISMATCH) {
+      grn_token_set_data(ctx, token, NULL, 0);
+      grn_token_set_status(ctx, token, GRN_TOKEN_LAST);
+    } else {
+      grn_token_set_data(ctx,
+                         token,
+                         tokenizer->start + region.beg[0],
+                         region.end[0] - region.beg[0]);
+      grn_token_set_status(ctx, token, GRN_TOKEN_CONTINUE);
+      tokenizer->next = tokenizer->start + region.end[0];
+      onig_region_free(&region, 0);
+    }
+#endif /* GRN_SUPPORT_REGEXP */
+  } else {
+    grn_token_set_data(ctx, token, NULL, 0);
+    grn_token_set_status(ctx, token, GRN_TOKEN_LAST);
+  }
+}
+
+static void
+pattern_fin(grn_ctx *ctx, void *user_data)
+{
+  grn_pattern_tokenizer *tokenizer = user_data;
+
+  if (!tokenizer) {
+    return;
+  }
+  GRN_FREE(tokenizer);
+}
+
 /* external */
 
 grn_rc
@@ -1729,6 +1942,13 @@ grn_db_init_builtin_tokenizers(grn_ctx *ctx)
     grn_tokenizer_set_next_func(ctx, tokenizer, ngram_next);
     grn_tokenizer_set_fin_func(ctx, tokenizer, ngram_fin);
   }
+  {
+    grn_obj *tokenizer;
+    tokenizer = grn_tokenizer_create(ctx, "TokenPattern", -1);
+    grn_tokenizer_set_init_func(ctx, tokenizer, pattern_init);
+    grn_tokenizer_set_next_func(ctx, tokenizer, pattern_next);
+    grn_tokenizer_set_fin_func(ctx, tokenizer, pattern_fin);
+  }
 
   return GRN_SUCCESS;
 }
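
For readers not familiar with the Onigmo/Oniguruma API, the following standalone sketch (not part of the commit) shows the same search-and-advance loop that pattern_next() implements: onig_search() is called from a moving cursor and every match is emitted as one token. The pattern, the input text, and the -lonig link flag are assumptions for illustration only; the tokenizer itself compiles its pattern with grn_onigmo_new() and reports tokens through grn_token_set_data()/grn_token_set_status() as shown in the diff above.

/* Minimal sketch of the match-and-advance loop used by pattern_next().
 * Assumes Onigmo (or Oniguruma) headers are installed and the program is
 * linked with -lonig; the pattern and text below are illustrative only. */
#include <stdio.h>
#include <string.h>
#include <oniguruma.h> /* <onigmo.h> on installations that rename the header */

int
main(void)
{
  const OnigUChar pattern[] = "[0-9]+";
  const OnigUChar text[] = "price: 100 yen, discounted to 29 yen";
  OnigRegex regex = NULL;
  OnigErrorInfo error_info;
  OnigRegion *region;
  const OnigUChar *start = text;
  const OnigUChar *end = text + strlen((const char *)text);
  const OnigUChar *next = start;
  int rc;

  onig_init();
  rc = onig_new(&regex,
                pattern,
                pattern + strlen((const char *)pattern),
                ONIG_OPTION_DEFAULT,
                ONIG_ENCODING_UTF8,
                ONIG_SYNTAX_RUBY,
                &error_info);
  if (rc != ONIG_NORMAL) {
    OnigUChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
    onig_error_code_to_str(message, rc, &error_info);
    fprintf(stderr, "failed to compile pattern: %s\n", message);
    return 1;
  }

  region = onig_region_new();
  while (next < end) {
    /* Search from the current cursor; each hit becomes one "token". */
    OnigPosition position = onig_search(regex,
                                        start, end, /* whole string */
                                        next, end,  /* remaining range */
                                        region,
                                        ONIG_OPTION_NONE);
    if (position == ONIG_MISMATCH) {
      break; /* no more matches: the tokenizer reports GRN_TOKEN_LAST here */
    }
    printf("token: %.*s\n",
           (int)(region->end[0] - region->beg[0]),
           (const char *)(start + region->beg[0]));
    next = start + region->end[0]; /* continue right after the match */
  }
  onig_region_free(region, 1);
  onig_free(regex);
  onig_end();
  return 0;
}

With the illustrative input above this prints "token: 100" and "token: 29", mirroring what the match.test case below expects for its Japanese input.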

  Added: test/command/suite/tokenizers/pattern/match.expected (+22 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/pattern/match.expected    2019-02-01 12:20:12 +0900 (cad9e5ee2)
@@ -0,0 +1,22 @@
+tokenize   'TokenPattern("pattern", "\\\\d+円")'   "私は100円のりんごを29円で買いました。"
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "100円",
+      "position": 0,
+      "force_prefix": false,
+      "force_prefix_search": false
+    },
+    {
+      "value": "29円",
+      "position": 1,
+      "force_prefix": false,
+      "force_prefix_search": false
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/pattern/match.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/pattern/match.test    2019-02-01 12:20:12 +0900 (b9d6973d1)
@@ -0,0 +1,3 @@
+tokenize \
+  'TokenPattern("pattern", "\\\\d+円")' \
+  "私は100円のりんごを29円で買いました。"

  Added: test/command/suite/tokenizers/pattern/no_pattern.expected (+2 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/pattern/no_pattern.expected    2019-02-01 12:20:12 +0900 (97514c8f6)
@@ -0,0 +1,2 @@
+tokenize   'TokenPattern'   "This is a pen."
+[[0,0.0,0.0],[]]

  Added: test/command/suite/tokenizers/pattern/no_pattern.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/pattern/no_pattern.test    2019-02-01 12:20:12 +0900 (f20a229b8)
@@ -0,0 +1,3 @@
+tokenize \
+  'TokenPattern' \
+  "This is a pen."

  Added: test/command/suite/tokenizers/pattern/not_match.expected (+2 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/pattern/not_match.expected    2019-02-01 12:20:12 +0900 (dec2e8655)
@@ -0,0 +1,2 @@
+tokenize   'TokenPattern("pattern", "nonexistent")'   "This is a pen."
+[[0,0.0,0.0],[]]

  Added: test/command/suite/tokenizers/pattern/not_match.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/pattern/not_match.test    2019-02-01 12:20:12 +0900 (a958db0e4)
@@ -0,0 +1,3 @@
+tokenize \
+  'TokenPattern("pattern", "nonexistent")' \
+  "This is a pen."