[Groonga-commit] groonga/groonga at 9d06ca0 [master] ii regexp: support multiple ".*" in one pattern

Back to archive index

Kouhei Sutou null+****@clear*****
Thu Apr 13 11:50:06 JST 2017


Kouhei Sutou	2017-04-13 11:50:06 +0900 (Thu, 13 Apr 2017)

  New Revision: 9d06ca06a872700d361c1fbecd0b71bc2daafa82
  https://github.com/groonga/groonga/commit/9d06ca06a872700d361c1fbecd0b71bc2daafa82

  Message:
    ii regexp: support multiple ".*" in one pattern

  Added files:
    test/command/suite/select/filter/index/regexp/dot_asterisk/multiple.expected
    test/command/suite/select/filter/index/regexp/dot_asterisk/multiple.test
    test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin.expected
    test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin.test
    test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin_end.expected
    test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin_end.test
    test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_end.expected
    test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_end.test
  Modified files:
    lib/ii.c

  Modified: lib/ii.c (+62 -47)
===================================================================
--- lib/ii.c    2017-04-13 10:10:19 +0900 (2dcdeeb)
+++ lib/ii.c    2017-04-13 11:50:06 +0900 (284f472)
@@ -7712,9 +7712,11 @@ typedef struct {
   const char *string;
   unsigned int string_len;
   grn_bool done;
+  grn_ii_select_cursor_posting unshifted_posting;
+  grn_bool have_unshifted_posting;
 } grn_ii_select_cursor;
 
-grn_rc
+static grn_rc
 grn_ii_select_cursor_close(grn_ctx *ctx,
                            grn_ii_select_cursor *cursor)
 {
@@ -7738,7 +7740,7 @@ grn_ii_select_cursor_close(grn_ctx *ctx,
   return GRN_SUCCESS;
 }
 
-grn_ii_select_cursor *
+static grn_ii_select_cursor *
 grn_ii_select_cursor_open(grn_ctx *ctx,
                           grn_ii *ii,
                           const char *string,
@@ -7845,10 +7847,12 @@ grn_ii_select_cursor_open(grn_ctx *ctx,
 
   cursor->done = GRN_FALSE;
 
+  cursor->have_unshifted_posting = GRN_FALSE;
+
   return cursor;
 }
 
-grn_ii_select_cursor_posting *
+static grn_ii_select_cursor_posting *
 grn_ii_select_cursor_next(grn_ctx *ctx,
                           grn_ii_select_cursor *cursor)
 {
@@ -7859,6 +7863,11 @@ grn_ii_select_cursor_next(grn_ctx *ctx,
   int max_interval = cursor->max_interval;
   grn_operator mode = cursor->mode;
 
+  if (cursor->have_unshifted_posting) {
+    cursor->have_unshifted_posting = GRN_FALSE;
+    return &(cursor->unshifted_posting);
+  }
+
   if (cursor->done) {
     return NULL;
   }
@@ -7885,29 +7894,48 @@ grn_ii_select_cursor_next(grn_ctx *ctx,
     }
 
     if (tip == tie) {
-      int n_occurs = 0;
       int start_pos = 0;
+      int pos = 0;
       int end_pos = 0;
       int score = 0;
+      int tf = 0;
       int tscore = 0;
 
 #define SKIP_OR_BREAK(pos) {\
-  if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; }    \
+  if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; } \
   if (ti->p->rid != rid || ti->p->sid != sid) { \
     next_rid = ti->p->rid; \
     next_sid = ti->p->sid; \
     break; \
   } \
 }
+
+#define RETURN_POSTING() do { \
+  cursor->posting.rid = rid; \
+  cursor->posting.sid = sid; \
+  cursor->posting.start_pos = start_pos; \
+  cursor->posting.end_pos = end_pos; \
+  cursor->posting.tf = tf; \
+  cursor->posting.weight = tscore; \
+  if (token_info_skip_pos(ctx, *tis, rid, sid, pos) != GRN_SUCCESS) { \
+    if (token_info_skip(ctx, *tis, next_rid, next_sid) != GRN_SUCCESS) { \
+      cursor->done = GRN_TRUE; \
+    } \
+  } \
+  return &(cursor->posting); \
+} while (GRN_FALSE)
+
       if (n_tis == 1) {
-        n_occurs = (*tis)->p->tf;
+        start_pos = pos = end_pos = (*tis)->p->pos;
+        pos++;
+        tf = (*tis)->p->tf;
         tscore = (*tis)->p->weight + (*tis)->cursors->bins[0]->weight;
-        start_pos = end_pos = (*tis)->p->pos;
+        RETURN_POSTING();
       } else if (mode == GRN_OP_NEAR) {
         bt_zap(bt);
         for (tip = tis; tip < tie; tip++) {
           token_info *ti = *tip;
-          SKIP_OR_BREAK(end_pos);
+          SKIP_OR_BREAK(pos);
           bt_push(bt, ti);
         }
         if (tip == tie) {
@@ -7937,7 +7965,8 @@ grn_ii_select_cursor_next(grn_ctx *ctx,
               return NULL;
             }
             if ((max_interval < 0) || (max - min <= max_interval)) {
-              n_occurs++;
+              /* TODO: Set start_pos, pos, end_pos, tf and tscore */
+              RETURN_POSTING();
               if (ti->pos == max + 1) {
                 break;
               }
@@ -7958,41 +7987,27 @@ grn_ii_select_cursor_next(grn_ctx *ctx,
 
           if (tip == tie) { tip = tis; }
           ti = *tip;
-          SKIP_OR_BREAK(end_pos);
-          if (ti->pos == end_pos) {
+          SKIP_OR_BREAK(pos);
+          if (ti->pos == pos) {
             score += ti->p->weight + ti->cursors->bins[0]->weight;
             count++;
           } else {
             score = ti->p->weight + ti->cursors->bins[0]->weight;
             count = 1;
-            if (start_pos == 0) {
-              start_pos = ti->pos;
-            }
-            end_pos = ti->pos;
+            start_pos = pos = ti->pos;
+            end_pos = ti->p->pos;
           }
           if (count == n_tis) {
+            pos++;
+            if (ti->p->pos > end_pos) {
+              end_pos = ti->p->pos;
+            }
+            tf = 1;
             tscore += score;
-            score = 0;
-            count = 0;
-            end_pos++;
-            n_occurs++;
+            RETURN_POSTING();
           }
         }
       }
-      if (n_occurs > 0) {
-        cursor->posting.rid = rid;
-        cursor->posting.sid = sid;
-        cursor->posting.start_pos = start_pos;
-        cursor->posting.end_pos = end_pos;
-        cursor->posting.tf = n_occurs;
-        cursor->posting.weight = tscore;
-        if (token_info_skip_pos(ctx, *tis, rid, sid, end_pos + 1) != GRN_SUCCESS) {
-          if (token_info_skip(ctx, *tis, next_rid, next_sid) != GRN_SUCCESS) {
-            cursor->done = GRN_TRUE;
-          }
-        }
-        return &(cursor->posting);
-      }
 #undef SKIP_OR_BREAK
     }
     if (token_info_skip(ctx, *tis, next_rid, next_sid)) {
@@ -8001,6 +8016,15 @@ grn_ii_select_cursor_next(grn_ctx *ctx,
   }
 }
 
+static void
+grn_ii_select_cursor_unshift(grn_ctx *ctx,
+                             grn_ii_select_cursor *cursor,
+                             grn_ii_select_cursor_posting *posting)
+{
+  cursor->unshifted_posting = *posting;
+  cursor->have_unshifted_posting = GRN_TRUE;
+}
+
 static grn_rc
 grn_ii_parse_regexp_query(grn_ctx *ctx,
                           const char *log_tag,
@@ -8131,8 +8155,6 @@ grn_ii_select_regexp(grn_ctx *ctx, grn_ii *ii,
     int i;
     grn_ii_select_cursor **cursors;
     grn_bool have_error = GRN_FALSE;
-    int keep_i = 0;
-    grn_ii_select_cursor_posting keep_posting;
 
     cursors = GRN_CALLOC(sizeof(grn_ii_select_cursor *) * n_parsed_strings);
     for (i = 0; i < n_parsed_strings; i++) {
@@ -8169,26 +8191,19 @@ grn_ii_select_regexp(grn_ctx *ctx, grn_ii *ii,
         grn_ii_select_cursor_posting *posting_i;
 
         for (;;) {
-          if (keep_i == i) {
-            posting_i = &keep_posting;
-            keep_i = 0;
-          } else {
-            posting_i = grn_ii_select_cursor_next(ctx, cursors[i]);
-            if (!posting_i) {
-              break;
-            }
+          posting_i = grn_ii_select_cursor_next(ctx, cursors[i]);
+          if (!posting_i) {
+            break;
           }
 
           if (posting_i->rid == posting->rid &&
               posting_i->sid == posting->sid &&
               posting_i->start_pos > pos) {
-            keep_i = i;
-            keep_posting = *posting_i;
+            grn_ii_select_cursor_unshift(ctx, cursors[i], posting_i);
             break;
           }
           if (posting_i->rid > posting->rid) {
-            keep_i = i;
-            keep_posting = *posting_i;
+            grn_ii_select_cursor_unshift(ctx, cursors[i], posting_i);
             break;
           }
         }

  Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple.expected (+60 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple.expected    2017-04-13 11:50:06 +0900 (791c90e)
@@ -0,0 +1,60 @@
+table_create Properties TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Properties content COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create RegexpTokens TABLE_PAT_KEY ShortText   --normalizer NormalizerAuto   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION   Properties content
+[[0,0.0,0.0],true]
+load --table Properties
+[
+{"content": "app:Groonga"},
+{"content": "app:apple"},
+{"content": "project:Groonga"},
+{"content": "appname:application1"}
+]
+[[0,0.0,0.0],4]
+log_level --level info
+[[0,0.0,0.0],true]
+select Properties   --filter 'content @~ "app.*:.*pp.*"'   --output_columns content,_score
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        2
+      ],
+      [
+        [
+          "content",
+          "ShortText"
+        ],
+        [
+          "_score",
+          "Int32"
+        ]
+      ],
+      [
+        "app:apple",
+        1
+      ],
+      [
+        "appname:application1",
+        1
+      ]
+    ]
+  ]
+]
+#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content>
+#|i| grn_ii_sel > (app.*:.*pp.*)
+#|i| [ii][select][cursor][open] n=2 <app>
+#|i| [ii][select][cursor][open] n=1 <:>
+#|i| [ii][select][cursor][open] n=1 <pp>
+#|i| exact: 2
+#|i| hits=2
+log_level --level notice
+[[0,0.0,0.0],true]

  Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple.test (+26 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple.test    2017-04-13 11:50:06 +0900 (63cc4ac)
@@ -0,0 +1,26 @@
+#$GRN_II_REGEXP_DOT_ASTERISK_ENABLE=yes
+
+table_create Properties TABLE_NO_KEY
+column_create Properties content COLUMN_SCALAR ShortText
+
+table_create RegexpTokens TABLE_PAT_KEY ShortText \
+  --normalizer NormalizerAuto \
+  --default_tokenizer TokenRegexp
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
+  Properties content
+
+load --table Properties
+[
+{"content": "app:Groonga"},
+{"content": "app:apple"},
+{"content": "project:Groonga"},
+{"content": "appname:application1"}
+]
+
+log_level --level info
+#@add-important-log-levels info
+select Properties \
+  --filter 'content @~ "app.*:.*pp.*"' \
+  --output_columns content,_score
+#@remove-important-log-levels info
+log_level --level notice

  Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin.expected (+55 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin.expected    2017-04-13 11:50:06 +0900 (fccc399)
@@ -0,0 +1,55 @@
+table_create Properties TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Properties content COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create RegexpTokens TABLE_PAT_KEY ShortText   --normalizer NormalizerAuto   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION   Properties content
+[[0,0.0,0.0],true]
+load --table Properties
+[
+{"content": "app:Groonga"},
+{"content": "app:apple"},
+{"content": "project:app:apple"},
+{"content": "appname:application1"}
+]
+[[0,0.0,0.0],4]
+log_level --level info
+[[0,0.0,0.0],true]
+select Properties   --filter 'content @~ "\\\\Aapp:.*pp.*"'   --output_columns content,_score
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "content",
+          "ShortText"
+        ],
+        [
+          "_score",
+          "Int32"
+        ]
+      ],
+      [
+        "app:apple",
+        1
+      ]
+    ]
+  ]
+]
+#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content>
+#|i| grn_ii_sel > (\Aapp:.*pp.*)
+#|i| [ii][select][cursor][open] n=3 <￯app:>
+#|i| [ii][select][cursor][open] n=1 <pp>
+#|i| exact: 1
+#|i| hits=1
+log_level --level notice
+[[0,0.0,0.0],true]

  Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin.test (+26 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin.test    2017-04-13 11:50:06 +0900 (c3d5139)
@@ -0,0 +1,26 @@
+#$GRN_II_REGEXP_DOT_ASTERISK_ENABLE=yes
+
+table_create Properties TABLE_NO_KEY
+column_create Properties content COLUMN_SCALAR ShortText
+
+table_create RegexpTokens TABLE_PAT_KEY ShortText \
+  --normalizer NormalizerAuto \
+  --default_tokenizer TokenRegexp
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
+  Properties content
+
+load --table Properties
+[
+{"content": "app:Groonga"},
+{"content": "app:apple"},
+{"content": "project:app:apple"},
+{"content": "appname:application1"}
+]
+
+log_level --level info
+#@add-important-log-levels info
+select Properties \
+  --filter 'content @~ "\\\\Aapp:.*pp.*"' \
+  --output_columns content,_score
+#@remove-important-log-levels info
+log_level --level notice

  Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin_end.expected (+56 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin_end.expected    2017-04-13 11:50:06 +0900 (722e8ff)
@@ -0,0 +1,56 @@
+table_create Properties TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Properties content COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create RegexpTokens TABLE_PAT_KEY ShortText   --normalizer NormalizerAuto   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION   Properties content
+[[0,0.0,0.0],true]
+load --table Properties
+[
+{"content": "app:Groonga"},
+{"content": "app:apple"},
+{"content": "project:Groonga"},
+{"content": "appname:application1"}
+]
+[[0,0.0,0.0],4]
+log_level --level info
+[[0,0.0,0.0],true]
+select Properties   --filter 'content @~ "\\\\Aapp:.*pp.*\\\\z"'   --output_columns content,_score
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "content",
+          "ShortText"
+        ],
+        [
+          "_score",
+          "Int32"
+        ]
+      ],
+      [
+        "app:apple",
+        1
+      ]
+    ]
+  ]
+]
+#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content>
+#|i| grn_ii_sel > (\Aapp:.*pp.*\z)
+#|i| [ii][select][cursor][open] n=3 <￯app:>
+#|i| [ii][select][cursor][open] n=1 <pp>
+#|i| [ii][select][cursor][open] n=1 <￰>
+#|i| exact: 1
+#|i| hits=1
+log_level --level notice
+[[0,0.0,0.0],true]

  Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin_end.test (+26 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin_end.test    2017-04-13 11:50:06 +0900 (e5c77f2)
@@ -0,0 +1,26 @@
+#$GRN_II_REGEXP_DOT_ASTERISK_ENABLE=yes
+
+table_create Properties TABLE_NO_KEY
+column_create Properties content COLUMN_SCALAR ShortText
+
+table_create RegexpTokens TABLE_PAT_KEY ShortText \
+  --normalizer NormalizerAuto \
+  --default_tokenizer TokenRegexp
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
+  Properties content
+
+load --table Properties
+[
+{"content": "app:Groonga"},
+{"content": "app:apple"},
+{"content": "project:Groonga"},
+{"content": "appname:application1"}
+]
+
+log_level --level info
+#@add-important-log-levels info
+select Properties \
+  --filter 'content @~ "\\\\Aapp:.*pp.*\\\\z"' \
+  --output_columns content,_score
+#@remove-important-log-levels info
+log_level --level notice

  Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_end.expected (+65 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_end.expected    2017-04-13 11:50:06 +0900 (88583f9)
@@ -0,0 +1,65 @@
+table_create Properties TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Properties content COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create RegexpTokens TABLE_PAT_KEY ShortText   --normalizer NormalizerAuto   --default_tokenizer TokenRegexp
+[[0,0.0,0.0],true]
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION   Properties content
+[[0,0.0,0.0],true]
+load --table Properties
+[
+{"content": "app:Groonga"},
+{"content": "app:apple"},
+{"content": "project:Groonga"},
+{"content": "project:app:Groonga,app:Rroonga"},
+{"content": "appname:PGroonga"}
+]
+[[0,0.0,0.0],5]
+log_level --level info
+[[0,0.0,0.0],true]
+select Properties   --filter 'content @~ "app.*:.*ga\\\\z"'   --output_columns content,_score
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        3
+      ],
+      [
+        [
+          "content",
+          "ShortText"
+        ],
+        [
+          "_score",
+          "Int32"
+        ]
+      ],
+      [
+        "app:Groonga",
+        1
+      ],
+      [
+        "project:app:Groonga,app:Rroonga",
+        2
+      ],
+      [
+        "appname:PGroonga",
+        1
+      ]
+    ]
+  ]
+]
+#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content>
+#|i| grn_ii_sel > (app.*:.*ga\z)
+#|i| [ii][select][cursor][open] n=2 <app>
+#|i| [ii][select][cursor][open] n=1 <:>
+#|i| [ii][select][cursor][open] n=2 <ga￰>
+#|i| exact: 3
+#|i| hits=3
+log_level --level notice
+[[0,0.0,0.0],true]

  Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_end.test (+27 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_end.test    2017-04-13 11:50:06 +0900 (c7e6f69)
@@ -0,0 +1,27 @@
+#$GRN_II_REGEXP_DOT_ASTERISK_ENABLE=yes
+
+table_create Properties TABLE_NO_KEY
+column_create Properties content COLUMN_SCALAR ShortText
+
+table_create RegexpTokens TABLE_PAT_KEY ShortText \
+  --normalizer NormalizerAuto \
+  --default_tokenizer TokenRegexp
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
+  Properties content
+
+load --table Properties
+[
+{"content": "app:Groonga"},
+{"content": "app:apple"},
+{"content": "project:Groonga"},
+{"content": "project:app:Groonga,app:Rroonga"},
+{"content": "appname:PGroonga"}
+]
+
+log_level --level info
+#@add-important-log-levels info
+select Properties \
+  --filter 'content @~ "app.*:.*ga\\\\z"' \
+  --output_columns content,_score
+#@remove-important-log-levels info
+log_level --level notice
-------------- next part --------------
An HTML attachment was scrubbed...
下載 



More information about the Groonga-commit mailing list
Back to archive index