Kouhei Sutou
null+****@clear*****
Thu Apr 13 11:50:06 JST 2017
Kouhei Sutou 2017-04-13 11:50:06 +0900 (Thu, 13 Apr 2017) New Revision: 9d06ca06a872700d361c1fbecd0b71bc2daafa82 https://github.com/groonga/groonga/commit/9d06ca06a872700d361c1fbecd0b71bc2daafa82 Message: ii regexp: support multiple ".*" in one pattern Added files: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple.expected test/command/suite/select/filter/index/regexp/dot_asterisk/multiple.test test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin.expected test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin.test test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin_end.expected test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin_end.test test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_end.expected test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_end.test Modified files: lib/ii.c Modified: lib/ii.c (+62 -47) =================================================================== --- lib/ii.c 2017-04-13 10:10:19 +0900 (2dcdeeb) +++ lib/ii.c 2017-04-13 11:50:06 +0900 (284f472) @@ -7712,9 +7712,11 @@ typedef struct { const char *string; unsigned int string_len; grn_bool done; + grn_ii_select_cursor_posting unshifted_posting; + grn_bool have_unshifted_posting; } grn_ii_select_cursor; -grn_rc +static grn_rc grn_ii_select_cursor_close(grn_ctx *ctx, grn_ii_select_cursor *cursor) { @@ -7738,7 +7740,7 @@ grn_ii_select_cursor_close(grn_ctx *ctx, return GRN_SUCCESS; } -grn_ii_select_cursor * +static grn_ii_select_cursor * grn_ii_select_cursor_open(grn_ctx *ctx, grn_ii *ii, const char *string, @@ -7845,10 +7847,12 @@ grn_ii_select_cursor_open(grn_ctx *ctx, cursor->done = GRN_FALSE; + cursor->have_unshifted_posting = GRN_FALSE; + return cursor; } -grn_ii_select_cursor_posting * +static grn_ii_select_cursor_posting * grn_ii_select_cursor_next(grn_ctx *ctx, grn_ii_select_cursor *cursor) { @@ -7859,6 +7863,11 @@ grn_ii_select_cursor_next(grn_ctx 
*ctx, int max_interval = cursor->max_interval; grn_operator mode = cursor->mode; + if (cursor->have_unshifted_posting) { + cursor->have_unshifted_posting = GRN_FALSE; + return &(cursor->unshifted_posting); + } + if (cursor->done) { return NULL; } @@ -7885,29 +7894,48 @@ grn_ii_select_cursor_next(grn_ctx *ctx, } if (tip == tie) { - int n_occurs = 0; int start_pos = 0; + int pos = 0; int end_pos = 0; int score = 0; + int tf = 0; int tscore = 0; #define SKIP_OR_BREAK(pos) {\ - if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; } \ + if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; } \ if (ti->p->rid != rid || ti->p->sid != sid) { \ next_rid = ti->p->rid; \ next_sid = ti->p->sid; \ break; \ } \ } + +#define RETURN_POSTING() do { \ + cursor->posting.rid = rid; \ + cursor->posting.sid = sid; \ + cursor->posting.start_pos = start_pos; \ + cursor->posting.end_pos = end_pos; \ + cursor->posting.tf = tf; \ + cursor->posting.weight = tscore; \ + if (token_info_skip_pos(ctx, *tis, rid, sid, pos) != GRN_SUCCESS) { \ + if (token_info_skip(ctx, *tis, next_rid, next_sid) != GRN_SUCCESS) { \ + cursor->done = GRN_TRUE; \ + } \ + } \ + return &(cursor->posting); \ +} while (GRN_FALSE) + if (n_tis == 1) { - n_occurs = (*tis)->p->tf; + start_pos = pos = end_pos = (*tis)->p->pos; + pos++; + tf = (*tis)->p->tf; tscore = (*tis)->p->weight + (*tis)->cursors->bins[0]->weight; - start_pos = end_pos = (*tis)->p->pos; + RETURN_POSTING(); } else if (mode == GRN_OP_NEAR) { bt_zap(bt); for (tip = tis; tip < tie; tip++) { token_info *ti = *tip; - SKIP_OR_BREAK(end_pos); + SKIP_OR_BREAK(pos); bt_push(bt, ti); } if (tip == tie) { @@ -7937,7 +7965,8 @@ grn_ii_select_cursor_next(grn_ctx *ctx, return NULL; } if ((max_interval < 0) || (max - min <= max_interval)) { - n_occurs++; + /* TODO: Set start_pos, pos, end_pos, tf and tscore */ + RETURN_POSTING(); if (ti->pos == max + 1) { break; } @@ -7958,41 +7987,27 @@ grn_ii_select_cursor_next(grn_ctx *ctx, if (tip == tie) { tip = tis; } ti = 
*tip; - SKIP_OR_BREAK(end_pos); - if (ti->pos == end_pos) { + SKIP_OR_BREAK(pos); + if (ti->pos == pos) { score += ti->p->weight + ti->cursors->bins[0]->weight; count++; } else { score = ti->p->weight + ti->cursors->bins[0]->weight; count = 1; - if (start_pos == 0) { - start_pos = ti->pos; - } - end_pos = ti->pos; + start_pos = pos = ti->pos; + end_pos = ti->p->pos; } if (count == n_tis) { + pos++; + if (ti->p->pos > end_pos) { + end_pos = ti->p->pos; + } + tf = 1; tscore += score; - score = 0; - count = 0; - end_pos++; - n_occurs++; + RETURN_POSTING(); } } } - if (n_occurs > 0) { - cursor->posting.rid = rid; - cursor->posting.sid = sid; - cursor->posting.start_pos = start_pos; - cursor->posting.end_pos = end_pos; - cursor->posting.tf = n_occurs; - cursor->posting.weight = tscore; - if (token_info_skip_pos(ctx, *tis, rid, sid, end_pos + 1) != GRN_SUCCESS) { - if (token_info_skip(ctx, *tis, next_rid, next_sid) != GRN_SUCCESS) { - cursor->done = GRN_TRUE; - } - } - return &(cursor->posting); - } #undef SKIP_OR_BREAK } if (token_info_skip(ctx, *tis, next_rid, next_sid)) { @@ -8001,6 +8016,15 @@ grn_ii_select_cursor_next(grn_ctx *ctx, } } +static void +grn_ii_select_cursor_unshift(grn_ctx *ctx, + grn_ii_select_cursor *cursor, + grn_ii_select_cursor_posting *posting) +{ + cursor->unshifted_posting = *posting; + cursor->have_unshifted_posting = GRN_TRUE; +} + static grn_rc grn_ii_parse_regexp_query(grn_ctx *ctx, const char *log_tag, @@ -8131,8 +8155,6 @@ grn_ii_select_regexp(grn_ctx *ctx, grn_ii *ii, int i; grn_ii_select_cursor **cursors; grn_bool have_error = GRN_FALSE; - int keep_i = 0; - grn_ii_select_cursor_posting keep_posting; cursors = GRN_CALLOC(sizeof(grn_ii_select_cursor *) * n_parsed_strings); for (i = 0; i < n_parsed_strings; i++) { @@ -8169,26 +8191,19 @@ grn_ii_select_regexp(grn_ctx *ctx, grn_ii *ii, grn_ii_select_cursor_posting *posting_i; for (;;) { - if (keep_i == i) { - posting_i = &keep_posting; - keep_i = 0; - } else { - posting_i = 
grn_ii_select_cursor_next(ctx, cursors[i]); - if (!posting_i) { - break; - } + posting_i = grn_ii_select_cursor_next(ctx, cursors[i]); + if (!posting_i) { + break; } if (posting_i->rid == posting->rid && posting_i->sid == posting->sid && posting_i->start_pos > pos) { - keep_i = i; - keep_posting = *posting_i; + grn_ii_select_cursor_unshift(ctx, cursors[i], posting_i); break; } if (posting_i->rid > posting->rid) { - keep_i = i; - keep_posting = *posting_i; + grn_ii_select_cursor_unshift(ctx, cursors[i], posting_i); break; } } Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple.expected (+60 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple.expected 2017-04-13 11:50:06 +0900 (791c90e) @@ -0,0 +1,60 @@ +table_create Properties TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Properties content COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +table_create RegexpTokens TABLE_PAT_KEY ShortText --normalizer NormalizerAuto --default_tokenizer TokenRegexp +[[0,0.0,0.0],true] +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Properties content +[[0,0.0,0.0],true] +load --table Properties +[ +{"content": "app:Groonga"}, +{"content": "app:apple"}, +{"content": "project:Groonga"}, +{"content": "appname:application1"} +] +[[0,0.0,0.0],4] +log_level --level info +[[0,0.0,0.0],true] +select Properties --filter 'content @~ "app.*:.*pp.*"' --output_columns content,_score +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 2 + ], + [ + [ + "content", + "ShortText" + ], + [ + "_score", + "Int32" + ] + ], + [ + "app:apple", + 1 + ], + [ + "appname:application1", + 1 + ] + ] + ] +] +#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content> +#|i| grn_ii_sel > (app.*:.*pp.*) +#|i| [ii][select][cursor][open] n=2 <app> +#|i| [ii][select][cursor][open] n=1 <:> +#|i| [ii][select][cursor][open] n=1 <pp> +#|i| exact: 2 +#|i| hits=2 
+log_level --level notice +[[0,0.0,0.0],true] Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple.test (+26 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple.test 2017-04-13 11:50:06 +0900 (63cc4ac) @@ -0,0 +1,26 @@ +#$GRN_II_REGEXP_DOT_ASTERISK_ENABLE=yes + +table_create Properties TABLE_NO_KEY +column_create Properties content COLUMN_SCALAR ShortText + +table_create RegexpTokens TABLE_PAT_KEY ShortText \ + --normalizer NormalizerAuto \ + --default_tokenizer TokenRegexp +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \ + Properties content + +load --table Properties +[ +{"content": "app:Groonga"}, +{"content": "app:apple"}, +{"content": "project:Groonga"}, +{"content": "appname:application1"} +] + +log_level --level info +#@add-important-log-levels info +select Properties \ + --filter 'content @~ "app.*:.*pp.*"' \ + --output_columns content,_score +#@remove-important-log-levels info +log_level --level notice Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin.expected (+55 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin.expected 2017-04-13 11:50:06 +0900 (fccc399) @@ -0,0 +1,55 @@ +table_create Properties TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Properties content COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +table_create RegexpTokens TABLE_PAT_KEY ShortText --normalizer NormalizerAuto --default_tokenizer TokenRegexp +[[0,0.0,0.0],true] +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Properties content +[[0,0.0,0.0],true] +load --table Properties +[ +{"content": "app:Groonga"}, +{"content": "app:apple"}, +{"content": "project:app:apple"}, +{"content": "appname:application1"} +] +[[0,0.0,0.0],4] +log_level --level info 
+[[0,0.0,0.0],true] +select Properties --filter 'content @~ "\\\\Aapp:.*pp.*"' --output_columns content,_score +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 1 + ], + [ + [ + "content", + "ShortText" + ], + [ + "_score", + "Int32" + ] + ], + [ + "app:apple", + 1 + ] + ] + ] +] +#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content> +#|i| grn_ii_sel > (\Aapp:.*pp.*) +#|i| [ii][select][cursor][open] n=3 <app:> +#|i| [ii][select][cursor][open] n=1 <pp> +#|i| exact: 1 +#|i| hits=1 +log_level --level notice +[[0,0.0,0.0],true] Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin.test (+26 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin.test 2017-04-13 11:50:06 +0900 (c3d5139) @@ -0,0 +1,26 @@ +#$GRN_II_REGEXP_DOT_ASTERISK_ENABLE=yes + +table_create Properties TABLE_NO_KEY +column_create Properties content COLUMN_SCALAR ShortText + +table_create RegexpTokens TABLE_PAT_KEY ShortText \ + --normalizer NormalizerAuto \ + --default_tokenizer TokenRegexp +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \ + Properties content + +load --table Properties +[ +{"content": "app:Groonga"}, +{"content": "app:apple"}, +{"content": "project:app:apple"}, +{"content": "appname:application1"} +] + +log_level --level info +#@add-important-log-levels info +select Properties \ + --filter 'content @~ "\\\\Aapp:.*pp.*"' \ + --output_columns content,_score +#@remove-important-log-levels info +log_level --level notice Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin_end.expected (+56 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin_end.expected 2017-04-13 11:50:06 +0900 (722e8ff) @@ -0,0 +1,56 @@ +table_create Properties TABLE_NO_KEY +[[0,0.0,0.0],true] 
+column_create Properties content COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +table_create RegexpTokens TABLE_PAT_KEY ShortText --normalizer NormalizerAuto --default_tokenizer TokenRegexp +[[0,0.0,0.0],true] +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Properties content +[[0,0.0,0.0],true] +load --table Properties +[ +{"content": "app:Groonga"}, +{"content": "app:apple"}, +{"content": "project:Groonga"}, +{"content": "appname:application1"} +] +[[0,0.0,0.0],4] +log_level --level info +[[0,0.0,0.0],true] +select Properties --filter 'content @~ "\\\\Aapp:.*pp.*\\\\z"' --output_columns content,_score +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 1 + ], + [ + [ + "content", + "ShortText" + ], + [ + "_score", + "Int32" + ] + ], + [ + "app:apple", + 1 + ] + ] + ] +] +#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content> +#|i| grn_ii_sel > (\Aapp:.*pp.*\z) +#|i| [ii][select][cursor][open] n=3 <app:> +#|i| [ii][select][cursor][open] n=1 <pp> +#|i| [ii][select][cursor][open] n=1 <> +#|i| exact: 1 +#|i| hits=1 +log_level --level notice +[[0,0.0,0.0],true] Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin_end.test (+26 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_begin_end.test 2017-04-13 11:50:06 +0900 (e5c77f2) @@ -0,0 +1,26 @@ +#$GRN_II_REGEXP_DOT_ASTERISK_ENABLE=yes + +table_create Properties TABLE_NO_KEY +column_create Properties content COLUMN_SCALAR ShortText + +table_create RegexpTokens TABLE_PAT_KEY ShortText \ + --normalizer NormalizerAuto \ + --default_tokenizer TokenRegexp +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \ + Properties content + +load --table Properties +[ +{"content": "app:Groonga"}, +{"content": "app:apple"}, +{"content": "project:Groonga"}, +{"content": "appname:application1"} +] + +log_level --level info +#@add-important-log-levels 
info +select Properties \ + --filter 'content @~ "\\\\Aapp:.*pp.*\\\\z"' \ + --output_columns content,_score +#@remove-important-log-levels info +log_level --level notice Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_end.expected (+65 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_end.expected 2017-04-13 11:50:06 +0900 (88583f9) @@ -0,0 +1,65 @@ +table_create Properties TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Properties content COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +table_create RegexpTokens TABLE_PAT_KEY ShortText --normalizer NormalizerAuto --default_tokenizer TokenRegexp +[[0,0.0,0.0],true] +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Properties content +[[0,0.0,0.0],true] +load --table Properties +[ +{"content": "app:Groonga"}, +{"content": "app:apple"}, +{"content": "project:Groonga"}, +{"content": "project:app:Groonga,app:Rroonga"}, +{"content": "appname:PGroonga"} +] +[[0,0.0,0.0],5] +log_level --level info +[[0,0.0,0.0],true] +select Properties --filter 'content @~ "app.*:.*ga\\\\z"' --output_columns content,_score +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 3 + ], + [ + [ + "content", + "ShortText" + ], + [ + "_score", + "Int32" + ] + ], + [ + "app:Groonga", + 1 + ], + [ + "project:app:Groonga,app:Rroonga", + 2 + ], + [ + "appname:PGroonga", + 1 + ] + ] + ] +] +#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content> +#|i| grn_ii_sel > (app.*:.*ga\z) +#|i| [ii][select][cursor][open] n=2 <app> +#|i| [ii][select][cursor][open] n=1 <:> +#|i| [ii][select][cursor][open] n=2 <ga> +#|i| exact: 3 +#|i| hits=3 +log_level --level notice +[[0,0.0,0.0],true] Added: test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_end.test (+27 -0) 100644 =================================================================== --- /dev/null +++ 
test/command/suite/select/filter/index/regexp/dot_asterisk/multiple_end.test 2017-04-13 11:50:06 +0900 (c7e6f69) @@ -0,0 +1,27 @@ +#$GRN_II_REGEXP_DOT_ASTERISK_ENABLE=yes + +table_create Properties TABLE_NO_KEY +column_create Properties content COLUMN_SCALAR ShortText + +table_create RegexpTokens TABLE_PAT_KEY ShortText \ + --normalizer NormalizerAuto \ + --default_tokenizer TokenRegexp +column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \ + Properties content + +load --table Properties +[ +{"content": "app:Groonga"}, +{"content": "app:apple"}, +{"content": "project:Groonga"}, +{"content": "project:app:Groonga,app:Rroonga"}, +{"content": "appname:PGroonga"} +] + +log_level --level info +#@add-important-log-levels info +select Properties \ + --filter 'content @~ "app.*:.*ga\\\\z"' \ + --output_columns content,_score +#@remove-important-log-levels info +log_level --level notice -------------- next part -------------- HTMLの添付ファイルを保管しました... 下載