[Groonga-commit] groonga/groonga at b8f1692 [ii-similar-search-use-tf-idf-like-score] ii: use TF-IDF like score for similar search

Back to archive index

Kouhei Sutou null+****@clear*****
Tue Apr 25 17:54:21 JST 2017


Kouhei Sutou	2017-04-25 17:54:21 +0900 (Tue, 25 Apr 2017)

  New Revision: b8f16925caffb473f052380c31b6f392e663635d
  https://github.com/groonga/groonga/commit/b8f16925caffb473f052380c31b6f392e663635d

  Message:
    ii: use TF-IDF like score for similar search

  Modified files:
    lib/ii.c

  Modified: lib/ii.c (+14 -5)
===================================================================
--- lib/ii.c    2017-04-25 14:59:11 +0900 (be51cb8)
+++ lib/ii.c    2017-04-25 17:54:21 +0900 (e20252f)
@@ -7460,7 +7460,8 @@ grn_ii_similar_search(grn_ctx *ctx, grn_ii *ii,
                       const char *string, unsigned int string_len,
                       grn_hash *s, grn_operator op, grn_select_optarg *optarg)
 {
-  int *w1, limit;
+  double *w1;
+  int limit;
   grn_id tid, *tp, max_size;
   grn_rc rc = GRN_SUCCESS;
   grn_hash *h;
@@ -7470,7 +7471,7 @@ grn_ii_similar_search(grn_ctx *ctx, grn_ii *ii,
   if (!lexicon || !ii || !string || !string_len || !s || !optarg) {
     return GRN_INVALID_ARGUMENT;
   }
-  if (!(h = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(int), 0))) {
+  if (!(h = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(double), 0))) {
     return GRN_NO_MEMORY_AVAILABLE;
   }
   if (!(token_cursor = grn_token_cursor_open(ctx, lexicon, string, string_len,
@@ -7478,12 +7479,20 @@ grn_ii_similar_search(grn_ctx *ctx, grn_ii *ii,
     grn_hash_close(ctx, h);
     return GRN_NO_MEMORY_AVAILABLE;
   }
-  if (!(max_size = optarg->max_size)) { max_size = 1048576; }
+  if (!(max_size = optarg->max_size)) {
+    grn_obj *source_table;
+    source_table = grn_ctx_at(ctx, grn_obj_get_range(ctx, (grn_obj *)ii));
+    if (source_table) {
+      max_size = grn_table_size(ctx, source_table);
+    } else {
+      max_size = 1048576;
+    }
+  }
   while (token_cursor->status != GRN_TOKEN_CURSOR_DONE &&
          token_cursor->status != GRN_TOKEN_CURSOR_DONE_SKIP) {
     if ((tid = grn_token_cursor_next(ctx, token_cursor))) {
       if (grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **)&w1, NULL)) {
-        (*w1)++;
+        (*w1) += 1;
       }
     }
     if (tid && token_cursor->curr_size) {
@@ -7513,7 +7522,7 @@ grn_ii_similar_search(grn_ctx *ctx, grn_ii *ii,
       uint32_t es;
       grn_hash_cursor_get_key_value(ctx, c, (void **) &tp, NULL, (void **) &w1);
       if ((es = grn_ii_estimate_size(ctx, ii, *tp))) {
-        *w1 += max_size / es;
+        *w1 *= (double)es / (double)max_size;
       } else {
         grn_hash_cursor_delete(ctx, c, NULL);
       }
-------------- next part --------------
An HTML attachment was scrubbed...
下載 



More information about the Groonga-commit mailing list
Back to archive index