[Groonga-commit] groonga/groonga [master] grn_ii_buffer: calculate the size of the packed buffer accurately.

Back to archive index

null+****@clear***** null+****@clear*****
2012年 2月 14日 (火) 18:38:20 JST


Daijiro MORI	2012-02-14 18:38:20 +0900 (Tue, 14 Feb 2012)

  New Revision: 2d42b586c8fb8c5f0c2f94128636da05328bfa6d

  Log:
    grn_ii_buffer: calculate the size of the packed buffer accurately.

  Modified files:
    lib/ii.c

  Modified: lib/ii.c (+62 -32)
===================================================================
--- lib/ii.c    2012-02-14 14:14:33 +0900 (78df2ad)
+++ lib/ii.c    2012-02-14 18:38:20 +0900 (8c3ea14)
@@ -6343,11 +6343,12 @@ const int II_BUFFER_ORDER = GRN_CURSOR_BY_ID;
 #else /* II_BUFFER_ORDER_BY_ID */
 const int II_BUFFER_ORDER = GRN_CURSOR_BY_KEY;
 #endif /* II_BUFFER_ORDER_BY_ID */
-const uint16_t II_BUFFER_NTERMS_PER_BUFFER = 16300;
+//const uint16_t II_BUFFER_NTERMS_PER_BUFFER = 16380;
+const uint16_t II_BUFFER_NTERMS_PER_BUFFER = 8190;
 const uint32_t II_BUFFER_PACKED_BUFFER_SIZE = 0x4000000;
 const char *TMPFILE_PATH = "grn_ii_buffer_tmp";
 const uint32_t II_BUFFER_NCOUNTERS_MARGIN = 0x100000;
-const size_t II_BUFFER_BLOCK_SIZE = 0x100000;
+const size_t II_BUFFER_BLOCK_SIZE = 0x1000000;
 const uint32_t II_BUFFER_BLOCK_READ_UNIT_SIZE = 0x200000;
 
 typedef struct {
@@ -6399,6 +6400,7 @@ struct _grn_ii_buffer {
   buffer *term_buffer;
   datavec data_vectors[MAX_N_ELEMENTS + 1];
   uint8_t *packed;
+  uint32_t packed_buf_size;
   uint32_t packed_len;
   uint64_t total_chunk_size;
 };
@@ -6408,6 +6410,8 @@ grn_ii_buffer_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
 {
   uint8_t *outbuf, *outbufp, *outbufp_;
   ii_buffer_block *block;
+  GRN_LOG(ctx, GRN_LOG_NOTICE, "flushing:%d npostings:%u",
+          ii_buffer->nblocks, ii_buffer->blockpos);
   outbuf = (uint8_t *)GRN_MALLOC(ii_buffer->blockpos * 7 * sizeof(uint32_t));
   /* if (!outbuf) { err } */
   outbufp_ = outbufp = outbuf;
@@ -6522,6 +6526,8 @@ grn_ii_buffer_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
     ssize_t r = write(ii_buffer->tmpfd, outbuf, outbufp - outbuf);
     if (r > 0) { ii_buffer->filepos += r; }
     block->tail = ii_buffer->filepos;
+    GRN_LOG(ctx, GRN_LOG_NOTICE, "flushed: %d encoded_size:%jdKB",
+            ii_buffer->nblocks, r >> 10);
   }
   ii_buffer->nblocks++;
   GRN_FREE(outbuf);
@@ -6668,8 +6674,12 @@ grn_ii_buffer_chunk_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer)
   ii_buffer->term_buffer->header.nterms_void = 0;
   buffer_segment_update(ii_buffer->ii, ii_buffer->lseg, ii_buffer->dseg);
   ii_buffer->ii->header->total_chunk_size += ii_buffer->packed_len;
-  ii_buffer->term_buffer = NULL;
   ii_buffer->total_chunk_size += ii_buffer->packed_len;
+  GRN_LOG(ctx, GRN_LOG_NOTICE, "nterms=%d chunk=%d total=%zuKB",
+          ii_buffer->term_buffer->header.nterms,
+          ii_buffer->term_buffer->header.chunk_size,
+          ii_buffer->ii->header->total_chunk_size >> 10);
+  ii_buffer->term_buffer = NULL;
   ii_buffer->packed = NULL;
   ii_buffer->packed_len = 0;
 }
@@ -6699,22 +6709,6 @@ grn_ii_buffer_merge(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
     uint64_t spos = 0;
     uint32_t nrecs = 0;
     uint32_t nposts = 0;
-    uint16_t nterm;
-    buffer_term *bt;
-    if (!ii_buffer->term_buffer) {
-      uint32_t lseg;
-      void *term_buffer;
-      for (lseg = 0; lseg < GRN_II_MAX_LSEG; lseg++) {
-        if (ii_buffer->ii->header->binfo[lseg] == NOT_ASSIGNED) { break; }
-      }
-      ii_buffer->lseg = lseg;
-      ii_buffer->dseg = segment_get(ctx, ii_buffer->ii);
-      GRN_IO_SEG_REF(ii_buffer->ii->seg, ii_buffer->dseg, term_buffer);
-      ii_buffer->term_buffer = (buffer *)term_buffer;
-    }
-    nterm = ii_buffer->term_buffer->header.nterms++;
-    bt = &ii_buffer->term_buffer->terms[nterm];
-    a[0] = SEG2POS(ii_buffer->lseg, (sizeof(buffer_header) + sizeof(buffer_term) * nterm));
     {
       int i;
       for (i = 0; i < nhits; i++) {
@@ -6758,24 +6752,60 @@ grn_ii_buffer_merge(grn_ctx *ctx, grn_ii_buffer *ii_buffer,
       ii_buffer->data_vectors[1].data_size = nrecs;
       ii_buffer->data_vectors[2].data_size = nposts;
 
-      ii_buffer->data_vectors[0].flags = ((nrecs < 16) || (nrecs <= (lr >> 8))) ? 0 : USE_P_ENC;
-      ii_buffer->data_vectors[1].flags = (nrecs < 3) ? 0 : USE_P_ENC;
+      ii_buffer->data_vectors[0].flags =
+        ((nrecs < 16) || (nrecs <= (lr >> 8))) ? 0 : USE_P_ENC;
+      ii_buffer->data_vectors[1].flags =
+        (nrecs < 3) ? 0 : USE_P_ENC;
       ii_buffer->data_vectors[2].flags =
         (((nposts < 32) || (nposts <= (spos >> 13))) ? 0 : USE_P_ENC)|ODD;
     }
-    if (!ii_buffer->packed) { ii_buffer->packed = GRN_MALLOC(II_BUFFER_PACKED_BUFFER_SIZE * 2); }
     {
-      int packed_len = grn_p_encv(ctx, ii_buffer->data_vectors, ii_buffer->ii->n_elements,
-                                  ii_buffer->packed + ii_buffer->packed_len);
-      bt->tid = tid;
-      bt->size_in_buffer = 0;
-      bt->pos_in_buffer = 0;
-      bt->size_in_chunk = packed_len;
-      bt->pos_in_chunk = ii_buffer->packed_len;
-      ii_buffer->packed_len += packed_len;
+      uint32_t max_size = (nrecs * 2 + nposts);
+      if (ii_buffer->packed &&
+          ii_buffer->packed_buf_size <
+          ii_buffer->packed_len + max_size) {
+        grn_ii_buffer_chunk_flush(ctx, ii_buffer);
+      }
+      if (!ii_buffer->packed) {
+        uint32_t buf_size = (max_size > II_BUFFER_PACKED_BUFFER_SIZE)
+          ? max_size : II_BUFFER_PACKED_BUFFER_SIZE;
+        if ((ii_buffer->packed = GRN_MALLOC(buf_size))) {
+          ii_buffer->packed_buf_size = buf_size;
+        }
+      }
     }
-    if (nterm == II_BUFFER_NTERMS_PER_BUFFER || ii_buffer->packed_len > II_BUFFER_PACKED_BUFFER_SIZE) {
-      grn_ii_buffer_chunk_flush(ctx, ii_buffer);
+    {
+      uint16_t nterm;
+      buffer_term *bt;
+      if (!ii_buffer->term_buffer) {
+        uint32_t lseg;
+        void *term_buffer;
+        for (lseg = 0; lseg < GRN_II_MAX_LSEG; lseg++) {
+          if (ii_buffer->ii->header->binfo[lseg] == NOT_ASSIGNED) { break; }
+        }
+        ii_buffer->lseg = lseg;
+        ii_buffer->dseg = segment_get(ctx, ii_buffer->ii);
+        GRN_IO_SEG_REF(ii_buffer->ii->seg, ii_buffer->dseg, term_buffer);
+        ii_buffer->term_buffer = (buffer *)term_buffer;
+      }
+      nterm = ii_buffer->term_buffer->header.nterms++;
+      bt = &ii_buffer->term_buffer->terms[nterm];
+      a[0] = SEG2POS(ii_buffer->lseg,
+                     (sizeof(buffer_header) + sizeof(buffer_term) * nterm));
+      {
+        int packed_len = grn_p_encv(ctx, ii_buffer->data_vectors,
+                                    ii_buffer->ii->n_elements,
+                                    ii_buffer->packed + ii_buffer->packed_len);
+        bt->tid = tid;
+        bt->size_in_buffer = 0;
+        bt->pos_in_buffer = 0;
+        bt->size_in_chunk = packed_len;
+        bt->pos_in_chunk = ii_buffer->packed_len;
+        ii_buffer->packed_len += packed_len;
+      }
+      if (ii_buffer->term_buffer->header.nterms == II_BUFFER_NTERMS_PER_BUFFER) {
+        grn_ii_buffer_chunk_flush(ctx, ii_buffer);
+      }
     }
   }
 }




Groonga-commit メーリングリストの案内
Back to archive index