[Groonga-commit] groonga/groonga [master] [benchmark][normalize] added bundle with change patterns.

Back to archive index

null+****@clear***** null+****@clear*****
2012年 2月 9日 (木) 18:07:36 JST


Kouhei Sutou	2012-02-09 18:07:36 +0900 (Thu, 09 Feb 2012)

  New Revision: 202f6f64ffa9ce1777cb9376f63f4e51cea80604

  Log:
    [benchmark][normalize] added bundle with change patterns.

  Modified files:
    test/benchmark/Makefile.am
    test/benchmark/bench-normalize.c

  Modified: test/benchmark/Makefile.am (+6 -0)
===================================================================
--- test/benchmark/Makefile.am    2012-02-09 17:21:16 +0900 (453c034)
+++ test/benchmark/Makefile.am    2012-02-09 18:07:36 +0900 (732688b)
@@ -57,4 +57,10 @@ run-bench-geo-select: bench-geo-select
 	  srcdir="$(srcdir)"			\
 	  $(srcdir)/bench-geo-select.sh
 
+run-bench-normalize: bench-normalize
+	@echo $@:
+	env						\
+	  GRN_PLUGINS_DIR="$(top_builddir)/plugins"	\
+	  ./bench-normalize
+
 benchmark: $(benchmarks)

  Modified: test/benchmark/bench-normalize.c (+488 -11)
===================================================================
--- test/benchmark/bench-normalize.c    2012-02-09 17:21:16 +0900 (268c1d0)
+++ test/benchmark/bench-normalize.c    2012-02-09 18:07:36 +0900 (5b989d8)
@@ -17,15 +17,20 @@
 */
 
 /*
-  groonga: 5632ce3e39c0d8bf3c6b758d4cbf5e012cfa00b0
+  groonga: b85246df11a3dbedcae736b4879ba4daa8389116
   CFLAGS: -O3
   CPU: Intel(R) Core(TM) i5 CPU         650  @ 3.20GHz
-  % make -j8 > /dev/null && GROONGA_BENCH_N=10000 test/benchmark/bench-normalize
-
-  groonga: 5632ce3e39c0d8bf3c6b758d4cbf5e012cfa00b0
-  CFLAGS: -O3 -ggdb3
+  % make -j8 > /dev/null && (cd test/benchmark && GROONGA_BENCH_N=10000 make run-bench-normalize)
+                       (time)
+    1st: NFKC: plugin: (1.91418)
+    2nd: NFKC: plugin: (1.89913)
+    1st: NFKC: bundle: (1.94809)
+    2nd: NFKC: bundle: (2.01457)
+
+  groonga: b85246df11a3dbedcae736b4879ba4daa8389116
+  CFLAGS: -O0 -ggdb3
   CPU: Intel(R) Core(TM) i5 CPU         650  @ 3.20GHz
-  % make -j8 > /dev/null && GROONGA_BENCH_N=10000 test/benchmark/bench-normalize
+  % make -j8 > /dev/null && (cd test/benchmark && GROONGA_BENCH_N=10000 make run-bench-normalize)
                   (time)
     NFKC: plugin: (3.05917)
     NFKC: bundle: (3.13312)
@@ -63,7 +68,7 @@ enum {
 };
 
 inline static grn_obj *
-utf8_nfkc_normalize(grn_ctx *ctx, grn_str *nstr)
+utf8_nfkc_normalize_original(grn_ctx *ctx, grn_str *nstr)
 {
   int16_t *ch;
   const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
@@ -199,6 +204,419 @@ utf8_nfkc_normalize(grn_ctx *ctx, grn_str *nstr)
   return NULL;
 }
 
+inline static grn_obj *
+utf8_nfkc_normalize_short(grn_ctx *ctx, grn_str *nstr)
+{
+  short *ch;
+  const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
+  unsigned char *d, *d_, *de;
+  uint_least8_t *cp;
+  size_t length = 0, ls, lp, size = nstr->orig_blen, ds = size * 3;
+  int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
+  if (!(nstr->norm = GRN_MALLOC(ds + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[normalizer][utf8][nfkc] failed to allocate normalized text space");
+    return NULL;
+  }
+  if (nstr->flags & GRN_STR_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(ds * sizeof(short) + 1))) {
+      GRN_FREE(nstr->norm);
+      nstr->norm = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[normalizer][utf8][nfkc] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STR_WITH_CTYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) {
+      if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+      GRN_FREE(nstr->norm);
+      nstr->norm = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[normalizer][utf8][nfkc] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = nstr->ctypes;
+  d = (unsigned char *)nstr->norm;
+  de = d + ds;
+  d_ = NULL;
+  e = (unsigned char *)nstr->orig + size;
+  for (s = s_ = (unsigned char *)nstr->orig; ; s += ls) {
+    if (!(ls = grn_charlen_utf8(ctx, s, e))) {
+      break;
+    }
+    if ((p = (unsigned char *)grn_nfkc_map1(s))) {
+      pe = p + strlen((char *)p);
+    } else {
+      p = s;
+      pe = p + ls;
+    }
+    if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) {
+      p = p2;
+      pe = p + strlen((char *)p);
+      if (cp) { cp--; }
+      if (ch) {
+        ch -= (d - d_);
+        s_ = s__;
+      }
+      d = d_;
+      length--;
+    }
+    for (; ; p += lp) {
+      if (!(lp = grn_charlen_utf8(ctx, p, pe))) {
+        break;
+      }
+      if ((*p == ' ' && removeblankp) || *p < 0x20  /* skip unprintable ascii */ ) {
+        if (cp > nstr->ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+      } else {
+        if (de <= d + lp) {
+          unsigned char *norm;
+          ds += (ds >> 1) + lp;
+          if (!(norm = GRN_REALLOC(nstr->norm, ds + 1))) {
+            if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
+            if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+            GRN_FREE(nstr->norm); nstr->norm = NULL;
+            ERR(GRN_NO_MEMORY_AVAILABLE,
+                "[normalizer][utf8][nfkc] "
+                "failed to reallocate normalized text space");
+            return NULL;
+          }
+          de = norm + ds;
+          d = norm + (d - (unsigned char *)nstr->norm);
+          nstr->norm = norm;
+          if (ch) {
+            short *checks;
+            if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(short)+ 1))) {
+              if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
+              GRN_FREE(nstr->checks); nstr->checks = NULL;
+              GRN_FREE(nstr->norm); nstr->norm = NULL;
+              ERR(GRN_NO_MEMORY_AVAILABLE,
+                  "[normalizer][utf8][nfkc] "
+                  "failed to reallocate checks space");
+              return NULL;
+            }
+            ch = checks + (ch - nstr->checks);
+            nstr->checks = checks;
+          }
+          if (cp) {
+            uint_least8_t *ctypes;
+            if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) {
+              GRN_FREE(nstr->ctypes); nstr->ctypes = NULL;
+              if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+              GRN_FREE(nstr->norm); nstr->norm = NULL;
+              ERR(GRN_NO_MEMORY_AVAILABLE,
+                  "[normalizer][utf8][nfkc] "
+                  "failed to reallocate character types space");
+              return NULL;
+            }
+            cp = ctypes + (cp - nstr->ctypes);
+            nstr->ctypes = ctypes;
+          }
+        }
+        memcpy(d, p, lp);
+        d_ = d;
+        d += lp;
+        length++;
+        if (cp) { *cp++ = grn_nfkc_ctype(p); }
+        if (ch) {
+          size_t i;
+          if (s_ == s + ls) {
+            *ch++ = -1;
+          } else {
+            *ch++ = (short)(s + ls - s_);
+            s__ = s_;
+            s_ = s + ls;
+          }
+          for (i = lp; i > 1; i--) { *ch++ = 0; }
+        }
+      }
+    }
+  }
+  if (cp) { *cp = grn_str_null; }
+  *d = '\0';
+  nstr->length = length;
+  nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
+  return NULL;
+}
+
+inline static grn_obj *
+utf8_nfkc_normalize_unsigned_char(grn_ctx *ctx, grn_str *nstr)
+{
+  int16_t *ch;
+  const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
+  unsigned char *d, *d_, *de;
+  unsigned char *cp;
+  size_t length = 0, ls, lp, size = nstr->orig_blen, ds = size * 3;
+  int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
+  if (!(nstr->norm = GRN_MALLOC(ds + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[normalizer][utf8][nfkc] failed to allocate normalized text space");
+    return NULL;
+  }
+  if (nstr->flags & GRN_STR_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->norm);
+      nstr->norm = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[normalizer][utf8][nfkc] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STR_WITH_CTYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) {
+      if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+      GRN_FREE(nstr->norm);
+      nstr->norm = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[normalizer][utf8][nfkc] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = nstr->ctypes;
+  d = (unsigned char *)nstr->norm;
+  de = d + ds;
+  d_ = NULL;
+  e = (unsigned char *)nstr->orig + size;
+  for (s = s_ = (unsigned char *)nstr->orig; ; s += ls) {
+    if (!(ls = grn_charlen_utf8(ctx, s, e))) {
+      break;
+    }
+    if ((p = (unsigned char *)grn_nfkc_map1(s))) {
+      pe = p + strlen((char *)p);
+    } else {
+      p = s;
+      pe = p + ls;
+    }
+    if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) {
+      p = p2;
+      pe = p + strlen((char *)p);
+      if (cp) { cp--; }
+      if (ch) {
+        ch -= (d - d_);
+        s_ = s__;
+      }
+      d = d_;
+      length--;
+    }
+    for (; ; p += lp) {
+      if (!(lp = grn_charlen_utf8(ctx, p, pe))) {
+        break;
+      }
+      if ((*p == ' ' && removeblankp) || *p < 0x20  /* skip unprintable ascii */ ) {
+        if (cp > nstr->ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+      } else {
+        if (de <= d + lp) {
+          unsigned char *norm;
+          ds += (ds >> 1) + lp;
+          if (!(norm = GRN_REALLOC(nstr->norm, ds + 1))) {
+            if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
+            if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+            GRN_FREE(nstr->norm); nstr->norm = NULL;
+            ERR(GRN_NO_MEMORY_AVAILABLE,
+                "[normalizer][utf8][nfkc] "
+                "failed to reallocate normalized text space");
+            return NULL;
+          }
+          de = norm + ds;
+          d = norm + (d - (unsigned char *)nstr->norm);
+          nstr->norm = norm;
+          if (ch) {
+            int16_t *checks;
+            if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t)+ 1))) {
+              if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
+              GRN_FREE(nstr->checks); nstr->checks = NULL;
+              GRN_FREE(nstr->norm); nstr->norm = NULL;
+              ERR(GRN_NO_MEMORY_AVAILABLE,
+                  "[normalizer][utf8][nfkc] "
+                  "failed to reallocate checks space");
+              return NULL;
+            }
+            ch = checks + (ch - nstr->checks);
+            nstr->checks = checks;
+          }
+          if (cp) {
+            unsigned char *ctypes;
+            if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) {
+              GRN_FREE(nstr->ctypes); nstr->ctypes = NULL;
+              if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+              GRN_FREE(nstr->norm); nstr->norm = NULL;
+              ERR(GRN_NO_MEMORY_AVAILABLE,
+                  "[normalizer][utf8][nfkc] "
+                  "failed to reallocate character types space");
+              return NULL;
+            }
+            cp = ctypes + (cp - nstr->ctypes);
+            nstr->ctypes = ctypes;
+          }
+        }
+        memcpy(d, p, lp);
+        d_ = d;
+        d += lp;
+        length++;
+        if (cp) { *cp++ = grn_nfkc_ctype(p); }
+        if (ch) {
+          size_t i;
+          if (s_ == s + ls) {
+            *ch++ = -1;
+          } else {
+            *ch++ = (int16_t)(s + ls - s_);
+            s__ = s_;
+            s_ = s + ls;
+          }
+          for (i = lp; i > 1; i--) { *ch++ = 0; }
+        }
+      }
+    }
+  }
+  if (cp) { *cp = grn_str_null; }
+  *d = '\0';
+  nstr->length = length;
+  nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
+  return NULL;
+}
+
+inline static grn_obj *
+utf8_nfkc_normalize_local(grn_ctx *ctx, grn_str *nstr)
+{
+  int16_t *checks = NULL, *ch;
+  const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
+  unsigned char *d, *d_, *de;
+  uint_least8_t *ctypes = NULL, *cp;
+  size_t length = 0, ls, lp, size = nstr->orig_blen, ds = size * 3;
+  int removeblankp = nstr->flags & GRN_STR_REMOVEBLANK;
+  if (!(nstr->norm = GRN_MALLOC(ds + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[normalizer][utf8][nfkc] failed to allocate normalized text space");
+    return NULL;
+  }
+  if (nstr->flags & GRN_STR_WITH_CHECKS) {
+    if (!(checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->norm);
+      nstr->norm = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[normalizer][utf8][nfkc] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = checks;
+  if (nstr->flags & GRN_STR_WITH_CTYPES) {
+    if (!(ctypes = GRN_MALLOC(ds + 1))) {
+      if (checks) { GRN_FREE(checks); }
+      GRN_FREE(nstr->norm);
+      nstr->norm = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[normalizer][utf8][nfkc] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = ctypes;
+  d = (unsigned char *)nstr->norm;
+  de = d + ds;
+  d_ = NULL;
+  e = (unsigned char *)nstr->orig + size;
+  for (s = s_ = (unsigned char *)nstr->orig; ; s += ls) {
+    if (!(ls = grn_charlen_utf8(ctx, s, e))) {
+      break;
+    }
+    if ((p = (unsigned char *)grn_nfkc_map1(s))) {
+      pe = p + strlen((char *)p);
+    } else {
+      p = s;
+      pe = p + ls;
+    }
+    if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) {
+      p = p2;
+      pe = p + strlen((char *)p);
+      if (cp) { cp--; }
+      if (ch) {
+        ch -= (d - d_);
+        s_ = s__;
+      }
+      d = d_;
+      length--;
+    }
+    for (; ; p += lp) {
+      if (!(lp = grn_charlen_utf8(ctx, p, pe))) {
+        break;
+      }
+      if ((*p == ' ' && removeblankp) || *p < 0x20  /* skip unprintable ascii */ ) {
+        if (cp > ctypes) { *(cp - 1) |= GRN_STR_BLANK; }
+      } else {
+        if (de <= d + lp) {
+          unsigned char *norm;
+          ds += (ds >> 1) + lp;
+          if (!(norm = GRN_REALLOC(nstr->norm, ds + 1))) {
+            if (ctypes) { GRN_FREE(ctypes); }
+            if (checks) { GRN_FREE(checks); }
+            GRN_FREE(nstr->norm); nstr->norm = NULL;
+            ERR(GRN_NO_MEMORY_AVAILABLE,
+                "[normalizer][utf8][nfkc] "
+                "failed to reallocate normalized text space");
+            return NULL;
+          }
+          de = norm + ds;
+          d = norm + (d - (unsigned char *)nstr->norm);
+          nstr->norm = norm;
+          if (ch) {
+            int16_t *new_checks;
+            if (!(new_checks = GRN_REALLOC(checks, ds * sizeof(int16_t)+ 1))) {
+              if (ctypes) { GRN_FREE(ctypes); }
+              GRN_FREE(checks);
+              GRN_FREE(nstr->norm); nstr->norm = NULL;
+              ERR(GRN_NO_MEMORY_AVAILABLE,
+                  "[normalizer][utf8][nfkc] "
+                  "failed to reallocate checks space");
+              return NULL;
+            }
+            ch = new_checks + (ch - checks);
+            checks = new_checks;
+          }
+          if (cp) {
+            uint_least8_t *new_ctypes;
+            if (!(new_ctypes = GRN_REALLOC(ctypes, ds + 1))) {
+              GRN_FREE(ctypes);
+              if (checks) { GRN_FREE(checks); }
+              GRN_FREE(nstr->norm); nstr->norm = NULL;
+              ERR(GRN_NO_MEMORY_AVAILABLE,
+                  "[normalizer][utf8][nfkc] "
+                  "failed to reallocate character types space");
+              return NULL;
+            }
+            cp = new_ctypes + (cp - ctypes);
+            ctypes = new_ctypes;
+          }
+        }
+        memcpy(d, p, lp);
+        d_ = d;
+        d += lp;
+        length++;
+        if (cp) { *cp++ = grn_nfkc_ctype(p); }
+        if (ch) {
+          size_t i;
+          if (s_ == s + ls) {
+            *ch++ = -1;
+          } else {
+            *ch++ = (int16_t)(s + ls - s_);
+            s__ = s_;
+            s_ = s + ls;
+          }
+          for (i = lp; i > 1; i--) { *ch++ = 0; }
+        }
+      }
+    }
+  }
+  if (cp) { *cp = grn_str_null; }
+  *d = '\0';
+  nstr->length = length;
+  nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
+  nstr->checks = checks;
+  nstr->ctypes = ctypes;
+  return NULL;
+}
+
 #include "lib/benchmark.h"
 
 #define GET(context, name) (grn_ctx_get(context, name, strlen(name)))
@@ -286,7 +704,58 @@ bench_plugin(gpointer user_data)
 }
 
 static void
-bench_bundle(gpointer user_data)
+bench_bundle_original(gpointer user_data)
+{
+  BenchmarkData *data = user_data;
+  grn_ctx *ctx = data->context;
+  data->nstr = GRN_MALLOC(sizeof(grn_str));
+  data->nstr->orig = text;
+  data->nstr->orig_blen = text_length;
+  data->nstr->checks = NULL;
+  data->nstr->ctypes = NULL;
+  data->nstr->flags =
+    GRN_STR_REMOVEBLANK |
+    GRN_STR_WITH_TYPES |
+    GRN_STR_WITH_CHECKS;
+  utf8_nfkc_normalize_original(data->context, data->nstr);
+}
+
+static void
+bench_bundle_short(gpointer user_data)
+{
+  BenchmarkData *data = user_data;
+  grn_ctx *ctx = data->context;
+  data->nstr = GRN_MALLOC(sizeof(grn_str));
+  data->nstr->orig = text;
+  data->nstr->orig_blen = text_length;
+  data->nstr->checks = NULL;
+  data->nstr->ctypes = NULL;
+  data->nstr->flags =
+    GRN_STR_REMOVEBLANK |
+    GRN_STR_WITH_TYPES |
+    GRN_STR_WITH_CHECKS;
+  utf8_nfkc_normalize_short(data->context, data->nstr);
+}
+
+static void
+bench_bundle_unsigned_char(gpointer user_data)
+{
+  BenchmarkData *data = user_data;
+  grn_ctx *ctx = data->context;
+  data->nstr = GRN_MALLOC(sizeof(grn_str));
+  data->nstr->orig = text;
+  data->nstr->orig_blen = text_length;
+  data->nstr->checks = NULL;
+  data->nstr->ctypes = NULL;
+  data->nstr->flags =
+    GRN_STR_REMOVEBLANK |
+    GRN_STR_WITH_TYPES |
+    GRN_STR_WITH_CHECKS;
+  utf8_nfkc_normalize_unsigned_char(data->context, data->nstr);
+}
+
+static void
+bench_bundle_local(gpointer user_data)
 {
   BenchmarkData *data = user_data;
   grn_ctx *ctx = data->context;
@@ -299,7 +768,7 @@ bench_bundle(gpointer user_data)
     GRN_STR_REMOVEBLANK |
     GRN_STR_WITH_TYPES |
     GRN_STR_WITH_CHECKS;
-  utf8_nfkc_normalize(data->context, data->nstr);
+  utf8_nfkc_normalize_local(data->context, data->nstr);
 }
 
 static void
@@ -383,8 +852,16 @@ main(int argc, gchar **argv)
                           bench_ ## type,                               \
                           bench_teardown,                               \
                           &data)
-  REGISTER("NFKC: plugin", plugin);
-  REGISTER("NFKC: bundle", bundle);
+  REGISTER("1st: NFKC: plugin                ", plugin);
+  REGISTER("2nd: NFKC: plugin                ", plugin);
+  REGISTER("1st: NFKC: bundle (original)     ", bundle_original);
+  REGISTER("2nd: NFKC: bundle (original)     ", bundle_original);
+  REGISTER("1st: NFKC: bundle (short)        ", bundle_short);
+  REGISTER("2nd: NFKC: bundle (short)        ", bundle_short);
+  REGISTER("1st: NFKC: bundle (unsigned char)", bundle_unsigned_char);
+  REGISTER("2nd: NFKC: bundle (unsigned char)", bundle_unsigned_char);
+  REGISTER("1st: NFKC: bundle (local)        ", bundle_local);
+  REGISTER("2nd: NFKC: bundle (local)        ", bundle_local);
 #undef REGISTER
 
   bench_reporter_run(reporter);




Groonga-commit メーリングリストの案内
Back to archive index