[Groonga-commit] groonga/groonga [master] add TokenKytea.

Back to archive index

null+****@clear***** null+****@clear*****
2012年 2月 17日 (金) 17:05:22 JST


Susumu Yata	2012-02-17 17:05:22 +0900 (Fri, 17 Feb 2012)

  New Revision: 38799bac4d5a55c0ce820a3daaef89b03299edb3

  Log:
    add TokenKytea.

  Added files:
    plugins/tokenizers/kytea.cpp
  Modified files:
    plugins/tokenizers/Makefile.am

  Modified: plugins/tokenizers/Makefile.am (+8 -0)
===================================================================
--- plugins/tokenizers/Makefile.am    2012-02-16 16:40:42 +0900 (898db4b)
+++ plugins/tokenizers/Makefile.am    2012-02-17 17:05:22 +0900 (b94c035)
@@ -15,8 +15,16 @@ tokenizer_plugins_LTLIBRARIES =
 if WITH_MECAB
 tokenizer_plugins_LTLIBRARIES += mecab.la
 endif
+if WITH_KYTEA
+tokenizer_plugins_LTLIBRARIES += kytea.la
+endif
 
 mecab_la_CPPFLAGS = $(MECAB_CPPFLAGS)
 mecab_la_SOURCES = mecab.c
 mecab_la_LIBADD = $(LIBS) $(MECAB_LIBS)
 mecab_la_LDFLAGS = $(AM_LDFLAGS) $(MECAB_LDFLAGS)
+
+kytea_la_CPPFLAGS = $(KYTEA_CPPFLAGS)
+kytea_la_SOURCES = kytea.cpp
+kytea_la_LIBADD = $(LIBS) $(KYTEA_LIBS)
+kytea_la_LDFLAGS = $(AM_LDFLAGS) $(KYTEA_LDFLAGS)

  Added: plugins/tokenizers/kytea.cpp (+290 -0) 100644
===================================================================
--- /dev/null
+++ plugins/tokenizers/kytea.cpp    2012-02-17 17:05:22 +0900 (efa2f7c)
@@ -0,0 +1,290 @@
+/* -*- c-basic-offset: 2 -*- */
+/* Copyright(C) 2012 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <groonga/tokenizer.h>
+
+#include <kytea/kytea.h>
+
+#include <string.h>
+
+#include <string>
+#include <vector>
+
+namespace {
+
+/*
+  Plugin-wide state. All four pointers are NULL until kytea_init() succeeds
+  and are reset to NULL by kytea_fin().
+ */
+/* Locked by grn_kytea_init() around the mapString()/calculateWS() calls. */
+grn_plugin_mutex *kytea_mutex = NULL;
+/* Configuration object; constructed in kytea_init(), destroyed in kytea_fin(). */
+kytea::KyteaConfig *kytea_config = NULL;
+/* Tagger object; its model file name is taken from kytea_config. */
+kytea::Kytea *kytea_tagger = NULL;
+/* Obtained from kytea_tagger->getStringUtil(); kytea_fin() only clears it. */
+kytea::StringUtil *kytea_util = NULL;
+
+/* kytea_init() calls kytea_fin() on its error paths, hence the declarations. */
+void kytea_init(grn_ctx *ctx);
+void kytea_fin(grn_ctx *ctx);
+
+/*
+  kytea_init() creates the state shared by all TokenKytea tokenizers: the
+  mutex, the KyTea configuration, the tagger (with its model loaded) and the
+  string utility taken from the tagger. If any step fails, everything created
+  so far is released through kytea_fin() and an error is reported on `ctx'.
+ */
+void kytea_init(grn_ctx *ctx) {
+  const bool already_initialized =
+      (kytea_mutex != NULL) || (kytea_config != NULL) ||
+      (kytea_tagger != NULL) || (kytea_util != NULL);
+  if (already_initialized) {
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] TokenKytea is already initialized");
+    return;
+  }
+
+  kytea_mutex = grn_plugin_mutex_create(ctx);
+  if (kytea_mutex == NULL) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer] grn_plugin_mutex_create() failed");
+    return;
+  }
+
+  /* Step 1: the configuration, built in plugin-allocated memory. */
+  kytea::KyteaConfig * const config = static_cast<kytea::KyteaConfig *>(
+      GRN_PLUGIN_MALLOC(ctx, sizeof(kytea::KyteaConfig)));
+  if (config == NULL) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer] memory allocation to kytea::KyteaConfig failed");
+    return;
+  }
+  try {
+    new (config) kytea::KyteaConfig;
+  } catch (...) {
+    /* The object was never constructed, so only the raw memory is freed. */
+    GRN_PLUGIN_FREE(ctx, config);
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] kytea::KyteaConfig initialization failed");
+    return;
+  }
+  /* From here on kytea_fin() is responsible for destroying the object. */
+  kytea_config = config;
+  try {
+    kytea_config->setDebug(0);
+    kytea_config->setOnTraining(false);
+    kytea_config->parseRunCommandLine(0, NULL);
+  } catch (...) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] kytea::KyteaConfig settings failed");
+    return;
+  }
+
+  /* Step 2: the tagger, built the same way, then its model is loaded. */
+  kytea::Kytea * const tagger = static_cast<kytea::Kytea *>(
+      GRN_PLUGIN_MALLOC(ctx, sizeof(kytea::Kytea)));
+  if (tagger == NULL) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer] memory allocation to kytea::Kytea failed");
+    return;
+  }
+  try {
+    new (tagger) kytea::Kytea;
+  } catch (...) {
+    GRN_PLUGIN_FREE(ctx, tagger);
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] kytea::Kytea initialization failed");
+    return;
+  }
+  kytea_tagger = tagger;
+  try {
+    kytea_tagger->readModel(kytea_config->getModelFile().c_str());
+  } catch (...) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] kytea::Kytea::readModel() failed");
+    return;
+  }
+
+  /* Step 3: the string utility handed out by the tagger. */
+  try {
+    kytea_util = kytea_tagger->getStringUtil();
+  } catch (...) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] kytea::Kytea::getStringUtil() failed");
+    return;
+  }
+}
+
+/*
+  kytea_fin() releases whatever kytea_init() managed to create and resets
+  the globals to NULL. Globals that are already NULL are skipped, so it can
+  be called after a partial initialization. The tagger is destroyed first,
+  then the configuration, then the mutex.
+ */
+void kytea_fin(grn_ctx *ctx) {
+  /* Only the pointer is cleared; this function never frees the utility. */
+  kytea_util = NULL;
+
+  kytea::Kytea * const tagger = kytea_tagger;
+  if (tagger != NULL) {
+    /* Built with placement new, so destroy explicitly before freeing. */
+    tagger->~Kytea();
+    GRN_PLUGIN_FREE(ctx, tagger);
+    kytea_tagger = NULL;
+  }
+
+  kytea::KyteaConfig * const config = kytea_config;
+  if (config != NULL) {
+    config->~KyteaConfig();
+    GRN_PLUGIN_FREE(ctx, config);
+    kytea_config = NULL;
+  }
+
+  grn_plugin_mutex * const mutex = kytea_mutex;
+  if (mutex != NULL) {
+    grn_plugin_mutex_destroy(ctx, mutex);
+    kytea_mutex = NULL;
+  }
+}
+
+/*
+  Per-tokenization state. It is filled in by grn_kytea_init(), consumed by
+  grn_kytea_next() and released by grn_kytea_fin().
+ */
+struct grn_tokenizer_kytea {
+  /* Query being tokenized; destroyed in grn_tokenizer_kytea_fin(). */
+  grn_tokenizer_query *query;
+  /* Sentence on which word segmentation is computed. */
+  kytea::KyteaSentence sentence;
+  /* Tokens kept after filtering in grn_kytea_init(). */
+  std::vector<std::string> tokens;
+  /* Index into `tokens' of the next token to hand out. */
+  std::size_t id;
+  /* Token object passed to grn_tokenizer_token_push(). */
+  grn_tokenizer_token token;
+
+  grn_tokenizer_kytea()
+      : query(NULL),
+        sentence(),
+        tokens(),
+        id(0),
+        token() {}
+  ~grn_tokenizer_kytea() {}
+};
+
+/*
+  Constructs a grn_tokenizer_kytea in the raw memory at `tokenizer' and
+  initializes its token member.
+ */
+void grn_tokenizer_kytea_init(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) {
+  grn_tokenizer_kytea * const object = new (tokenizer) grn_tokenizer_kytea;
+  grn_tokenizer_token_init(ctx, &object->token);
+}
+
+/*
+  Finalizes the token member, destroys the attached query (if any) and runs
+  the destructor. The memory itself is left for the caller to free.
+ */
+void grn_tokenizer_kytea_fin(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) {
+  grn_tokenizer_token_fin(ctx, &tokenizer->token);
+
+  grn_tokenizer_query * const attached_query = tokenizer->query;
+  if (attached_query != NULL) {
+    grn_tokenizer_query_destroy(ctx, attached_query);
+  }
+
+  tokenizer->~grn_tokenizer_kytea();
+}
+
+/*
+  grn_kytea_init() prepares a tokenizer for one query: it creates the query
+  object, runs KyTea word segmentation on the query text and stores the
+  resulting tokens. A token is kept only if it contains no character for
+  which grn_tokenizer_isspace() reports a space and every character has a
+  non-zero length.
+
+  On success the tokenizer is stored in `user_data->ptr'. On failure an
+  error is reported on `ctx', everything allocated here is released and
+  `user_data->ptr' is left untouched. The function always returns NULL.
+ */
+grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args,
+                        grn_user_data *user_data) {
+  grn_tokenizer_query * const query =
+      grn_tokenizer_query_create(ctx, num_args, args);
+  if (query == NULL) {
+    return NULL;
+  }
+
+  grn_tokenizer_kytea * const tokenizer = static_cast<grn_tokenizer_kytea *>(
+      GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_kytea)));
+  if (tokenizer == NULL) {
+    grn_tokenizer_query_destroy(ctx, query);
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer] memory allocation to grn_tokenizer_kytea failed");
+    return NULL;
+  }
+  try {
+    grn_tokenizer_kytea_init(ctx, tokenizer);
+  } catch (...) {
+    /* Construction failed: release the raw memory as well as the query. */
+    GRN_PLUGIN_FREE(ctx, tokenizer);
+    grn_tokenizer_query_destroy(ctx, query);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] tokenizer initialization failed");
+    return NULL;
+  }
+  /* The tokenizer now owns the query; grn_tokenizer_kytea_fin() destroys it. */
+  tokenizer->query = query;
+
+  /* The shared tagger and string utility are used under the plugin mutex. */
+  grn_plugin_mutex_lock(ctx, kytea_mutex);
+  try {
+    const std::string str(query->ptr, query->length);
+    tokenizer->sentence = kytea::KyteaSentence(kytea_util->mapString(str));
+    kytea_tagger->calculateWS(tokenizer->sentence);
+  } catch (...) {
+    grn_plugin_mutex_unlock(ctx, kytea_mutex);
+    /* `user_data->ptr' is not set yet, so clean up here to avoid a leak. */
+    grn_tokenizer_kytea_fin(ctx, tokenizer);
+    GRN_PLUGIN_FREE(ctx, tokenizer);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] tokenization failed");
+    return NULL;
+  }
+  grn_plugin_mutex_unlock(ctx, kytea_mutex);
+
+  try {
+    for (std::size_t i = 0; i < tokenizer->sentence.words.size(); ++i) {
+      const std::string &token =
+          kytea_util->showString(tokenizer->sentence.words[i].surf);
+      const char *ptr = token.c_str();
+      unsigned int left = static_cast<unsigned int>(token.length());
+      /* Walk the token character by character; stop early at a space or
+         at a character whose length cannot be determined. */
+      while (left > 0) {
+        const int char_length =
+            grn_tokenizer_charlen(ctx, ptr, left, query->encoding);
+        if ((char_length == 0) ||
+            (grn_tokenizer_isspace(ctx, ptr, left, query->encoding) != 0)) {
+          break;
+        }
+        ptr += char_length;
+        left -= char_length;
+      }
+      /* Only tokens that were walked to the end are kept. */
+      if (left == 0) {
+        tokenizer->tokens.push_back(token);
+      }
+    }
+  } catch (...) {
+    /* Same cleanup as above: nobody else holds a pointer to the tokenizer. */
+    grn_tokenizer_kytea_fin(ctx, tokenizer);
+    GRN_PLUGIN_FREE(ctx, tokenizer);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] adjustment failed");
+    return NULL;
+  }
+
+  user_data->ptr = tokenizer;
+  return NULL;
+}
+
+/*
+  grn_kytea_next() pushes the next stored token and advances the cursor.
+  The status is GRN_TOKENIZER_CONTINUE while at least one more token follows
+  the pushed one and GRN_TOKENIZER_LAST otherwise. When no token is left, an
+  empty token is pushed. The function always returns NULL.
+ */
+grn_obj *grn_kytea_next(grn_ctx *ctx, int num_args, grn_obj **args,
+                        grn_user_data *user_data) {
+  grn_tokenizer_kytea * const tokenizer =
+      static_cast<grn_tokenizer_kytea *>(user_data->ptr);
+  const std::size_t num_tokens = tokenizer->tokens.size();
+  const std::size_t current = tokenizer->id;
+
+  grn_tokenizer_status status = GRN_TOKENIZER_LAST;
+  if ((current + 1) < num_tokens) {
+    status = GRN_TOKENIZER_CONTINUE;
+  }
+
+  if (current >= num_tokens) {
+    /* Nothing left to hand out. */
+    grn_tokenizer_token_push(ctx, &tokenizer->token, "", 0, status);
+    return NULL;
+  }
+
+  const std::string &token = tokenizer->tokens[current];
+  tokenizer->id = current + 1;
+  grn_tokenizer_token_push(ctx, &tokenizer->token,
+                           token.c_str(), token.length(), status);
+  return NULL;
+}
+
+/*
+  grn_kytea_fin() finalizes and frees the tokenizer stored in
+  `user_data->ptr'. A NULL pointer is ignored. The function always returns
+  NULL.
+ */
+grn_obj *grn_kytea_fin(grn_ctx *ctx, int num_args, grn_obj **args,
+                       grn_user_data *user_data) {
+  grn_tokenizer_kytea * const tokenizer =
+      static_cast<grn_tokenizer_kytea *>(user_data->ptr);
+  if (tokenizer == NULL) {
+    return NULL;
+  }
+  grn_tokenizer_kytea_fin(ctx, tokenizer);
+  GRN_PLUGIN_FREE(ctx, tokenizer);
+  return NULL;
+}
+
+}  // namespace
+
+extern "C" {
+
+/*
+  GRN_PLUGIN_INIT() initializes this plugin by running kytea_init(). The
+  result is whatever code ended up in `ctx->rc', so a failure must leave an
+  error code there.
+ */
+grn_rc GRN_PLUGIN_INIT(grn_ctx *ctx) {
+  kytea_init(ctx);
+  const grn_rc rc = ctx->rc;
+  return rc;
+}
+
+/*
+  GRN_PLUGIN_REGISTER() registers the tokenizer with the database associated
+  with `ctx' under the name "TokenKytea", together with the three functions
+  called for tokenization. The literal 10 is presumably the length of that
+  name, which it matches.
+ */
+grn_rc GRN_PLUGIN_REGISTER(grn_ctx *ctx) {
+  const grn_rc rc = grn_tokenizer_register(ctx, "TokenKytea", 10,
+                                           grn_kytea_init, grn_kytea_next,
+                                           grn_kytea_fin);
+  return rc;
+}
+
+/*
+  GRN_PLUGIN_FIN() finalizes the plugin that GRN_PLUGIN_INIT() initialized
+  by running kytea_fin(). It always reports GRN_SUCCESS.
+ */
+grn_rc GRN_PLUGIN_FIN(grn_ctx *ctx) {
+  kytea_fin(ctx);
+  const grn_rc rc = GRN_SUCCESS;
+  return rc;
+}
+
+}  // extern "C"




Groonga-commit メーリングリストの案内
Back to archive index