ranguba/chupa-text at 477e000 [master] Improve UTF-8 conversion (Groonga-commit) - Groonga - fulltext search engine.

Kouhei Sutou	2019-03-01 10:25:53 +0900 (Fri, 01 Mar 2019)

  Revision: 477e00019f03f88b149ab5c46a828c497e662cb9
  https://github.com/ranguba/chupa-text/commit/477e00019f03f88b149ab5c46a828c497e662cb9

  Message:
    Improve UTF-8 conversion

  Added files:
    lib/chupa-text/utf8-converter.rb
  Modified files:
    lib/chupa-text/data.rb
    lib/chupa-text/extractor.rb
    test/test-extractor.rb

  Modified: lib/chupa-text/data.rb (+15 -1)
===================================================================

--- lib/chupa-text/data.rb    2019-02-28 15:14:11 +0900 (15ba6f7)
+++ lib/chupa-text/data.rb    2019-03-01 10:25:53 +0900 (dc61947)
@@ -1,4 +1,4 @@
-# Copyright (C) 2013-2017  Kouhei Sutou <kou****@clear*****>
+# Copyright (C) 2013-2019  Kouhei Sutou <kou****@clear*****>
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -18,6 +18,8 @@ require "cgi/util"
 require "uri"
 require "open-uri"
 
+require "chupa-text/utf8-converter"
+
 module ChupaText
   class Data
     # @return [URI, nil] The URI of the data if the data is for remote
@@ -190,6 +192,18 @@ module ChupaText
       @need_screenshot
     end
 
+    def to_utf8_body_data
+      b = body
+      return self if b.nil?
+      converter = UTF8Converter.new(b)
+      utf8_body = converter.convert
+      if b.equal?(utf8_body)
+        self
+      else
+        TextData.new(utf8_body, source_data: self)
+      end
+    end
+
     private
     def guess_mime_type
       guess_mime_type_from_uri or

  Modified: lib/chupa-text/extractor.rb (+3 -86)
===================================================================
--- lib/chupa-text/extractor.rb    2019-02-28 15:14:11 +0900 (115f1f7)
+++ lib/chupa-text/extractor.rb    2019-03-01 10:25:53 +0900 (920b7d9)
@@ -1,4 +1,4 @@
-# Copyright (C) 2013-2017  Kouhei Sutou <kou****@clear*****>
+# Copyright (C) 2013-2019  Kouhei Sutou <kou****@clear*****>
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -68,12 +68,12 @@ module ChupaText
         if decomposer.nil?
           if target.text_plain?
             debug {"#{log_tag}[extract][text-plain]"}
-            yield(ensure_utf8_body_data(target))
+            yield(target.to_utf8_body_data)
             next
           else
             debug {"#{log_tag}[extract][decomposer] not found"}
             if target.text?
-              yield(ensure_utf8_body_data(target))
+              yield(target.to_utf8_body_data)
             end
             next
           end
@@ -100,89 +100,6 @@ module ChupaText
       end
     end
 
-    def ensure_utf8_body_data(data)
-      body = data.body
-      return dat if body.nil?
-
-      encoding = body.encoding
-      case encoding
-      when Encoding::UTF_8
-        bom_size, bom_encoding = detect_bom(body)
-        if bom_size
-          body_without_bom = body.byteslice(bom_size,
-                                            body.byteslice - bom_size)
-          return TextData.new(body_without_bom, source_data: data)
-        else
-          return data
-        end
-      when Encoding::ASCII_8BIT
-        return data if body.ascii_only?
-      else
-        utf8_body = body.encode(Encoding::UTF_8,
-                                invalid: :replace,
-                                undef: :replace,
-                                replace: "")
-        return TextData.new(utf8_body, source_data: data)
-      end
-
-      bom_size, bom_encoding = detect_bom(body)
-      if bom_encoding
-        body_without_bom = body.byteslice(bom_size, body.bytesize - bom_size)
-        utf8_body = body_without_bom.encode(Encoding::UTF_8,
-                                            bom_encoding,
-                                            invalid: :replace,
-                                            undef: :replace,
-                                            replace: "")
-        return TextData.new(utf8_body, source_data: data)
-      end
-
-      candidates = [
-        Encoding::UTF_8,
-        Encoding::EUC_JP,
-        Encoding::Windows_31J,
-      ]
-      candidates.each do |candidate|
-        body.force_encoding(candidate)
-        if body.valid_encoding?
-          utf8_body = body.encode(Encoding::UTF_8,
-                                  invalid: :replace,
-                                  undef: :replace,
-                                  replace: "")
-          return TextData.new(utf8_body, source_data: data)
-        end
-      end
-      body.force_encoding(encoding)
-      data
-    end
-
-    UTF_8_BOM = "\xef\xbb\xbf".b
-    UTF_16BE_BOM = "\xfe\xff".b
-    UTF_16LE_BOM = "\xff\xfe".b
-    UTF_32BE_BOM = "\x00\x00\xfe\xff".b
-    UTF_32LE_BOM = "\xff\xfe\x00\x00".b
-    def detect_bom(text)
-      case text.byteslice(0, 4).b
-      when UTF_32BE_BOM
-        return 4, Encoding::UTF_32BE
-      when UTF_32LE_BOM
-        return 4, Encoding::UTF_32LE
-      end
-
-      case text.byteslice(0, 3).b
-      when UTF_8_BOM
-        return 3, Encoding::UTF_8
-      end
-
-      case text.byteslice(0, 2).b
-      when UTF_16BE_BOM
-        return 2, Encoding::UTF_16BE
-      when UTF_16LE_BOM
-        return 2, Encoding::UTF_16LE
-      end
-
-      nil
-    end
-
     def find_decomposer(data)
       candidates = []
       @decomposers.each do |decomposer|

  Added: lib/chupa-text/utf8-converter.rb (+117 -0) 100644
===================================================================
--- /dev/null
+++ lib/chupa-text/utf8-converter.rb    2019-03-01 10:25:53 +0900 (ca72e90)
@@ -0,0 +1,117 @@
+# Copyright (C) 2019  Kouhei Sutou <kou****@clear*****>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+module ChupaText
+  class UTF8Converter
+    def initialize(string)
+      @string = string
+    end
+
+    def convert
+      encoding = @string.encoding
+      case encoding
+      when Encoding::UTF_8
+        bom_size, bom_encoding = detect_bom
+        if bom_size
+          return @string.byteslice(bom_size,
+                                   @string.bytesize - bom_size)
+        else
+          return @string
+        end
+      when Encoding::ASCII_8BIT
+        return @string if @string.ascii_only?
+      else
+        return @string.encode(Encoding::UTF_8,
+                              invalid: :replace,
+                              undef: :replace,
+                              replace: "")
+      end
+
+      bom_size, bom_encoding = detect_bom
+      if bom_encoding
+        string_without_bom = @string.byteslice(bom_size,
+                                               @string.bytesize - bom_size)
+        return string_without_bom.encode(Encoding::UTF_8,
+                                         bom_encoding,
+                                         invalid: :replace,
+                                         undef: :replace,
+                                         replace: "")
+      end
+
+      guessed_encoding = guess_encoding
+      if guessed_encoding
+        @string.encode(Encoding::UTF_8,
+                       guessed_encoding,
+                       invalid: :replace,
+                       undef: :replace,
+                       replace: "")
+      else
+        utf8_body = @string.dup
+        utf8_body.force_encoding(Encoding::UTF_8)
+        utf8_body.scrub!("")
+        utf8_body.gsub!(/\p{Control}+/, "")
+        utf8_body
+      end
+    end
+
+    private
+    UTF_8_BOM = "\xef\xbb\xbf".b
+    UTF_16BE_BOM = "\xfe\xff".b
+    UTF_16LE_BOM = "\xff\xfe".b
+    UTF_32BE_BOM = "\x00\x00\xfe\xff".b
+    UTF_32LE_BOM = "\xff\xfe\x00\x00".b
+    def detect_bom
+      case @string.byteslice(0, 4).b
+      when UTF_32BE_BOM
+        return 4, Encoding::UTF_32BE
+      when UTF_32LE_BOM
+        return 4, Encoding::UTF_32LE
+      end
+
+      case @string.byteslice(0, 3).b
+      when UTF_8_BOM
+        return 3, Encoding::UTF_8
+      end
+
+      case @string.byteslice(0, 2).b
+      when UTF_16BE_BOM
+        return 2, Encoding::UTF_16BE
+      when UTF_16LE_BOM
+        return 2, Encoding::UTF_16LE
+      end
+
+      nil
+    end
+
+    def guess_encoding
+      original_encoding = @string.encoding
+      begin
+        candidates = [
+          Encoding::UTF_8,
+          Encoding::EUC_JP,
+          Encoding::Windows_31J,
+        ]
+        candidates.each do |candidate|
+          @string.force_encoding(candidate)
+          return candidate if @string.valid_encoding?
+        end
+        nil
+      ensure
+        @string.force_encoding(original_encoding)
+      end
+    end
+  end
+end

  Modified: test/test-extractor.rb (+7 -0)
===================================================================
--- test/test-extractor.rb    2019-02-28 15:14:11 +0900 (050d38d)
+++ test/test-extractor.rb    2019-03-01 10:25:53 +0900 (0598f9f)
@@ -137,6 +137,13 @@ class TestExtractor < Test::Unit::TestCase
         assert_equal(["こんにちは"], extract(data))
       end
 
+      def test_utf8_broken
+        data = ChupaText::Data.new
+        data.mime_type = "text/plain"
+        data.body = "\x82\x00こんにちは".b
+        assert_equal(["こんにちは"], extract(data))
+      end
+
       def test_utf16_le
         data = ChupaText::Data.new
         data.mime_type = "text/plain"
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190301/490beda4/attachment-0001.html>


Groonga - fulltext search engine.

[Groonga-commit] ranguba/chupa-text at 477e000 [master] Improve UTF-8 conversion