Kouhei Sutou 2019-03-01 10:25:53 +0900 (Fri, 01 Mar 2019) Revision: 477e00019f03f88b149ab5c46a828c497e662cb9 https://github.com/ranguba/chupa-text/commit/477e00019f03f88b149ab5c46a828c497e662cb9 Message: Improve UTF-8 conversion Added files: lib/chupa-text/utf8-converter.rb Modified files: lib/chupa-text/data.rb lib/chupa-text/extractor.rb test/test-extractor.rb Modified: lib/chupa-text/data.rb (+15 -1) =================================================================== --- lib/chupa-text/data.rb 2019-02-28 15:14:11 +0900 (15ba6f7) +++ lib/chupa-text/data.rb 2019-03-01 10:25:53 +0900 (dc61947) @@ -1,4 +1,4 @@ -# Copyright (C) 2013-2017 Kouhei Sutou <kou****@clear*****> +# Copyright (C) 2013-2019 Kouhei Sutou <kou****@clear*****> # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -18,6 +18,8 @@ require "cgi/util" require "uri" require "open-uri" +require "chupa-text/utf8-converter" + module ChupaText class Data # @return [URI, nil] The URI of the data if the data is for remote @@ -190,6 +192,18 @@ module ChupaText @need_screenshot end + def to_utf8_body_data + b = body + return self if b.nil? + converter = UTF8Converter.new(b) + utf8_body = converter.convert + if b.equal?(utf8_body) + self + else + TextData.new(utf8_body, source_data: self) + end + end + private def guess_mime_type guess_mime_type_from_uri or Modified: lib/chupa-text/extractor.rb (+3 -86) =================================================================== --- lib/chupa-text/extractor.rb 2019-02-28 15:14:11 +0900 (115f1f7) +++ lib/chupa-text/extractor.rb 2019-03-01 10:25:53 +0900 (920b7d9) @@ -1,4 +1,4 @@ -# Copyright (C) 2013-2017 Kouhei Sutou <kou****@clear*****> +# Copyright (C) 2013-2019 Kouhei Sutou <kou****@clear*****> # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -68,12 +68,12 @@ module ChupaText if decomposer.nil? 
if target.text_plain? debug {"#{log_tag}[extract][text-plain]"} - yield(ensure_utf8_body_data(target)) + yield(target.to_utf8_body_data) next else debug {"#{log_tag}[extract][decomposer] not found"} if target.text? - yield(ensure_utf8_body_data(target)) + yield(target.to_utf8_body_data) end next end @@ -100,89 +100,6 @@ module ChupaText end end - def ensure_utf8_body_data(data) - body = data.body - return dat if body.nil? - - encoding = body.encoding - case encoding - when Encoding::UTF_8 - bom_size, bom_encoding = detect_bom(body) - if bom_size - body_without_bom = body.byteslice(bom_size, - body.byteslice - bom_size) - return TextData.new(body_without_bom, source_data: data) - else - return data - end - when Encoding::ASCII_8BIT - return data if body.ascii_only? - else - utf8_body = body.encode(Encoding::UTF_8, - invalid: :replace, - undef: :replace, - replace: "") - return TextData.new(utf8_body, source_data: data) - end - - bom_size, bom_encoding = detect_bom(body) - if bom_encoding - body_without_bom = body.byteslice(bom_size, body.bytesize - bom_size) - utf8_body = body_without_bom.encode(Encoding::UTF_8, - bom_encoding, - invalid: :replace, - undef: :replace, - replace: "") - return TextData.new(utf8_body, source_data: data) - end - - candidates = [ - Encoding::UTF_8, - Encoding::EUC_JP, - Encoding::Windows_31J, - ] - candidates.each do |candidate| - body.force_encoding(candidate) - if body.valid_encoding? 
- utf8_body = body.encode(Encoding::UTF_8, - invalid: :replace, - undef: :replace, - replace: "") - return TextData.new(utf8_body, source_data: data) - end - end - body.force_encoding(encoding) - data - end - - UTF_8_BOM = "\xef\xbb\xbf".b - UTF_16BE_BOM = "\xfe\xff".b - UTF_16LE_BOM = "\xff\xfe".b - UTF_32BE_BOM = "\x00\x00\xfe\xff".b - UTF_32LE_BOM = "\xff\xfe\x00\x00".b - def detect_bom(text) - case text.byteslice(0, 4).b - when UTF_32BE_BOM - return 4, Encoding::UTF_32BE - when UTF_32LE_BOM - return 4, Encoding::UTF_32LE - end - - case text.byteslice(0, 3).b - when UTF_8_BOM - return 3, Encoding::UTF_8 - end - - case text.byteslice(0, 2).b - when UTF_16BE_BOM - return 2, Encoding::UTF_16BE - when UTF_16LE_BOM - return 2, Encoding::UTF_16LE - end - - nil - end - def find_decomposer(data) candidates = [] @decomposers.each do |decomposer| Added: lib/chupa-text/utf8-converter.rb (+117 -0) 100644 =================================================================== --- /dev/null +++ lib/chupa-text/utf8-converter.rb 2019-03-01 10:25:53 +0900 (ca72e90) @@ -0,0 +1,117 @@ +# Copyright (C) 2019 Kouhei Sutou <kou****@clear*****> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +module ChupaText + class UTF8Converter + def initialize(string) + @string = string + end + + def convert + encoding = @string.encoding + case encoding + when Encoding::UTF_8 + bom_size, bom_encoding = detect_bom + if bom_size + return @string.byteslice(bom_size, + @string.bytesize - bom_size) + else + return @string + end + when Encoding::ASCII_8BIT + return @string if @string.ascii_only? + else + return @string.encode(Encoding::UTF_8, + invalid: :replace, + undef: :replace, + replace: "") + end + + bom_size, bom_encoding = detect_bom + if bom_encoding + string_without_bom = @string.byteslice(bom_size, + @string.bytesize - bom_size) + return string_without_bom.encode(Encoding::UTF_8, + bom_encoding, + invalid: :replace, + undef: :replace, + replace: "") + end + + guessed_encoding = guess_encoding + if guessed_encoding + @string.encode(Encoding::UTF_8, + guessed_encoding, + invalid: :replace, + undef: :replace, + replace: "") + else + utf8_body = @string.dup + utf8_body.force_encoding(Encoding::UTF_8) + utf8_body.scrub!("") + utf8_body.gsub!(/\p{Control}+/, "") + utf8_body + end + end + + private + UTF_8_BOM = "\xef\xbb\xbf".b + UTF_16BE_BOM = "\xfe\xff".b + UTF_16LE_BOM = "\xff\xfe".b + UTF_32BE_BOM = "\x00\x00\xfe\xff".b + UTF_32LE_BOM = "\xff\xfe\x00\x00".b + def detect_bom + case @string.byteslice(0, 4).b + when UTF_32BE_BOM + return 4, Encoding::UTF_32BE + when UTF_32LE_BOM + return 4, Encoding::UTF_32LE + end + + case @string.byteslice(0, 3).b + when UTF_8_BOM + return 3, Encoding::UTF_8 + end + + case @string.byteslice(0, 2).b + when UTF_16BE_BOM + return 2, Encoding::UTF_16BE + when UTF_16LE_BOM + return 2, Encoding::UTF_16LE + end + + nil + end + + def guess_encoding + original_encoding = @string.encoding + begin + candidates = [ + Encoding::UTF_8, + 
Encoding::EUC_JP, + Encoding::Windows_31J, + ] + candidates.each do |candidate| + @string.force_encoding(candidate) + return candidate if @string.valid_encoding? + end + nil + ensure + @string.force_encoding(original_encoding) + end + end + end +end Modified: test/test-extractor.rb (+7 -0) =================================================================== --- test/test-extractor.rb 2019-02-28 15:14:11 +0900 (050d38d) +++ test/test-extractor.rb 2019-03-01 10:25:53 +0900 (0598f9f) @@ -137,6 +137,13 @@ class TestExtractor < Test::Unit::TestCase assert_equal(["こんにちは"], extract(data)) end + def test_utf8_broken + data = ChupaText::Data.new + data.mime_type = "text/plain" + data.body = "\x82\x00こんにちは".b + assert_equal(["こんにちは"], extract(data)) + end + def test_utf16_le data = ChupaText::Data.new data.mime_type = "text/plain" -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190301/490beda4/attachment-0001.html>