[Groonga-commit] ranguba/chupa-text-decomposer-html at c2bc94a [master] Ignore needless contents

Back to archive index

Kouhei Sutou null+****@clear*****
Wed Jul 5 16:57:43 JST 2017


Kouhei Sutou	2017-07-05 16:57:43 +0900 (Wed, 05 Jul 2017)

  New Revision: c2bc94ac549538f134706b66a721dc622bc2c19c
  https://github.com/ranguba/chupa-text-decomposer-html/commit/c2bc94ac549538f134706b66a721dc622bc2c19c

  Message:
    Ignore needless contents

  Modified files:
    lib/chupa-text/decomposers/html.rb
    test/test-html.rb

  Modified: lib/chupa-text/decomposers/html.rb (+78 -1)
===================================================================
--- lib/chupa-text/decomposers/html.rb    2017-07-05 16:30:52 +0900 (b23c991)
+++ lib/chupa-text/decomposers/html.rb    2017-07-05 16:57:43 +0900 (b753dc5)
@@ -45,7 +45,7 @@ module ChupaText
         doc = Nokogiri::HTML.parse(html, nil, guess_encoding(html))
         body_element = (doc % "body")
         if body_element
-          body = body_element.text.scrub.gsub(/^\s+|\s+$/, '')
+          body = extract_text(body_element, "").scrub.gsub(/^\s+|\s+$/, '')
         else
           body = ""
         end
@@ -104,6 +104,83 @@ module ChupaText
       def guess_encoding_nkf(text)
         NKF.guess(text).name
       end
+
+      def extract_text(element, text)
+        name = element.name.downcase
+        classes = (element["class"] || "").split
+        return text if noindex_element?(element, name, classes)
+        return text if header_element?(element, name, classes)
+        return text if footer_element?(element, name, classes)
+
+        element.children.each do |child|
+          case child
+          when Nokogiri::XML::Text
+            text << child.text
+          when Nokogiri::XML::Element
+            extract_text(child, text)
+          end
+        end
+
+        text
+      end
+
+      def noindex_element?(element, name, classes)
+        case name
+        when "script", "noscript", "link", "style"
+          return true
+        end
+
+        classes.each do |klass|
+          case klass
+          when "noindex", "robots-noindex"
+            return true
+          end
+        end
+
+        false
+      end
+
+      def header_element?(element, name, classes)
+        case name
+        when "header", "nav"
+          return true
+        end
+
+        classes.each do |klass|
+          case klass
+          when "header"
+            return true
+          end
+        end
+
+        case element["id"]
+        when "header"
+          return true
+        end
+
+        false
+      end
+
+      def footer_element?(element, name, classes)
+        case name
+        when "footer"
+          return true
+        end
+
+        classes.each do |klass|
+          case klass
+          when "footer"
+            return true
+          end
+        end
+
+        case element["id"]
+        when "footer"
+          return true
+        end
+
+        false
+      end
     end
   end
 end

  Modified: test/test-html.rb (+132 -0)
===================================================================
--- test/test-html.rb    2017-07-05 16:30:52 +0900 (b5b24a7)
+++ test/test-html.rb    2017-07-05 16:57:43 +0900 (1477677)
@@ -273,5 +273,137 @@ class TestHTML < Test::Unit::TestCase
         end
       end
     end
+
+    sub_test_case("body") do
+      def normalize_decomposed_data(decomposed_data)
+        decomposed_data.body
+      end
+
+      sub_test_case("noindex") do
+        def test_script
+          @data.body = <<-HTML
+<html>
+  <body>Before<script>var x;</script>After</body>
+</html>
+          HTML
+          assert_equal(["BeforeAfter"],
+                       decompose(@data))
+        end
+
+        def test_noscript
+          @data.body = <<-HTML
+<html>
+  <body>Before<noscript>Enable JavaScript!</noscript>After</body>
+</html>
+          HTML
+          assert_equal(["BeforeAfter"],
+                       decompose(@data))
+        end
+
+        def test_link
+          @data.body = <<-HTML
+<html>
+  <body>Before<link rel="stylehseet">After</body>
+</html>
+          HTML
+          assert_equal(["BeforeAfter"],
+                       decompose(@data))
+        end
+
+        def test_style
+          @data.body = <<-HTML
+<html>
+  <body>Before<style>a {color: "red";}</style>After</body>
+</html>
+          HTML
+          assert_equal(["BeforeAfter"],
+                       decompose(@data))
+        end
+
+        def test_noindex
+          @data.body = <<-HTML
+<html>
+  <body>Before<div class="noindex">header</div>After</body>
+</html>
+          HTML
+          assert_equal(["BeforeAfter"],
+                       decompose(@data))
+        end
+
+        def test_robots_noindex
+          @data.body = <<-HTML
+<html>
+  <body>Before<div class="robots-noindex">header</div>After</body>
+</html>
+          HTML
+          assert_equal(["BeforeAfter"],
+                       decompose(@data))
+        end
+      end
+
+      sub_test_case("header") do
+        def test_tag
+          @data.body = <<-HTML
+<html>
+  <body>Before<header>header</header>After</body>
+</html>
+          HTML
+          assert_equal(["BeforeAfter"],
+                       decompose(@data))
+        end
+
+        def test_class
+          @data.body = <<-HTML
+<html>
+  <body>Before<div class="header">header</div>After</body>
+</html>
+          HTML
+          assert_equal(["BeforeAfter"],
+                       decompose(@data))
+        end
+
+        def test_id
+          @data.body = <<-HTML
+<html>
+  <body>Before<div id="header">header</div>After</body>
+</html>
+          HTML
+          assert_equal(["BeforeAfter"],
+                       decompose(@data))
+        end
+      end
+
+      sub_test_case("footer") do
+        def test_tag
+          @data.body = <<-HTML
+<html>
+  <body>Before<footer>footer</footer>After</body>
+</html>
+          HTML
+          assert_equal(["BeforeAfter"],
+                       decompose(@data))
+        end
+
+        def test_class
+          @data.body = <<-HTML
+<html>
+  <body>Before<div class="footer">footer</div>After</body>
+</html>
+          HTML
+          assert_equal(["BeforeAfter"],
+                       decompose(@data))
+        end
+
+        def test_id
+          @data.body = <<-HTML
+<html>
+  <body>Before<div id="footer">footer</div>After</body>
+</html>
+          HTML
+          assert_equal(["BeforeAfter"],
+                       decompose(@data))
+        end
+      end
+    end
   end
 end
-------------- next part --------------
HTMLの添付ファイルを保管しました...
下載 



More information about the Groonga-commit mailing list
Back to archive index