• R/O
  • HTTP
  • SSH
  • HTTPS

提交

Frequently used words (click to add to your profile)

java, c++, android, linux, c#, windows, objective-c, cocoa, 誰得, qt, python, php, ruby, game, gui, bathyscaphe, c, 計画中(planning stage), 翻訳, omegat, framework, twitter, dom, test, vb.net, directx, ゲームエンジン, btron, arduino, previewer

sfjplib for python


Commit MetaInfo

修訂: 2c4d4f2583be270bfc4e4652264f99ec50fb2202 (tree)
時間: 2011-07-28 20:00:14
作者: Hiromichi MATSUSHIMA <hirom@offi...>
Committer: Hiromichi MATSUSHIMA

Log Message

rewrite form_retriver with htmltree

Change Summary

差異

--- a/form_retriver.py
+++ b/form_retriver.py
@@ -4,126 +4,83 @@
44
55 import HTMLParser
66 import re
7+import htmltree
78
8-class Form(dict):
9- def __init__(self, action, method):
10- self.action = action
11- self.method = method
12-
13-class FormItem(object):
14- def __init__(self, name, attrs, value):
15- self.name = name
16- self.attrs = attrs
17- self.value = value
18-
19-class FormRetriver(HTMLParser.HTMLParser):
20- UNCLOSE = ("input")
9+class Form(list):
10+ def __init__(self):
11+ self.elements = []
12+ self.action = None
13+ self.target = None
14+ self.enctype = None
15+ self.method = None
2116
17+class FormRetriver(object):
2218 def __init__(self):
2319 "Constructor"
24- HTMLParser.HTMLParser.__init__(self)
25- self._current_form = None
26- self._stack = ""
27- self._capture = False
28- self._current_element = None
29- self.forms = []
20+ self._forms = []
3021
3122 def parse(self, data):
32- self.feed(data)
33-
34- # Handlers
35- def handle_starttag(self, tag, attrs):
36- if tag == "form":
37- attr = dict(attrs)
38- action = attr.get("action", "")
39- method = attr.get("method", "")
40- f = Form(action, method)
41- self._current_form = f
42- elif tag == "input":
43- self.handle_startendtag(tag, attrs)
44- elif tag == "textarea":
45- self._stack = ""
46- self._capture = True
47- e = FormItem(tag, attrs, "")
48- self._current_element = e
49- elif tag == "option":
50- attr = dict(attrs)
51- val = attr.get("value", "")
52- e = FormItem(tag, attrs, val)
53- if "selected" in attr:
54- self._current_element.value = val
55- if "SELECTED" in attr:
56- self._current_element.value = val
57- elif tag == "select":
58- e = FormItem(tag, attrs, "")
59- self._current_element = e
60-
61- def handle_endtag(self, tag):
62- if tag == "textarea":
63- text = self._stack
64- self._stack = ""
65- self._capture = False
66- e = self._current_element
67- attr = dict(e.attrs)
68- if "NAME" in attr:
69- name = attr["NAME"]
70- elif "name" in attr:
71- name = attr["name"]
72- else:
73- name = ""
74- if name:
75- e.value = text
76- self._current_form[name] = e
77- elif tag == "form":
78- self.forms.append(self._current_form)
79- elif tag == "select":
80- e = self._current_element
81- attr = dict(e.attrs)
82- if "NAME" in attr:
83- name = attr["NAME"]
84- elif "name" in attr:
85- name = attr["name"]
86- else:
87- name = ""
88- if name:
89- self._current_form[name] = e
90-
91- def handle_data(self, data):
92- if self._capture:
93- self._stack = self._stack + data
23+ tree = htmltree.parse(data)
24+ r = tree.root()
25+ forms = r.get_elements_by_name("form")
26+ for f in forms:
27+ self._forms.append(self.form_parse(f))
9428
95- def handle_startendtag(self, tag, attrs):
96- if tag == "input":
97- attr = dict(attrs)
98- e = FormItem(tag, attrs, "")
99- if "NAME" in attr:
100- name = attr["NAME"]
101- elif "name" in attr:
102- name = attr["name"]
103- else:
104- name = ""
105- if "VALUE" in attr:
106- e.value = attr["VALUE"]
107- if "value" in attr:
108- e.value = attr["value"]
109-
110- if name:
111- self._current_form[name] = e
29+ def convert_ref(self, text):
30+ f = lambda x: self._convert_ref(x)
31+ return re.sub(r"&(\w+);", f, text)
11232
113- def handle_entityref(self, name):
114- if self._capture:
115- self._stack = self._stack + self.convert_ref(name)
116-
117- def convert_ref(self, name):
33+ def _convert_ref(self, m):
11834 conv_dict = dict(
11935 lt="<",
12036 gt=">",
12137 amp="&",
122- quot="'"
38+ quot="'",
39+ nbsp=" ",
12340 )
124- return conv_dict[name]
41+ return conv_dict[m.group(1)]
42+
43+ def form_parse(self, elem):
44+ f = Form()
45+ f.action = elem.attr("action")
46+ f.target = elem.attr("target")
47+ f.enctype = elem.attr("enctype")
48+ f.method = elem.attr("method")
49+ self._r_form_parse(elem, f)
50+ return f
51+
52+ def _r_form_parse(self, elem, f):
53+ for e in elem:
54+ if e.name == "input":
55+ if e.attr("name") == None:
56+ continue
57+ f.append((e.attr("name"), e.attr("value")))
58+ f.elements.append(e)
59+ elif e.name == "textarea":
60+ if e.attr("name") == None:
61+ continue
62+ t = e.inner_html().encode("utf-8")
63+ t = self.convert_ref(t)
64+ f.append((e.attr("name"), t))
65+ f.elements.append(e)
66+ elif e.name == "select":
67+ if e.attr("name") == None:
68+ continue
69+ name = e.attr("name")
70+ for opt in e:
71+ if opt.has_attribute("selected"):
72+ f.append((name, opt.attr("value")))
73+ f.elements.append(opt)
74+ elif e.name == "button":
75+ if e.attr("name") == None:
76+ continue
77+ f.append((e.attr("name"), e.attr("value")))
78+ f.elements.append(e)
79+ else:
80+ self._r_form_parse(e, f)
81+
12582
126- def handle_charref(self, ref):
127- if self._capture:
128- self._stack = self._stack + "&#" + ref + ";"
83+ def forms(self):
84+ return self._forms
12985
86+
--- a/test_form_retriver.py
+++ b/test_form_retriver.py
@@ -11,10 +11,10 @@ if __name__ == "__main__":
1111
1212 f = form_retriver.FormRetriver()
1313 f.parse(html)
14- for form in f.forms:
15- print form.action, form.method, ":"
16- for name in form:
17- print name, ":", form[name].value
14+ for form in f.forms():
15+ print "\n", form.action, ":"
16+ for (key, val) in form:
17+ print key, val
1818
1919
2020