from html5lib import treebuilders HTMLParser from BeautifulSoup import

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#from html5lib import treebuilders, HTMLParser
from BeautifulSoup import BeautifulSoup
from random import choice
from urlparse import urljoin
import re
from copy import deepcopy
from whois.util import random_email, random_password
# Opening-tag probes; count_inputs() uses them to count form controls.
RE_TEXTAREA_START = re.compile(r'<textarea', re.I)
RE_INPUT_START = re.compile(r'<input', re.I)
RE_SELECT_START = re.compile(r'<select', re.I)

# One whole <form ...>...</form> element.  The body is ``.*?`` (not ``.+?``)
# so that an empty form does not swallow everything up to the *next* </form>.
RE_FORM_CHUNK = re.compile(r'(<form[^>]*>.*?</form>)', re.S | re.I)

# One complete <input ...> tag.
RE_INPUT_CHUNK = re.compile(r'(<input[^>]+>)', re.I | re.S)

# A <select>/<textarea> opening tag plus its content, cut at the closing tag,
# at the start of the next control, or at end of input (tolerates unclosed
# elements).  ``.*?`` lets an empty element match without consuming its own
# closing tag.
RE_SELECT_CHUNK = re.compile(
    r'(<select[^>]+>.*?)(?=</select>|<input|<select|<textarea|$)',
    re.I | re.S)
RE_TEXTAREA_CHUNK = re.compile(
    r'(<textarea[^>]+>.*?)(?=</textarea>|<input|<select|<textarea|$)',
    re.I | re.S)

# The opening <form ...> tag alone (its attributes are parsed from it).
RE_FORM_TAG = re.compile(r'<form[^>]*>', re.I | re.S)

# Group 1 is the textarea content; it may be empty.
RE_TEXTAREA_TAG = re.compile(
    r'<textarea[^>]*>(.*?)(?:</textarea|$)', re.I | re.S)

# Group 1 is the visible text of an <option>; group 0 also includes the tag.
RE_OPTION_TAG = re.compile(r'<option[^>]*>([^<]+)', re.I | re.S)
# Template for "<attr> = <value>" inside a tag; group 1 is the value with any
# surrounding quote removed.  The negative lookbehind keeps the attribute name
# from matching as the tail of a longer one (``data-name``, ``enctype``,
# ``formaction`` ...).
# TODO: make the value part smart enough to handle both 'foo=bar baz=gaz' and
# 'foo="bar baz"'; a quoted value is currently cut at its first space.
RE_ATTR_TPL = r'(?<![\w:-])%s\s*=\s*["\']?([^ "\'>]+)'
RE_ACTION_ATTR = re.compile(RE_ATTR_TPL % 'action', re.I)
RE_NAME_ATTR = re.compile(RE_ATTR_TPL % 'name', re.I)
RE_VALUE_ATTR = re.compile(RE_ATTR_TPL % 'value', re.I)
RE_SIZE_ATTR = re.compile(RE_ATTR_TPL % 'size', re.I)
RE_TYPE_ATTR = re.compile(RE_ATTR_TPL % 'type', re.I)
RE_MAXLENGTH_ATTR = re.compile(RE_ATTR_TPL % 'maxlength', re.I)
def parse_attr(html, rex, default=''):
    """Extract the first captured group of ``rex`` from ``html``.

    Args:
        html: text to search.
        rex: compiled regular expression with at least one group.
        default: value returned when nothing can be extracted.

    Returns:
        Group 1 of the first match with surrounding whitespace stripped,
        or ``default`` when an ``AttributeError`` occurs (e.g. no match).
    """
    try:
        captured = rex.search(html).group(1)
        result = captured.strip()
    except AttributeError:
        # search() returned None, so there is nothing to extract.
        result = default
    return result
class FormNotFound(Exception):
    """Raised by find_form() when the data contains no form chunk."""
class EmptyForm(Exception):
    """Raised by Form when parsing leaves it without any fields."""
class InvalidHtml(Exception):
    """Raised by make_soup() when BeautifulSoup fails on the given data."""
def make_soup(data):
    """Parse ``data`` with BeautifulSoup and return the resulting soup.

    Raises:
        InvalidHtml: if BeautifulSoup raises any exception; it carries the
            original exception's arguments.
    """
    try:
        return BeautifulSoup(data)
    except Exception as ex:
        # ``except ... as`` is valid on Python 2.6+ and 3.  ``ex.args`` is
        # forwarded instead of the deprecated ``ex.message``, which is empty
        # whenever the exception was built with more than one argument.
        raise InvalidHtml(*ex.args)
def find_form_chunks(data):
    """Return a list with the markup of every form matched in ``data``.

    Each item is the text captured by ``RE_FORM_CHUNK``, in document order.
    """
    return [found.group(1) for found in RE_FORM_CHUNK.finditer(data)]
def find_form(data):
    """Return the markup of the form in ``data`` that has the most controls.

    Controls are counted with count_inputs().  When several forms tie, the
    one appearing first in ``data`` is returned.

    Raises:
        FormNotFound: if ``data`` contains no form chunk at all.
    """
    chunks = find_form_chunks(data)
    if not chunks:
        raise FormNotFound()
    # max() returns the first maximal item, which matches what the previous
    # stable descending sort + [0] selected, without the Python-2-only
    # cmp-function sort.
    return max(chunks, key=count_inputs)
def count_inputs(html):
    """Return the number of textarea, input and select opening tags in ``html``."""
    starters = (RE_TEXTAREA_START, RE_INPUT_START, RE_SELECT_START)
    return sum(len(rex.findall(html)) for rex in starters)
def parse_int(text):
    """Convert ``text`` to an int, yielding 0 when it is not a valid integer."""
    try:
        number = int(text)
    except ValueError:
        number = 0
    return number
class Form(object):
    """In-memory model of a single HTML form, built with regular expressions.

    Attributes:
        action: URL the form submits to: the form tag's ``action`` attribute
            joined with the page URL, or the page URL itself when the tag
            has no ``action``.
        fields: dict mapping each field name to a dict with the keys
            ``name``, ``hidden``, ``maxlength``, ``size``, ``type`` and
            ``value`` (plus ``options`` for ``<select>`` fields).
    """

    def __init__(self, html, url, clone_instance=None):
        """Parse ``html`` (the markup of one form) found on page ``url``.

        When ``clone_instance`` is given, ``html`` and ``url`` are ignored and
        the new form copies that instance's action and fields instead.

        Raises:
            EmptyForm: if no usable field could be parsed from ``html``.
        """
        if clone_instance is not None:
            self.action = clone_instance.action
            # Deep copy so edits to the clone never leak into the source form.
            self.fields = deepcopy(clone_instance.fields)
        else:
            self.parse_fields(html)
            self.parse_form_tag(html, url)

    def parse_form_tag(self, html, url):
        """Set ``self.action`` from the opening form tag found in ``html``.

        The ``action`` attribute is resolved against ``url``.  If there is no
        form tag or no ``action`` attribute, ``url`` itself is used.
        """
        tag_match = RE_FORM_TAG.search(html)
        # A missing form tag is handled like a missing action attribute
        # instead of failing with AttributeError on ``None.group``.
        action_match = RE_ACTION_ATTR.search(tag_match.group(0)) if tag_match else None
        if action_match:
            self.action = urljoin(url, action_match.group(1))
        else:
            self.action = url

    def parse_fields(self, html):
        """Rebuild ``self.fields`` from every control chunk found in ``html``.

        Inputs are processed first, then textareas, then selects; a later
        control replaces an earlier one that has the same name.

        Raises:
            EmptyForm: if no control produced a field.
        """
        self.fields = {}
        rexlist = [('input', RE_INPUT_CHUNK),
                   ('textarea', RE_TEXTAREA_CHUNK),
                   ('select', RE_SELECT_CHUNK)]
        for rtype, rex in rexlist:
            for match in rex.finditer(html):
                self.parse_field(match.group(1), rtype)
        # Checked once, after every control has been seen, so that an unnamed
        # or unsupported control appearing first cannot abort parsing of a
        # form that does contain usable fields.
        if not self.fields:
            raise EmptyForm()

    def clone(self):
        """Return an independent copy of this form (fields are deep-copied)."""
        return Form(None, None, clone_instance=self)

    def parse_field(self, html, input_type):
        """Parse one control chunk and store it in ``self.fields``.

        Args:
            html: markup of a single control, as captured by one of the
                ``RE_*_CHUNK`` expressions.
            input_type: ``'input'``, ``'textarea'`` or ``'select'``.

        Controls without a ``name``, with an unsupported type, or selects
        without any option are skipped silently.
        """
        name = parse_attr(html, RE_NAME_ATTR)
        if not name:
            return

        if input_type == 'input':
            # Lower-cased so that e.g. type="TEXT" is recognised.
            ftype = parse_attr(html, RE_TYPE_ATTR, 'text').lower()
        else:
            ftype = input_type

        field = {
            'name': name,
            'hidden': ftype == 'hidden',
            'maxlength': parse_int(parse_attr(html, RE_MAXLENGTH_ATTR, '0')),
            'size': parse_int(parse_attr(html, RE_SIZE_ATTR, '0')),
            'type': ftype,
        }

        # HACK: radio buttons have no dedicated support yet.  Their value is
        # read like a text input's while ``field['type']`` stays 'radio';
        # radios sharing a name overwrite each other, so the last one wins.
        value_type = 'text' if ftype == 'radio' else ftype

        if value_type in ('text', 'hidden', 'submit', 'checkbox'):
            field['value'] = parse_attr(html, RE_VALUE_ATTR)
        elif value_type == 'password':
            field['value'] = random_password()
        elif value_type == 'textarea':
            match = RE_TEXTAREA_TAG.search(html)
            field['value'] = match.group(1) if match else ''
        elif value_type == 'select':
            options = []
            for match in RE_OPTION_TAG.finditer(html):
                # Prefer the option's ``value`` attribute; fall back to its
                # visible text when the attribute is absent.
                value = parse_attr(match.group(0), RE_VALUE_ATTR)
                options.append(value if value else match.group(1).strip())
            if not options:
                return
            field['options'] = options
            # With fewer than three options take the first one; otherwise
            # pick randomly among all but the first (presumably a placeholder
            # entry -- TODO confirm).
            if len(options) < 3:
                field['value'] = options[0]
            else:
                field['value'] = choice(options[1:])
        else:
            return

        self.fields[name] = field