#from html5lib import treebuilders, HTMLParser
from BeautifulSoup import BeautifulSoup
from random import choice
from urlparse import urljoin
import re
from copy import deepcopy
from whois.util import random_email, random_password
# Pre-compiled regular expressions shared by the form-parsing helpers below.
#
# NOTE(review): the two textarea pattern strings look truncated (the tag-name
# part of each pattern appears to be missing) -- confirm against the upstream
# copy of this file before relying on them.
# NOTE(review): RE_ATTR_TPL, RE_FORM_CHUNK, RE_FORM_TAG, RE_INPUT_START,
# RE_SELECT_START, RE_INPUT_CHUNK, RE_TEXTAREA_CHUNK, RE_SELECT_CHUNK and
# RE_OPTION_TAG are used in this file but are not defined in the visible
# text -- presumably their definitions were lost together with the TODO
# comment below; verify.

# Counted by count_inputs() as one of the three "input-like" start patterns.
RE_TEXTAREA_START = re.compile(r'|]*>', re.I|re.S)
# Group 1 captures a run of non-'<' characters following a '>'; it is read
# by Form.parse_field() as the textarea's value.
RE_TEXTAREA_TAG = re.compile(r']*>([^<]+)', re.I|re.S)
# TODO: Make smart regexp which could do ']+)'
# NOTE(review): the TODO above appears to be cut off mid-sentence.

# Per-attribute patterns built by formatting RE_ATTR_TPL with the attribute
# name. Group 1 of a match is read as the attribute's value (see parse_attr()
# and Form.parse_form_tag()).
RE_ACTION_ATTR = re.compile(RE_ATTR_TPL % 'action', re.I)
RE_NAME_ATTR = re.compile(RE_ATTR_TPL % 'name', re.I)
RE_VALUE_ATTR = re.compile(RE_ATTR_TPL % 'value', re.I)
RE_SIZE_ATTR = re.compile(RE_ATTR_TPL % 'size', re.I)
RE_TYPE_ATTR = re.compile(RE_ATTR_TPL % 'type', re.I)
RE_MAXLENGTH_ATTR = re.compile(RE_ATTR_TPL % 'maxlength', re.I)
def parse_attr(html, rex, default=''):
    """Return the stripped first capture group of ``rex`` found in ``html``.

    ``default`` is returned when the pattern does not match (or when the
    lookup otherwise fails with ``AttributeError``, e.g. the group did not
    participate in the match).
    """
    try:
        found = rex.search(html)
        raw_value = found.group(1)
        cleaned = raw_value.strip()
    except AttributeError:
        # ``found`` (or the captured group) was None.
        return default
    return cleaned
class FormNotFound(Exception):
    """Raised by find_form() when the document contains no form chunks."""
class EmptyForm(Exception):
    """Raised by Form when parsing leaves it without any fields."""
class InvalidHtml(Exception):
    """Raised by make_soup() when the HTML parser rejects the input."""
def make_soup(data):
    """Parse ``data`` with BeautifulSoup and return the resulting soup.

    Any failure raised by the parser is re-raised as ``InvalidHtml`` so that
    callers only have to handle one exception type.

    Raises:
        InvalidHtml: if BeautifulSoup raises while parsing ``data``. The new
            exception carries the same ``args`` as the original one.
    """
    try:
        return BeautifulSoup(data)
    except Exception as ex:
        # ``ex.message`` is deprecated (PEP 352) and is empty whenever the
        # exception was built with anything other than exactly one argument;
        # forwarding ``args`` keeps the full original detail.
        raise InvalidHtml(*ex.args)
def find_form_chunks(data):
    """Return all RE_FORM_CHUNK matches found in ``data`` (``findall`` result)."""
    chunks = RE_FORM_CHUNK.findall(data)
    return chunks
def find_form(data):
    """Return the form chunk in ``data`` that contains the most inputs.

    Every chunk returned by find_form_chunks() is scored with count_inputs();
    the highest-scoring chunk wins. On a tie the chunk that appears first in
    the document is returned.

    Raises:
        FormNotFound: if ``data`` contains no form chunks at all.
    """
    chunks = find_form_chunks(data)
    if not chunks:
        raise FormNotFound()
    # max() returns the first maximal item, which matches the previous
    # stable descending sort while avoiding the cmp-function sort that
    # Python 3 no longer supports.
    return max(chunks, key=count_inputs)
def count_inputs(html):
    """Return how many textarea, input and select start patterns match ``html``."""
    start_patterns = (RE_TEXTAREA_START, RE_INPUT_START, RE_SELECT_START)
    return sum(len(rex.findall(html)) for rex in start_patterns)
def parse_int(text):
    """Convert ``text`` to an int, returning 0 when it cannot be converted.

    Both malformed strings (``ValueError``) and non-numeric objects such as
    ``None`` (``TypeError``) yield 0 instead of propagating an exception.
    """
    try:
        return int(text)
    except (TypeError, ValueError):
        return 0
class Form(object):
    """Regex-based model of one HTML form.

    Instance attributes:
        action: URL the form submits to (the ``action`` attribute resolved
            against the page URL, or the page URL itself when absent).
        fields: dict mapping a field name to a dict with the keys ``name``,
            ``hidden``, ``maxlength``, ``size``, ``type`` and ``value``;
            select fields additionally carry ``options``.
    """

    def __init__(self, html, url, clone_instance=None):
        """Build a form from ``html`` found at ``url``, or copy another form.

        When ``clone_instance`` is given, ``html`` and ``url`` are ignored
        and the action and a deep copy of the fields are taken from it.

        Raises:
            EmptyForm: if ``html`` yields no usable fields.
        """
        if clone_instance is not None:
            self.action = clone_instance.action
            # Deep copy so that edits to the clone never leak into the source.
            self.fields = deepcopy(clone_instance.fields)
        else:
            self.parse_fields(html)
            self.parse_form_tag(html, url)

    def parse_form_tag(self, html, url):
        """Set ``self.action`` from the form tag's ``action`` attribute.

        The attribute value is stripped and resolved against ``url``. When
        the form tag or its ``action`` attribute is missing (or empty),
        ``url`` itself is used.
        """
        tag_match = RE_FORM_TAG.search(html)
        # No form tag at all is treated like a form tag without an action
        # instead of failing with AttributeError on ``None.group``.
        tag = tag_match.group(0) if tag_match else ''
        action = parse_attr(tag, RE_ACTION_ATTR)
        if action:
            self.action = urljoin(url, action)
        else:
            self.action = url

    def parse_fields(self, html):
        """Rebuild ``self.fields`` from every input/textarea/select in ``html``.

        Raises:
            EmptyForm: if no field could be parsed from ``html``.
        """
        self.fields = {}
        chunk_patterns = (
            ('input', RE_INPUT_CHUNK),
            ('textarea', RE_TEXTAREA_CHUNK),
            ('select', RE_SELECT_CHUNK),
        )
        for rtype, rex in chunk_patterns:
            for match in rex.finditer(html):
                self.parse_field(match.group(1), rtype)
        # Checked once, after all chunks were processed, so that a single
        # unnamed control cannot abort parsing of an otherwise valid form.
        if not self.fields:
            raise EmptyForm()

    def clone(self):
        """Return an independent copy of this form."""
        return Form(None, None, clone_instance=self)

    def parse_field(self, html, input_type):
        """Parse one field chunk and store it in ``self.fields``.

        Args:
            html: markup of a single input, textarea or select chunk.
            input_type: ``'input'``, ``'textarea'`` or ``'select'``; for
                ``'input'`` the concrete type is read from the ``type``
                attribute (defaulting to ``'text'``).

        Chunks without a ``name`` attribute, with an unsupported type, or
        selects without options are silently skipped.
        """
        name = parse_attr(html, RE_NAME_ATTR)
        if not name:
            return

        if input_type == 'input':
            # HTML treats the type keyword case-insensitively.
            ftype = parse_attr(html, RE_TYPE_ATTR, 'text').lower()
        else:
            ftype = input_type

        field = {
            'name': name,
            'hidden': ftype == 'hidden',
            'maxlength': parse_int(parse_attr(html, RE_MAXLENGTH_ATTR, '0')),
            'size': parse_int(parse_attr(html, RE_SIZE_ATTR, '0')),
            'type': ftype,
        }

        # HACK: radio buttons have no dedicated support yet; their value is
        # extracted like a text input's while ``field['type']`` stays 'radio'.
        value_type = 'text' if ftype == 'radio' else ftype

        if value_type in ('text', 'hidden', 'submit', 'checkbox'):
            field['value'] = parse_attr(html, RE_VALUE_ATTR)
        elif value_type == 'password':
            field['value'] = random_password()
        elif value_type == 'textarea':
            match = RE_TEXTAREA_TAG.search(html)
            # An empty textarea produces no match; use an empty value
            # rather than crashing on ``None.group``.
            field['value'] = match.group(1) if match else ''
        elif value_type == 'select':
            options = []
            for match in RE_OPTION_TAG.finditer(html):
                value = parse_attr(match.group(0), RE_VALUE_ATTR)
                # Fall back to group 1 of the option match when the option
                # has no (or an empty) value attribute.
                options.append(value or match.group(1).strip())
            if not options:
                return
            field['options'] = options
            if len(options) < 3:
                field['value'] = options[0]
            else:
                # With three or more options the first one is never picked
                # (presumably it is a placeholder entry -- TODO confirm).
                field['value'] = choice(options[1:])
        else:
            # Unsupported control type: ignore it.
            return

        self.fields[name] = field