#from html5lib import treebuilders, HTMLParser
from BeautifulSoup import BeautifulSoup
from random import choice
from urlparse import urljoin
import re
from copy import deepcopy

from whois.util import random_email, random_password

# NOTE(review): this file was recovered from a copy in which every "<...>"
# span had been stripped, which corrupted most of the patterns below (one of
# them no longer compiled, several names were missing altogether). They have
# been rebuilt from the fragments that survived and from how each pattern is
# used further down. The parts that were lost are a best guess -- compare
# against the upstream file before relying on edge-case behaviour.

# Cheap "does this tag occur" patterns, used only for counting controls.
RE_TEXTAREA_START = re.compile(r'<textarea', re.I)
RE_INPUT_START = re.compile(r'<input', re.I)
RE_SELECT_START = re.compile(r'<select', re.I)

# Group 1 is the opening <form> tag plus everything up to the closing tag.
RE_FORM_CHUNK = re.compile(r'(<form[^>]*>.+?)</form>', re.S | re.I)
# Group 1 is a single <input ...> tag.
RE_INPUT_CHUNK = re.compile(r'(<input[^>]+>)', re.I | re.S)
# Group 1 is the opening tag plus its body; the lookahead keeps the
# terminator out of the chunk.
# NOTE(review): the second lookahead alternative was lost; "<select" and
# "<textarea" are assumptions.
RE_SELECT_CHUNK = re.compile(
    r'(<select[^>]+>.+?)(?=</select>|<select)', re.I | re.S)
RE_TEXTAREA_CHUNK = re.compile(
    r'(<textarea[^>]+>.+?)(?=</textarea>|<textarea)', re.I | re.S)

RE_FORM_TAG = re.compile(r'<form[^>]*>', re.I | re.S)
# Group 1 is the textarea content (closing tag optional, because the chunk
# pattern above already cuts it off).
RE_TEXTAREA_TAG = re.compile(
    r'<textarea[^>]*>(.+?)(?:</textarea|$)', re.I | re.S)
# Group 0 is the <option> tag plus its label, group 1 is the label only.
RE_OPTION_TAG = re.compile(r'<option[^>]*>([^<]+)', re.I | re.S)

# TODO: Make smart regexp which could handle more attribute syntaxes
# (the example that followed this note in the original was lost).
# Group 1 is the attribute value, with an optional opening quote skipped.
# NOTE(review): only the trailing "]+)" of the original template survived.
RE_ATTR_TPL = r'%s\s*=\s*["\']?([^"\'>]+)'
RE_ACTION_ATTR = re.compile(RE_ATTR_TPL % 'action', re.I)
RE_NAME_ATTR = re.compile(RE_ATTR_TPL % 'name', re.I)
RE_VALUE_ATTR = re.compile(RE_ATTR_TPL % 'value', re.I)
RE_SIZE_ATTR = re.compile(RE_ATTR_TPL % 'size', re.I)
RE_TYPE_ATTR = re.compile(RE_ATTR_TPL % 'type', re.I)
RE_MAXLENGTH_ATTR = re.compile(RE_ATTR_TPL % 'maxlength', re.I)


def parse_attr(html, rex, default=''):
    """Return group 1 of the first match of `rex` in `html`, stripped.

    `default` is returned when the pattern does not match or when group 1
    did not take part in the match.
    """
    match = rex.search(html)
    if match is None:
        return default
    value = match.group(1)
    if value is None:
        return default
    return value.strip()


class FormNotFound(Exception):
    """Raised by find_form() when the document contains no form chunk."""


class EmptyForm(Exception):
    """Raised by Form.parse_fields() when a form has no usable fields."""


class InvalidHtml(Exception):
    """Raised by make_soup() when BeautifulSoup fails on the input."""


def make_soup(data):
    """Parse `data` with BeautifulSoup.

    Any exception raised by the parser is re-raised as InvalidHtml carrying
    the text of the original error.
    """
    try:
        return BeautifulSoup(data)
    except Exception as ex:
        # str(ex) instead of ex.message: the attribute is deprecated and is
        # empty for exceptions that were not built from a single argument.
        raise InvalidHtml(str(ex))


def find_form_chunks(data):
    """Return the list of form chunks (strings) found in `data`."""
    return RE_FORM_CHUNK.findall(data)


def find_form(data):
    """Return the form chunk of `data` that contains the most controls.

    When several forms tie, the one that appears first in the document is
    returned. Raises FormNotFound if `data` contains no form chunk.
    """
    forms = [(chunk, count_inputs(chunk)) for chunk in find_form_chunks(data)]
    if not forms:
        raise FormNotFound()
    # max() returns the first maximal item, which is the same pick the old
    # stable "sort descending, take [0]" produced.
    return max(forms, key=lambda item: item[1])[0]


def count_inputs(html):
    """Count the textarea, input and select opening tags in `html`."""
    start_patterns = (RE_TEXTAREA_START, RE_INPUT_START, RE_SELECT_START)
    return sum(len(rex.findall(html)) for rex in start_patterns)


def parse_int(text):
    """Convert `text` to int, returning 0 when it is not a valid integer."""
    try:
        return int(text)
    except (TypeError, ValueError):
        return 0


class Form(object):
    """A single HTML form: its submit URL and its fields.

    Attributes set on every instance:
      action -- absolute URL the form submits to.
      fields -- dict mapping a field name to a dict with the keys 'name',
                'hidden', 'maxlength', 'size', 'type', 'value' and, for
                select fields, 'options'.
    """

    def __init__(self, html, url, clone_instance=None):
        """Build a form from `html`, or copy it from `clone_instance`.

        html -- markup of one form chunk (ignored when cloning).
        url -- URL of the page, used to resolve the action attribute
               (ignored when cloning).
        clone_instance -- existing Form to copy; its fields are deep-copied.

        Raises EmptyForm when `html` yields no usable field.
        """
        if clone_instance:
            self.action = clone_instance.action
            self.fields = deepcopy(clone_instance.fields)
        else:
            self.parse_fields(html)
            self.parse_form_tag(html, url)

    def parse_form_tag(self, html, url):
        """Set self.action from the action attribute of the <form> tag.

        The action is resolved against `url`. When there is no <form> tag
        or no action attribute, the page URL itself is used.
        """
        tag_match = RE_FORM_TAG.search(html)
        action = ''
        if tag_match is not None:
            action = parse_attr(tag_match.group(0), RE_ACTION_ATTR)
        if action:
            self.action = urljoin(url, action)
        else:
            self.action = url

    def parse_fields(self, html):
        """Fill self.fields from the input, textarea and select tags.

        Raises EmptyForm when no field could be extracted.
        """
        self.fields = {}
        rexlist = [
            ('input', RE_INPUT_CHUNK),
            ('textarea', RE_TEXTAREA_CHUNK),
            ('select', RE_SELECT_CHUNK),
        ]
        for rtype, rex in rexlist:
            for match in rex.finditer(html):
                self.parse_field(match.group(1), rtype)
        # Checked once, after every control has been looked at. The check
        # used to sit at the end of parse_field(), where it could not tell
        # "nothing found yet" from "nothing found at all".
        if not self.fields:
            raise EmptyForm()

    def clone(self):
        """Return a new Form with the same action and a copy of the fields."""
        return Form(None, None, clone_instance=self)

    def parse_field(self, html, input_type):
        """Parse one control and store it in self.fields under its name.

        html -- markup of the control, as captured by one of the chunk
                patterns.
        input_type -- 'input', 'textarea' or 'select'.

        Controls without a name, controls of an unsupported type and
        selects without options are skipped. A later control replaces an
        earlier one that has the same name.
        """
        name = parse_attr(html, RE_NAME_ATTR)
        if not name:
            return

        if input_type == 'input':
            # Compared in lower case so that type="HIDDEN" is recognised.
            ftype = parse_attr(html, RE_TYPE_ATTR, 'text').lower()
        else:
            ftype = input_type

        field = {
            'name': name,
            'hidden': ftype == 'hidden',
            'maxlength': parse_int(parse_attr(html, RE_MAXLENGTH_ATTR, '0')),
            'size': parse_int(parse_attr(html, RE_SIZE_ATTR, '0')),
            'type': ftype,
        }

        # DIRTY HACK. NEEDS SMARTER SUPPORT FOR RADIO ELEMENTS.
        # The stored 'type' stays 'radio'; only the value lookup below
        # treats it like a text input.
        if ftype == 'radio':
            ftype = 'text'

        if ftype in ('text', 'hidden', 'submit', 'checkbox'):
            field['value'] = parse_attr(html, RE_VALUE_ATTR)
        elif ftype == 'password':
            field['value'] = random_password()
        elif ftype == 'textarea':
            match = RE_TEXTAREA_TAG.search(html)
            field['value'] = match.group(1) if match else ''
        elif ftype == 'select':
            options = []
            for match in RE_OPTION_TAG.finditer(html):
                # Prefer the value attribute, fall back to the label.
                value = parse_attr(match.group(0), RE_VALUE_ATTR)
                options.append(value or match.group(1).strip())
            if not options:
                return
            field['options'] = options
            if len(options) < 3:
                field['value'] = options[0]
            else:
                # The first option is never picked when there are at least
                # three of them.
                field['value'] = choice(options[1:])
        else:
            return

        self.fields[name] = field