# -*- coding: utf-8 -*-
"""Helpers for turning HTML into plain text and guessing its byte encoding.

The module runs unchanged on Python 2 and Python 3.  Every function accepts
either text (``unicode`` / ``str``) or byte strings.
"""
import re

try:  # Python 2
    import htmlentitydefs
except ImportError:  # Python 3 moved the entity table to html.entities
    import html.entities as htmlentitydefs

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already returns a text character


def _compile_both(pattern, flags=0):
    """Compile an ASCII-only pattern twice: for text and for byte strings."""
    return re.compile(pattern, flags), re.compile(pattern.encode('ascii'), flags)


re_tag, _re_tag_bytes = _compile_both(r'<[^>]+>', re.S)
# <br>, <br/>, <br /> and <br attr="...">, but not e.g. <broken>.
re_br, _re_br_bytes = _compile_both(r'<br\b[^>]*>', re.I)
# One pattern for decimal, hexadecimal and named references so that the
# input is scanned exactly once (see decode_entities).
_re_entity, _re_entity_bytes = _compile_both(
    r'&(#[0-9]+|#[xX][0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]*);')
_re_charset, _re_charset_bytes = _compile_both(
    r'charset\s*=\s*["\']?([A-Za-z0-9._:-]+)', re.I)


def strip_tags(data):
    """Replace every tag in ``data`` with a single space.

    The result has the same string type (text or bytes) as ``data``.
    """
    if isinstance(data, bytes):
        return _re_tag_bytes.sub(b' ', data)
    return re_tag.sub(' ', data)


def decode_entities(data, encoding=None):
    """Decode things like ``&nbsp;``, ``&#160;`` or ``&#xA0;`` to normal text.

    :param data: text or byte string containing HTML character references.
    :param encoding: only meaningful for byte strings.  When omitted, a byte
        string is decoded as UTF-8 and text is returned.  When given, the
        byte string is left in that encoding and each reference is replaced
        by its encoded character.
    :returns: ``data`` with all recognised references replaced.  Unknown
        names, out-of-range numbers and characters that ``encoding`` cannot
        represent are left untouched.  ``''`` is returned (after printing a
        diagnostic) when a byte string without ``encoding`` is not UTF-8.
    """
    if encoding is None and isinstance(data, bytes):
        try:
            data = data.decode('utf-8')
        except UnicodeDecodeError:
            print('data encoding is not unicode neither utf-8')
            return ''

    is_bytes = isinstance(data, bytes)

    def replace(match):
        original = match.group(0)
        body = match.group(1)
        if is_bytes:
            body = body.decode('ascii')
        try:
            if body[:2] in ('#x', '#X'):
                char = _unichr(int(body[2:], 16))
            elif body[0] == '#':
                char = _unichr(int(body[1:]))
            else:
                char = _unichr(htmlentitydefs.name2codepoint[body])
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name or code point outside the Unicode range.
            return original
        if not is_bytes:
            return char
        try:
            return char.encode(encoding)
        except UnicodeError:
            # The target encoding has no such character: keep the reference.
            return original

    # A single pass is essential: decoding names first and numbers second
    # would turn "&amp;#65;" into "A" instead of the literal "&#65;".
    pattern = _re_entity_bytes if is_bytes else _re_entity
    return pattern.sub(replace, data)


def html2text(data):
    """Make text from html.

    ``<br>`` tags become newlines, every other tag becomes a space and
    character references are decoded with :func:`decode_entities`.
    """
    if isinstance(data, bytes):
        data = _re_br_bytes.sub(b'\n', data)
    else:
        data = re_br.sub('\n', data)
    data = strip_tags(data)
    data = decode_entities(data)
    return data


def _find_charset(source):
    """Return the first ``charset=...`` value in ``source`` or ``None``."""
    if isinstance(source, bytes):
        match = _re_charset_bytes.search(source)
        if match:
            return str(match.group(1).decode('ascii'))
        return None
    match = _re_charset.search(source)
    if match:
        return str(match.group(1))
    return None


def detect_encoding(data, encoding=None, curl=None, headers=None):
    """Guess which encoding the byte string ``data`` is written in.

    Candidates are tried in this order and the first one that decodes
    ``data`` without an error wins:

    1. the charset of the ``Content-Type`` header, read from ``headers`` or,
       when ``headers`` is empty, from ``curl.headers``;
    2. the explicit ``encoding`` argument;
    3. the first ``charset=...`` declaration found inside ``data``;
    4. ``windows-1251``, ``koi8-r`` and ``utf-8``.

    :param data: document body as a byte string.
    :param encoding: optional encoding hint.
    :param curl: optional object exposing a ``headers`` mapping.
    :param headers: optional mapping of response headers.
    :returns: the name of the first working candidate, or ``None`` when no
        candidate fits or ``data`` cannot be decoded at all.
    """
    candidates = []
    if encoding:
        candidates.append(encoding)

    ct_header = None
    if headers:
        ct_header = headers.get('Content-Type', '')
    elif curl:
        ct_header = curl.headers.get('Content-Type', '')
    if ct_header:
        header_charset = _find_charset(ct_header)
        if header_charset:
            # What the server announced outranks the caller's hint.
            candidates.insert(0, header_charset)

    # Charset declared in the document itself (<meta ...>).
    meta_charset = _find_charset(data)
    if meta_charset:
        candidates.append(meta_charset)

    # NOTE: a statistical detector such as chardet could contribute a
    # candidate here.
    candidates.extend(['windows-1251', 'koi8-r', 'utf-8'])

    if not hasattr(data, 'decode'):
        # Text on Python 3 is already decoded; there is nothing to detect.
        return None

    for candidate in candidates:
        try:
            data.decode(candidate)
        except (UnicodeError, LookupError):
            # Wrong encoding for these bytes, or an unknown codec name.
            continue
        return candidate
    return None