# -*- coding: utf-8 -*-
import re
import htmlentitydefs
# Matches one markup tag: "<", one or more characters other than ">", then ">".
re_tag = re.compile(r'<[^>]+>', flags=re.DOTALL)


def strip_tags(data):
    """Return ``data`` with every match of ``re_tag`` replaced by one space."""
    without_tags = re_tag.sub(' ', data)
    return without_tags
# A single pattern covers every reference form so that each "&...;" sequence
# is decoded at most once (two separate passes would turn "&amp;#65;" into
# "A").  Group 1 = hexadecimal code point, group 2 = decimal code point,
# group 3 = entity name.
_ENTITY_PATTERN = '&(?:#[xX]([0-9A-Fa-f]+)|#([0-9]+)|([A-Za-z][A-Za-z0-9]*));'
_ENTITY_TEXT_RE = re.compile(_ENTITY_PATTERN)
_ENTITY_BYTES_RE = re.compile(_ENTITY_PATTERN.encode('ascii'))


def decode_entities(data, encoding=None):
    """Replace HTML character references in ``data`` with real characters.

    Named (``&amp;``), decimal (``&#38;``) and hexadecimal (``&#x26;``)
    references are handled.  A reference that cannot be resolved (unknown
    name, code point out of range, or character not representable in
    ``encoding``) is left in the output unchanged.

    :param data: text or byte string to process.
    :param encoding: optional codec name.  When ``data`` is a byte string
        and ``encoding`` is given, the result stays a byte string and every
        decoded character is encoded with ``encoding``.  When ``data`` is a
        byte string and ``encoding`` is ``None``, ``data`` is first decoded
        as UTF-8.  For text input the argument has no effect.
    :returns: ``data`` with references replaced, or ``''`` when a byte
        string without ``encoding`` is not valid UTF-8 (a message is
        printed in that case).
    """
    # Function-scope lookups keep this block usable on Python 2 and 3.
    try:
        from html.entities import name2codepoint  # Python 3
    except ImportError:
        from htmlentitydefs import name2codepoint  # Python 2
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    if encoding is None and isinstance(data, bytes):
        try:
            data = data.decode('utf-8')
        except UnicodeDecodeError:
            print('data encoding is not unicode neither utf-8')
            return ''

    # Still bytes here only when the caller supplied an explicit encoding.
    byte_mode = isinstance(data, bytes)

    def replace_entity(match):
        hex_digits, dec_digits, name = match.groups()
        try:
            if name is not None:
                if byte_mode:
                    name = name.decode('ascii')
                char = to_char(name2codepoint[name])
            elif hex_digits is not None:
                char = to_char(int(hex_digits, 16))
            else:
                char = to_char(int(dec_digits))
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name or code point outside the valid range.
            return match.group(0)
        if not byte_mode:
            return char
        try:
            return char.encode(encoding)
        except UnicodeEncodeError:
            # Not representable in the target encoding: keep the reference.
            return match.group(0)

    pattern = _ENTITY_BYTES_RE if byte_mode else _ENTITY_TEXT_RE
    return pattern.sub(replace_entity, data)
#def decode_html(data):
#"""Do some things to make nice html"""
#data = decode_entities(data)
#return data
def html2text(data):
    """Convert HTML markup in ``data`` to plain text.

    Steps, in order: each ``<br>`` / ``<br/>`` tag (any letter case) becomes
    a newline, the result is passed through ``strip_tags`` and then
    ``decode_entities``, and finally leading and trailing whitespace is
    stripped.
    """
    with_newlines = re.compile(r'<br\s*/?>', re.I).sub('\n', data)
    plain = decode_entities(strip_tags(with_newlines))
    return plain.strip()
# Charset parameter of a Content-Type header value.  "encoding=" is accepted
# next to the standard "charset=" for backward compatibility.
_HEADER_CHARSET_RE = re.compile(
    r';\s*(?:charset|encoding)=["\']?([^"\'\s;]+)', re.I)
# Charset declaration inside the document, e.g.
# content="text/html; charset=utf-8" or <meta charset="utf-8">.
_META_CHARSET_PATTERN = r'charset\s*=\s*["\']?([A-Za-z0-9._:-]+)'
_META_CHARSET_TEXT_RE = re.compile(_META_CHARSET_PATTERN, re.I)
_META_CHARSET_BYTES_RE = re.compile(_META_CHARSET_PATTERN.encode('ascii'), re.I)


def detect_encoding(data, encoding=None, curl=None, headers=None):
    """Return the first candidate encoding that decodes ``data`` without error.

    Candidates are tried in this order:

    1. the charset named in the ``Content-Type`` header (from ``headers``
       if given, otherwise from ``curl.headers``),
    2. the explicit ``encoding`` argument,
    3. the first ``charset=...`` declaration found inside ``data``,
    4. the fallbacks ``windows-1251``, ``koi8-r`` and ``utf-8``.

    :param data: raw document content (byte string).
    :param encoding: optional encoding name suggested by the caller.
    :param curl: optional object with a dict-like ``headers`` attribute;
        consulted only when ``headers`` is empty.
    :param headers: optional dict-like object of response headers.
    :returns: the name of the first candidate that decodes ``data``, or
        ``None`` if none does.
    """
    candidates = []
    if encoding:
        candidates.append(encoding)

    ct_header = None
    if headers:
        ct_header = headers.get('Content-Type', '')
    elif curl:
        ct_header = curl.headers.get('Content-Type', '')
    if ct_header:
        header_match = _HEADER_CHARSET_RE.search(ct_header)
        if header_match:
            # The header takes precedence over the explicit argument.
            candidates.insert(0, header_match.group(1))

    # Extract the encoding declared inside the document itself.
    if isinstance(data, bytes):
        meta_match = _META_CHARSET_BYTES_RE.search(data)
    else:
        meta_match = _META_CHARSET_TEXT_RE.search(data)
    if meta_match:
        meta_name = meta_match.group(1)
        if not isinstance(meta_name, str):
            # The pattern only admits ASCII characters, so this cannot fail.
            meta_name = meta_name.decode('ascii')
        candidates.append(meta_name)

    # NOTE(review): windows-1251 accepts almost any byte sequence, so valid
    # UTF-8 input with no declared charset is reported as windows-1251.  The
    # historical order is kept for compatibility; consider trying utf-8 first.
    candidates.extend(['windows-1251', 'koi8-r', 'utf-8'])

    for candidate in candidates:
        try:
            data.decode(candidate)
        except (LookupError, ValueError, TypeError):
            # Unknown codec, undecodable bytes, or an unusable codec name.
            continue
        return candidate
    return None