# -*- coding: utf-8 -*-
import re
import htmlentitydefs
# Matches one markup tag: "<", one or more characters other than ">", then ">".
# The negated character class also matches newlines, so a tag may span lines.
re_tag = re.compile(r'<[^>]+>', re.S)
def strip_tags(data):
    """Return ``data`` with every markup tag replaced by a single space."""
    return re.sub(re_tag, ' ', data)
# Matches one HTML character reference in a single alternation so that the
# output of one kind of reference is never re-scanned as another kind:
#   group 1 - decimal digits      (&#169;)
#   group 2 - hexadecimal digits  (&#xA9;)
#   group 3 - entity name         (&copy;, &Aacute;, &frac12;)
_re_char_ref = re.compile(
    r'&(?:#([0-9]+)|#[xX]([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));')


def decode_entities(data, encoding=None):
    """Decode HTML character references such as ``&amp;`` or ``&#169;``.

    Named (``&copy;``), decimal (``&#169;``) and hexadecimal (``&#xA9;``)
    references are replaced by the character they denote.  The text is
    scanned exactly once, so ``&amp;lt;`` becomes ``&lt;`` and not ``<``.
    References that cannot be decoded (unknown name, code point out of
    range, character not representable in ``encoding``) are left untouched.

    Parameters:
        data: text (unicode) or a byte string.
        encoding: optional codec name.  When given, only references whose
            character can be encoded with it are decoded.  If ``data`` is a
            byte string it is assumed to use this (ASCII-compatible) codec
            and the result is a byte string in the same codec.

    Returns:
        Unicode text when ``data`` is text, or when ``data`` is a byte
        string and ``encoding`` is not given (the bytes are then decoded as
        UTF-8).  A byte string when ``data`` is bytes and ``encoding`` is
        given.  ``''`` when ``data`` is bytes, ``encoding`` is not given and
        the bytes are not valid UTF-8 (a message is printed in that case).

    Raises:
        LookupError: if ``encoding`` names an unknown codec and ``data``
            contains at least one decodable reference.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    is_bytes = isinstance(data, bytes)

    def replace_reference(match):
        decimal, hexadecimal, name = match.groups()
        try:
            if decimal is not None:
                codepoint = int(decimal)
            elif hexadecimal is not None:
                codepoint = int(hexadecimal, 16)
            else:
                codepoint = htmlentitydefs.name2codepoint[name]
            char = to_char(codepoint)
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name or code point outside the supported
            # range: keep the reference exactly as written.
            return match.group(0)
        if encoding:
            try:
                encoded = char.encode(encoding)
            except UnicodeEncodeError:
                # Not representable in the target codec: keep the reference.
                return match.group(0)
            if is_bytes:
                # Byte input is processed through a latin-1 "carrier" (see
                # below), so hand the encoded bytes back in that form.
                return encoded.decode('latin-1')
        return char

    if is_bytes:
        if encoding:
            # latin-1 maps every byte to exactly one code point and back,
            # so the raw bytes pass through the substitution unchanged,
            # whatever they are, and only the references are rewritten.
            carrier = data.decode('latin-1')
            return _re_char_ref.sub(replace_reference, carrier).encode('latin-1')
        try:
            data = data.decode('utf-8')
        except UnicodeDecodeError:
            print('data encoding is not unicode neither utf-8')
            return ''
    return _re_char_ref.sub(replace_reference, data)