# -*- coding: utf-8 -*-
"""Small helpers for stripping HTML tags and decoding HTML entities.

The module runs unchanged on Python 2.6+ and Python 3.
"""
import logging
import re

try:  # Python 2
    import htmlentitydefs
except ImportError:  # Python 3
    import html.entities as htmlentitydefs

try:  # Python 2
    _unichr = unichr
except NameError:  # Python 3: chr() already returns text
    _unichr = chr

logger = logging.getLogger(__name__)

re_tag = re.compile(r'<[^>]+>', re.S)

# One pattern for all three reference forms so the input is scanned exactly
# once; two sequential passes would decode "&amp;#65;" twice and yield "A".
# Groups: 1 = hexadecimal digits, 2 = decimal digits, 3 = entity name.
# Digits are spelled out as [0-9] so only ASCII digits match on Python 3.
_re_entity = re.compile(
    r'&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)|([A-Za-z][A-Za-z0-9]*));')


def strip_tags(data):
    """Replace every ``<...>`` tag in *data* with a single space."""
    return re_tag.sub(' ', data)


def _entity_to_char(match):
    """Return the character a ``_re_entity`` match stands for, or None.

    None means the reference cannot be resolved: the entity name is unknown
    or the numeric code is not a code point this interpreter can represent.
    """
    hex_code, dec_code, name = match.groups()
    if name is not None:
        codepoint = htmlentitydefs.name2codepoint.get(name)
        if codepoint is None:
            return None
    elif hex_code is not None:
        codepoint = int(hex_code, 16)
    else:
        codepoint = int(dec_code)
    try:
        return _unichr(codepoint)
    except (ValueError, OverflowError):
        # Out of range for chr()/unichr(), or too large for a C integer.
        return None


def _decode_entities_in_bytes(data, encoding):
    """Decode entities inside a byte string that is encoded in *encoding*.

    Every byte other than a resolved reference is passed through untouched.
    """
    def replace(match):
        char = _entity_to_char(match)
        if char is None:
            return match.group(0)
        try:
            # Carry the encoded bytes through the text-based substitution
            # as latin-1 characters (one character per byte).
            return char.encode(encoding).decode('latin-1')
        except UnicodeEncodeError:
            # The target encoding has no representation for this
            # character, so the reference stays as written.
            return match.group(0)

    # latin-1 maps bytes 0-255 to code points 0-255 one to one, so this
    # decode/encode round trip never fails and never alters a byte.
    text = data.decode('latin-1')
    return _re_entity.sub(replace, text).encode('latin-1')


def decode_entities(data, encoding=None):
    """Decode things like ``&nbsp;`` to normal text.

    Named entities (``&amp;``, ``&Auml;``, ``&frac12;``), decimal references
    (``&#65;``) and hexadecimal references (``&#x41;``) are replaced by the
    character they denote.  A reference that cannot be resolved is left
    exactly as it appears in the input.

    :param data: text, or a byte string, containing HTML entities.
    :param encoding: encoding of *data* when it is a byte string.  With an
        encoding the result is a byte string in that same encoding, and
        characters the encoding cannot represent stay as entities.  The
        argument has no effect when *data* is already text.
    :returns: text when *data* is text or when *encoding* is None; a byte
        string when *data* is a byte string and *encoding* is given.  A byte
        string passed without *encoding* is decoded as UTF-8; if that fails
        a warning is logged and an empty string is returned.
    """
    if isinstance(data, bytes):
        if encoding is not None:
            return _decode_entities_in_bytes(data, encoding)
        try:
            data = data.decode('utf-8')
        except UnicodeDecodeError:
            logger.warning('data encoding is not unicode neither utf-8')
            return ''

    def replace(match):
        char = _entity_to_char(match)
        return match.group(0) if char is None else char

    return _re_entity.sub(replace, data)