coding utf-8 import re import htmlentitydefs re_tag re compile re def

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# -*- coding: utf-8 -*-
import re
import htmlentitydefs
# Anything between '<' and the next '>' counts as a tag. The character class
# already matches newlines, so tags that span several lines are covered.
re_tag = re.compile(r'<[^>]+>', re.S)


def strip_tags(data):
    """Replace every ``<...>`` tag in *data* with a single space.

    Each tag becomes one space rather than being dropped, so the text on
    either side of a tag stays separated. A bare ``<>`` is left alone because
    the pattern requires at least one character between the brackets.

    :param data: text that may contain markup.
    :returns: *data* with each tag replaced by ``' '``.
    """
    return re_tag.sub(' ', data)
# A character reference is '&' + (decimal number | hex number | name) + ';'.
# Digits are spelled [0-9] so that only ASCII digits are accepted.
_ENTITY_PATTERN = (
    r'&(?:#([0-9]+)|#[xX]([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));'
)
# One compiled pattern per subject type, so the pattern type always matches
# the type of the string being searched.
_re_entity_text = re.compile(_ENTITY_PATTERN)
_re_entity_bytes = re.compile(_ENTITY_PATTERN.encode('ascii'))


def decode_entities(data, encoding=None):
    """Decode character references such as ``&nbsp;``, ``&#160;``, ``&#xA0;``.

    All references are resolved in a single pass, so the output of one
    replacement is never decoded again (``&amp;#38;`` yields ``&#38;``).
    A reference that cannot be resolved (unknown name, code point out of
    range, or a character that *encoding* cannot represent) is left in the
    text exactly as it was written.

    :param data: text or byte string to decode.
    :param encoding: when ``None``, a byte string is first decoded as UTF-8
        and the result is text. When given, a byte string is left as bytes
        and each replacement character is encoded with *encoding*; text
        input is returned as text.
    :returns: *data* with the references replaced, or ``''`` when
        *encoding* is ``None`` and the byte string is not valid UTF-8.
    """
    # Resolved here with fallbacks so the function does not depend on which
    # spelling of these names the running interpreter provides.
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    if encoding is None and isinstance(data, bytes):
        try:
            data = data.decode('utf-8')
        except UnicodeDecodeError:
            print('data encoding is not unicode neither utf-8')
            return ''

    # Still bytes here only when the caller supplied an encoding; in that
    # case every replacement must be bytes too.
    as_bytes = isinstance(data, bytes)

    def replace_reference(match):
        decimal, hexadecimal, name = match.groups()
        try:
            if decimal is not None:
                codepoint = int(decimal)
            elif hexadecimal is not None:
                codepoint = int(hexadecimal, 16)
            else:
                if as_bytes:
                    name = name.decode('ascii')
                codepoint = name2codepoint[name]
            char = to_char(codepoint)
            if as_bytes:
                return char.encode(encoding)
            return char
        except (KeyError, ValueError, OverflowError, UnicodeEncodeError):
            # Unknown name, impossible code point or unencodable character:
            # keep the reference as it appeared in the input.
            return match.group(0)

    pattern = _re_entity_bytes if as_bytes else _re_entity_text
    return pattern.sub(replace_reference, data)