from htmlentitydefs import name2codepoint import re def unescape data

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
import re

try:
    from htmlentitydefs import name2codepoint  # Python 2
except ImportError:
    from html.entities import name2codepoint  # Python 3
def unescape(data):
    """Replace HTML entity references in ``data`` with their characters.

    Handles named references listed in ``name2codepoint`` (``&amp;``),
    decimal references (``&#233;``) and hexadecimal references
    (``&#xE9;``).  Text that merely looks like a reference but is not
    recognised (``&bogus;``), or a numeric reference whose value cannot be
    turned into a character, is left in the output exactly as written.

    On Python 2 each replacement character is UTF-8 encoded so it can be
    spliced into a byte string; on Python 3 it is ordinary text.

    :param data: markup text that may contain entity references.
    :returns: ``data`` with every recognised reference substituted.
    """
    try:
        to_char = unichr  # Python 2: chr() only covers 0-255 there
    except NameError:
        to_char = chr  # Python 3: chr() already returns text

    def _substitute(match):
        ref = match.group(1)
        try:
            if ref in name2codepoint:
                codepoint = name2codepoint[ref]
            elif re.match(r'#\d+$', ref):
                codepoint = int(ref[1:])
            elif re.match(r'#[xX][0-9a-fA-F]+$', ref):
                codepoint = int(ref[2:], 16)
            else:
                # Not an entity we know: keep the '&' and ';' untouched
                # instead of silently dropping them.
                return match.group(0)
            char = to_char(codepoint)
        except (ValueError, OverflowError):
            # Code point outside what this interpreter can represent.
            return match.group(0)
        if not isinstance(char, str):
            # Python 2: unichr() gave a unicode object; encode it so the
            # result matches the byte-string input.
            char = char.encode('utf8')
        return char

    return re.sub(r'&(#?\w+);', _substitute, data)
def tag_contents_as_string(node):
    """Return the children of ``node`` as one entity-unescaped string.

    Every item in ``node.contents`` is converted with ``str()``, the
    pieces are concatenated in order, and the combined text is passed
    through ``unescape``.
    """
    pieces = map(str, node.contents)
    markup = ''.join(pieces)
    return unescape(markup)