quot 123 0123 def convert_entity entity if entity -1 in name2codepoint

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# '&quot;' -> u'"', '&#123;' -> u'{' (U+007B)
def convert_entity(entity):
    """Translate a single HTML entity reference into its unicode character.

    Three forms are recognised:

    * named references whose name is a key of ``name2codepoint``
      (``&quot;`` -> ``u'"'``),
    * decimal character references (``&#123;`` -> ``u'{'``),
    * hexadecimal character references (``&#x7B;`` / ``&#X7B;`` -> ``u'{'``).

    :param entity: the complete reference text, including the leading ``&``
        and the trailing ``;``.
    :returns: the decoded one-character unicode string, or ``entity``
        unchanged when it is not a recognised reference or when its code
        point cannot be represented by ``unichr``.
    """
    # Strip the surrounding '&' and ';' to obtain the bare entity name.
    name = entity[1:-1]
    if name in name2codepoint:
        return unichr(name2codepoint[name])

    # Raw string so the backslash escapes reach the regex engine untouched;
    # \Z (rather than $) refuses a trailing newline after the ';'.
    numeric = re.match(r'&#(?:[xX]([0-9a-fA-F]+)|(\d+));\Z', entity)
    if numeric is None:
        return entity

    hex_digits, decimal_digits = numeric.groups()
    if hex_digits is not None:
        codepoint = int(hex_digits, 16)
    else:
        codepoint = int(decimal_digits)

    try:
        return unichr(codepoint)
    except (ValueError, OverflowError):
        # The number is outside the range unichr accepts; keep the original
        # text instead of aborting the whole conversion.
        return entity
# [u'foo', u'&quot;', u'bar', u'&quot;', ...] -> [u'foo', u'"', u'bar', u'"', ...]
def unescape_htmlentities_list(entities):
    """Lazily yield the items of ``entities`` with every second one decoded.

    Items at even positions (0, 2, 4, ...) are yielded unchanged; items at
    odd positions are passed through ``convert_entity`` first.

    :param entities: an iterable of alternating plain-text / entity strings.
    :returns: a generator producing one output item per input item.
    """
    for position, piece in enumerate(entities):
        # Odd positions hold the entity references.
        is_entity = (position % 2) != 0
        yield convert_entity(piece) if is_entity else piece
def unescape_htmlentities(data):
    """
    Convert html entitydefs into unicode characters

    :param data: text that may contain entity references such as
        ``&quot;`` or ``&#123;``.
    :returns: a unicode string in which every reference that
        ``convert_entity`` recognises has been replaced; anything it does
        not recognise is left exactly as written.
    """
    # The capturing group makes re.split keep the matched references, so the
    # result alternates plain text (even indexes) and references (odd
    # indexes) -- the layout unescape_htmlentities_list expects.
    # Raw string: '\w' must reach the regex engine as-is rather than being
    # treated as a (invalid) string escape.
    chunks = re.split(r'(&#?\w+;)', data)
    return u''.join(unescape_htmlentities_list(chunks))
def tag_contents_as_string(node):
    """Concatenate the unicode form of every item in ``node.contents``.

    :param node: any object exposing an iterable ``contents`` attribute
        (presumably a parsed markup tag -- TODO confirm against callers).
    :returns: a single unicode string; empty when ``contents`` is empty.
    """
    pieces = map(unicode, node.contents)
    return u''.join(pieces)