import htmldata from BeautifulSoup import BeautifulSoup safe_tag abbr

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import htmldata
from BeautifulSoup import BeautifulSoup
# Whitelist of tag names that clean_html keeps; any other tag is removed
# from the output (text between tags is left in place).
safe_tag = [
    'abbr', 'acronym', 'address', 'b', 'big', 'blockquote', 'br',
    'center', 'cite', 'code', 'col', 'colgroup',
    'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
    'em', 'font',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
    'i', 'img', 'ins', 'kbd', 'li', 'ol',
    'p', 'pre', 'q',
    's', 'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
    'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'tt',
    'u', 'ul',
]

# Per-tag whitelist of attribute names.  A tag with no entry here keeps
# none of its attributes.
# NOTE(review): 'object', 'param' and 'embed' are listed here but are not in
# safe_tag, so clean_html drops those tags before these entries are ever
# consulted -- confirm whether embeds are meant to be allowed.
safe_attr = {
    'img': ['src', 'alt'],
    'object': ['width', 'height'],
    'param': ['name', 'value'],
    'embed': ['src', 'type', 'wmode', 'width', 'height'],
}

# Real-world YouTube embed snippet, used as sample input by the manual
# checks at the bottom of the file.
YOUTUBE_CODE = (
    '<object width="425" height="355">'
    '<param name="movie" value="http://www.youtube.com/v/y7SCIoYGwE8&hl=en"></param>'
    '<param name="wmode" value="transparent"></param>'
    '<embed src="http://www.youtube.com/v/y7SCIoYGwE8&hl=en"'
    ' type="application/x-shockwave-flash" wmode="transparent"'
    ' width="425" height="355"></embed>'
    '</object>'
)
def clean_html(html):
    """Return *html* with non-whitelisted tags and attributes removed.

    The markup is split with ``htmldata.tagextract`` into a flat sequence of
    plain-text items and ``(tag, attrs)`` tuples.  Text items are kept
    unchanged.  A tag tuple is kept only when its name (ignoring leading or
    trailing ``/``) is listed in ``safe_tag``; its attributes are reduced to
    the names listed for that tag in ``safe_attr`` (none when the tag has no
    entry there).  The surviving items are re-serialized with
    ``htmldata.tagjoin``.

    Only the tag itself is dropped for a non-whitelisted element; any text
    that was inside it stays in the output.

    NOTE(review): attribute *values* are not inspected, so e.g. an ``img``
    ``src`` is passed through as given -- confirm whether URL schemes need
    to be restricted for untrusted input.
    NOTE(review): the input is not normalized before parsing; it is handed
    to htmldata as-is.
    """
    safe_tree = []
    for elem in htmldata.tagextract(html):
        if not isinstance(elem, tuple):
            # Non-tuple items are text between tags: keep verbatim.
            safe_tree.append(elem)
            continue

        tag, attrs = elem
        # Stripping '/' lets closing and self-closing forms match the same
        # whitelist entry as the opening tag (presumably htmldata encodes
        # them as '/name' and 'name/' -- verify against its docs).
        base_tag = tag.strip('/')
        if base_tag not in safe_tag:
            continue

        # Look the per-tag attribute whitelist up once, not per attribute.
        allowed = safe_attr.get(base_tag, ())
        # .items() behaves the same as iteritems() here and also exists on
        # Python 3 dicts.
        new_attrs = dict(
            (name, value) for name, value in attrs.items() if name in allowed
        )
        # The original tag string (with any '/') is preserved so that
        # closing tags are emitted as closing tags.
        safe_tree.append((tag, new_attrs))
    return htmldata.tagjoin(safe_tree)
if __name__ == '__main__':
    def test(html):
        """Print *html* and its sanitized form so they can be compared by eye."""
        # A single parenthesized argument prints the same text on Python 2
        # and Python 3 (the bare print statement is Python-2-only syntax).
        print('ORIGIN %s' % (html,))
        print('SAFE %s' % (clean_html(html),))

    # Manual smoke samples; there are no assertions, the output is meant to
    # be inspected.
    samples = (
        # whitelisted tag only
        '<b>test</b>',
        # 'a' is not in safe_tag
        '<b>test</b> <a href="http://ya.ru">yandex link</a>',
        # non-whitelisted tags wrapping a whitelisted one
        '<body><a href="http://google.com" rel="abcd"><strong>google search</strong> yeh</a>',
        # img with allowed and disallowed attributes, plain and self-closing
        '<img src="link"> text... <strong><img src="link" rel="abc" />',
        # embed snippet built from object/param/embed tags
        YOUTUBE_CODE,
    )
    for sample in samples:
        test(sample)