# Whitelist-based HTML tag filter built on htmldata and BeautifulSoup.

import htmldata
from BeautifulSoup import BeautifulSoup
# Tag names that filter_html lets through; everything else is stripped.
# NOTE(review): 'br/' presumably matches how htmldata names a self-closing
# <br/> tag -- confirm against the htmldata documentation.
safe_tags = [
    'br/',
    # inline / paragraph formatting
    'p', 'strong', 'b', 'i',
    # lists
    'ul', 'ol', 'li',
    # definition lists
    'dl', 'dt', 'dd',
    # tables
    'table', 'tr', 'td', 'th',
]
def filter_html(html):
    """Return *html* with every tag that is not in ``safe_tags`` removed.

    The input is first passed through BeautifulSoup and converted back to
    unicode (presumably to repair malformed markup -- confirm), then split
    into items with ``htmldata.tagextract``.

    * Non-tuple items are kept unchanged.
    * Tuple items are kept only when their name, with any leading ``/``
      stripped, appears in ``safe_tags``; they are re-emitted with an empty
      attribute dict, so all attributes of allowed tags are dropped.

    Only the tag items themselves are removed for disallowed tags; the
    non-tuple items around them still end up in the result.
    """
    normalized = unicode(BeautifulSoup(html))
    kept = []
    for node in htmldata.tagextract(normalized):
        if isinstance(node, tuple):
            name, _attrs = node
            # Closing tags carry a leading '/', so compare on the bare name.
            if name.lstrip('/') not in safe_tags:
                continue
            # Keep the tag but discard every attribute it had.
            node = (name, {})
        kept.append(node)
    return htmldata.tagjoin(kept)
def test(html):
print 'ORIGIN', html
print 'SAFE', filter_html(html)
# Demo inputs, run at module load: an allowed tag on its own, an allowed tag
# next to a disallowed <a>, and disallowed <body>/<a> wrapping an allowed
# <strong>.
for _sample_markup in (
    '<b>test</b>',
    '<b>test</b> <a href="http://ya.ru">yandex link</a>',
    '<body><a href="http://google.com" rel="abcd"><strong>google search</strong> yeh</a>',
):
    test(_sample_markup)