# Parser for djangobb.org: urlizes plain-text links in HTML, leaving the content of A and CODE tags untouched.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from HTMLParser import HTMLParser
from django.template.defaultfilters import urlize as django_urlize
class BaseHTMLParser(HTMLParser):
    """
    Re-serialize HTML while applying ``func`` to its text nodes.

    Text found inside the tags listed in ``ignored_tags`` (``<a>`` and
    ``<code>`` by default) is copied through untouched; every other text
    node is replaced by ``func(text)``.  Tags, entity references and
    character references are written back out as markup.

    After ``feed()`` returns, the rebuilt markup is available as a single
    string in ``self.html``.
    """

    # Tags whose text content must never be passed to ``func``.
    ignored_tags = ('a', 'code')

    def __init__(self, func):
        """
        func -- callable that receives a text node and returns the text
                to emit in its place.
        """
        # Explicit base-class call kept as in the original code.
        HTMLParser.__init__(self)
        self.func = func
        # True while the parser is inside at least one ignored tag.
        self.is_ignored = False
        # Number of currently open ignored tags; lets nested markup such
        # as <a><b>..</b>..</a> stay ignored until the outer tag closes.
        self._ignored_depth = 0
        # Output chunks; joined into a string at the end of feed().
        self.html = []

    def handle_starttag(self, tag, attrs):
        """Emit the opening tag and start ignoring text if required."""
        self.html.append('<%s%s>' % (tag, self.__html_attrs(attrs)))
        if tag in self.ignored_tags:
            self._ignored_depth += 1
            self.is_ignored = True

    def handle_data(self, data):
        """Emit a text node, transformed by ``func`` unless ignored."""
        if not self.is_ignored:
            data = self.func(data)
        self.html.append(data)

    def handle_startendtag(self, tag, attrs):
        """Emit a self-closing tag (``<br/>``); it never holds text."""
        self.html.append('<%s%s/>' % (tag, self.__html_attrs(attrs)))

    def handle_endtag(self, tag):
        """Emit the closing tag and leave ignore mode when appropriate."""
        # Only the end of an ignored tag may re-enable processing; a
        # stray closing tag with nothing open is ignored for this purpose.
        if tag in self.ignored_tags and self._ignored_depth:
            self._ignored_depth -= 1
            self.is_ignored = self._ignored_depth > 0
        self.html.append('</%s>' % tag)

    def handle_entityref(self, name):
        """Emit a named entity reference such as ``&amp;`` unchanged."""
        self.html.append('&%s;' % name)

    def handle_charref(self, name):
        """Emit a numeric character reference such as ``&#38;``."""
        # ``name`` arrives without the leading '#', so it is restored here.
        self.html.append('&#%s;' % name)

    def __escape_attr(self, value):
        """Escape an attribute value for use inside double quotes."""
        # '&' must be replaced first so later replacements are not
        # escaped twice.
        return (value.replace('&', '&amp;')
                     .replace('<', '&lt;')
                     .replace('>', '&gt;')
                     .replace('"', '&quot;'))

    def __html_attrs(self, attrs):
        """
        Render ``attrs`` (a list of ``(name, value)`` pairs) as the text
        that follows the tag name, including the leading space.
        Returns '' when there are no attributes.
        """
        if not attrs:
            return ''
        rendered = []
        for name, value in attrs:
            if value is None:
                # Valueless attribute, e.g. <input disabled>.
                rendered.append(name)
            else:
                # The parser hands over unescaped values, so they are
                # escaped again to keep the quoting intact.
                rendered.append('%s="%s"' % (name, self.__escape_attr(value)))
        return ' ' + ' '.join(rendered)

    def feed(self, data):
        """
        Parse the complete document ``data`` and store the rebuilt markup
        as a string in ``self.html``.

        This is a one-shot operation: the parser is flushed before the
        output is joined, so any text it was still buffering is included.
        """
        HTMLParser.feed(self, data)
        # Flush while self.html is still a list; a later close() has
        # nothing left to emit.
        HTMLParser.close(self)
        self.html = ''.join(self.html)
def urlize(data):
    """
    Turn plain-text URLs inside an HTML fragment into links.

    Text nested in A and CODE tags is passed through unchanged.
    Returns the markup collected in the parser's ``html`` attribute.
    """
    link_parser = BaseHTMLParser(django_urlize)
    link_parser.feed(data)
    try:
        # The result is read first; close() then runs on the way out.
        return link_parser.html
    finally:
        link_parser.close()
if __name__ == "__main__":
    # Manual smoke test: the first URL is plain text, the second one sits
    # inside <code>.
    sample_markup = 'test http://ya.ru/ <code>http://ya.ru/</code>'
    print(urlize(sample_markup))