"""Urlize plain-text links inside HTML while leaving selected tags untouched."""

try:  # Python 2
    from HTMLParser import HTMLParser
except ImportError:  # Python 3
    from html.parser import HTMLParser

from django.template.defaultfilters import urlize as django_urlize


def _escape_attr(value):
    """Escape an attribute value for output inside double quotes.

    The parser hands attribute values over already entity-decoded, so they
    have to be encoded again before being written back into markup.
    """
    # '&' must be replaced first so the entities added below are not
    # escaped a second time.
    return (value.replace('&', '&amp;')
                 .replace('<', '&lt;')
                 .replace('>', '&gt;')
                 .replace('"', '&quot;'))


class BaseHTMLParser(HTMLParser):
    """
    HTML parser that re-serializes its input, passing text through ``func``.

    Every text node is replaced by ``func(text)``, except text located inside
    one of ``ignored_tags``, which is copied unchanged.  Tags, entity and
    character references, comments, declarations and processing instructions
    are written back to the output.

    After ``feed()`` or ``close()`` the accumulated markup is available as a
    string in ``self.html``.
    """

    def __init__(self, func, ignored_tags=('a', 'code')):
        """
        :param func: callable taking a text chunk and returning its replacement.
        :param ignored_tags: lower-case tag names whose text content must not
            be passed through ``func``.
        """
        try:
            # Python 3.4+: keep entity/character references as separate
            # events so they are re-emitted verbatim instead of being
            # decoded into the text handed to ``func``.
            HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            # Python 2 (and Python 3 before 3.4) has no such keyword.
            HTMLParser.__init__(self)
        self.func = func
        self.ignored_tags = tuple(ignored_tags)
        # ``is_ignored`` mirrors ``_ignored_depth > 0``; the boolean is kept
        # because it is the attribute this class has always exposed.
        self.is_ignored = False
        self._ignored_depth = 0
        self.html = []

    def handle_starttag(self, tag, attrs):
        self.html.append('<%s%s>' % (tag, self.__html_attrs(attrs)))
        if tag in self.ignored_tags:
            self._ignored_depth += 1
            self.is_ignored = True

    def handle_data(self, data):
        if not self.is_ignored:
            data = self.func(data)
        self.html.append(data)

    def handle_startendtag(self, tag, attrs):
        # A self-closing tag has no content, so it never changes the
        # ignored state.
        self.html.append('<%s%s/>' % (tag, self.__html_attrs(attrs)))

    def handle_endtag(self, tag):
        # Only the end of an ignored tag may re-enable processing; a stray
        # closing tag without a matching start must not drive the depth
        # below zero.
        if tag in self.ignored_tags and self._ignored_depth > 0:
            self._ignored_depth -= 1
            self.is_ignored = self._ignored_depth > 0
        self.html.append('</%s>' % (tag,))

    def handle_entityref(self, name):
        self.html.append('&%s;' % name)

    def handle_charref(self, name):
        # ``name`` is the part after '&#', e.g. '169' or 'xA9'.
        self.html.append('&#%s;' % name)

    def handle_comment(self, data):
        self.html.append('<!--%s-->' % data)

    def handle_decl(self, decl):
        self.html.append('<!%s>' % decl)

    def handle_pi(self, data):
        self.html.append('<?%s>' % data)

    def __html_attrs(self, attrs):
        """Render ``attrs`` (a list of ``(name, value)`` pairs) as tag text.

        Returns an empty string when there are no attributes, otherwise the
        attributes joined by spaces and prefixed with a single space.
        """
        if not attrs:
            return ''
        rendered = []
        for name, value in attrs:
            if value is None:
                # Attribute written without a value, e.g. <input disabled>.
                rendered.append(name)
            else:
                rendered.append('%s="%s"' % (name, _escape_attr(value)))
        return ' %s' % ' '.join(rendered)

    def _open_buffer(self):
        """Turn ``self.html`` back into a list so handlers can append."""
        if not isinstance(self.html, list):
            self.html = [self.html]

    def _seal_buffer(self):
        """Collapse the collected chunks into a single string."""
        self.html = ''.join(self.html)

    def feed(self, data):
        """Parse ``data`` and refresh ``self.html`` with the output so far."""
        self._open_buffer()
        HTMLParser.feed(self, data)
        self._seal_buffer()

    def close(self):
        """Flush text the parser is still buffering into ``self.html``."""
        self._open_buffer()
        HTMLParser.close(self)
        self._seal_buffer()


def urlize(data):
    """
    Urlize plain text links in the HTML contents.

    Do not urlize content of A and CODE tags.
    """
    parser = BaseHTMLParser(django_urlize)
    parser.feed(data)
    # close() must run before the result is read: the parser may hold back a
    # trailing fragment (for example an unterminated '&name') until then.
    parser.close()
    return parser.html


if __name__ == "__main__":
    data = 'test http://ya.ru/ http://ya.ru/'
    print(urlize(data))