html2text parser in Python

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import re

# Patterns used by html2txt(); compiled once at import time.

# Opening <p ...> and <tr ...> tags, case-insensitive.  The \b after the tag
# name stops longer names that merely share the prefix (<pre>, <param>,
# <track>) from being treated as paragraph/row breaks.
p = re.compile(r'(<p\b.*?>)|(<tr\b.*?>)', re.I)

# Opening <td ...> tags, case-insensitive.
t = re.compile(r'<td\b.*?>', re.I)

# HTML comments.  DOTALL lets '.' cross line breaks so that a comment
# spanning several lines is matched as a whole.
comm = re.compile(r'<!--.*?-->', re.S)

# Any tag: the shortest run from '<' to the next '>' on the same line.
tags = re.compile(r'<.*?>')
def html2txt(s, hint='entity', code='ISO-8859-1'):
    """Convert an HTML string to raw text.

    Processing steps, in order:
      1. remove every line break (CR and LF) from the input;
      2. remove HTML comments;
      3. replace each opening <p> / <tr> tag with a newline;
      4. replace each opening <td> tag with a tab;
      5. remove all remaining tags;
      6. collapse each run of spaces into a single space.

    Relies on the module-level compiled patterns ``p``, ``t``, ``comm`` and
    ``tags`` and on the ``re`` module.

    Args:
        s: the HTML source as a string.
        hint: currently unused (entity handling is not implemented).
        code: currently unused (entity handling is not implemented).

    Returns:
        The text with tags and comments stripped.  Character entities such
        as ``&amp;`` are left untouched.
    """
    # Source line breaks carry no meaning in HTML; drop them so the only
    # newlines in the result are the ones produced from <p>/<tr> below.
    s = s.replace('\r', '').replace('\n', '')
    # Strip comments before any tag is rewritten: converting a <p> or <tr>
    # that sits inside a comment would plant a newline in the comment, and
    # the comment body could then leak into the output.
    s = comm.sub('', s)
    s = p.sub('\n', s)    # paragraph / table row -> newline
    s = t.sub('\t', s)    # table cell -> tab
    s = tags.sub('', s)   # drop every remaining tag
    # Collapse runs of spaces only; the newlines and tabs inserted above
    # are kept as they are.
    s = re.sub(' +', ' ', s)
    # NOTE: entity handling is not implemented; `hint` and `code` are
    # accepted but ignored.
    return s