html processing utilities

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# -*- coding: utf-8 -*-
import re
import htmlentitydefs
# Matches one tag: "<", at least one character other than ">", then ">".
re_tag = re.compile(r'<[^>]+>', re.S)


def strip_tags(data):
    """Return ``data`` with every tag replaced by a single space."""
    without_tags = re.sub(re_tag, ' ', data)
    return without_tags
def decode_entities(data, encoding=None):
    """Decode HTML entities such as ``&nbsp;`` or ``&#169;`` into characters.

    Named (``&amp;``), decimal (``&#38;``) and hexadecimal (``&#x26;``)
    references are resolved in a single pass, so text produced by one
    replacement is never decoded a second time (``&amp;#65;`` becomes
    ``&#65;``, not ``A``).  Unknown names and out-of-range code points are
    left exactly as they appear in the input.

    Args:
        data: Text, or a byte string.
        encoding: Encoding of ``data`` when it is a byte string.  When
            omitted, a byte string is decoded as UTF-8 first and text is
            returned.  When given, a byte string stays a byte string and
            each decoded character is encoded with ``encoding``; characters
            that encoding cannot represent keep their entity form.

    Returns:
        ``data`` with entities replaced, or ``''`` when ``data`` is a byte
        string with no declared encoding that is not valid UTF-8.
    """
    # Local lookups keep the function usable on both Python 2 and Python 3.
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr

    if not encoding and isinstance(data, bytes):
        try:
            data = data.decode('utf-8')
        except UnicodeDecodeError:
            print('data encoding is not unicode neither utf-8')
            return ''

    # From here on a byte string implies that an encoding was supplied.
    as_bytes = isinstance(data, bytes)

    def replace(match):
        original = match.group(0)
        reference = match.group(1)
        if as_bytes:
            # The pattern only admits ASCII characters in the reference.
            reference = reference.decode('ascii')
        try:
            if reference[:2] in ('#x', '#X'):
                char = to_char(int(reference[2:], 16))
            elif reference[:1] == '#':
                char = to_char(int(reference[1:]))
            else:
                char = to_char(name2codepoint[reference])
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name or code point outside the valid range.
            return original
        if not as_bytes:
            return char
        try:
            return char.encode(encoding)
        except UnicodeEncodeError:
            # The target encoding has no such character; keep the entity.
            return original

    pattern = r'&(#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);'
    if as_bytes:
        # The pattern type must match the subject type on Python 3.
        pattern = pattern.encode('ascii')
    return re.sub(pattern, replace, data)
#def decode_html(data):
#"""Do some things to make nice html"""
#data = decode_entities(data)
#return data
def html2text(data):
    """Convert HTML to plain text.

    ``<br>`` tags (any case, optionally self-closed) become newlines, all
    remaining tags are stripped, entities are decoded and the result is
    trimmed of leading and trailing whitespace.
    """
    line_break = re.compile(r'<br\s*/?>', re.I)
    text = decode_entities(strip_tags(line_break.sub('\n', data)))
    return text.strip()
def detect_encoding(data, encoding=None, curl=None, headers=None):
    """Guess an encoding that decodes ``data`` without errors.

    Candidates are tried in this order and the first one that decodes the
    whole of ``data`` wins:

    1. the charset declared in the ``Content-Type`` header, taken from
       ``headers`` or, when ``headers`` is empty, from ``curl.headers``;
    2. the explicit ``encoding`` argument;
    3. the first ``charset=...`` declaration found inside ``data``;
    4. the fallbacks ``utf-8``, ``windows-1251`` and ``koi8-r``.

    UTF-8 is the first fallback because it is the only strict one: the
    single-byte code pages accept (almost) any byte sequence, so placing
    them first would make UTF-8 unreachable.

    Args:
        data: Byte string holding the document.
        encoding: Optional encoding name suggested by the caller.
        curl: Optional object whose ``headers`` attribute supports ``get``.
        headers: Optional mapping of response headers supporting ``get``.

    Returns:
        The name of the first candidate that decodes ``data``, or ``None``
        when none of them does.
    """
    candidates = []
    if encoding:
        candidates.append(encoding)

    content_type = None
    if headers:
        content_type = headers.get('Content-Type', '')
    elif curl:
        content_type = curl.headers.get('Content-Type', '')
    if content_type:
        # Content-Type carries "charset="; "encoding=" is still accepted
        # because that is what this function originally looked for.
        found = re.search(r'(?:charset|encoding)\s*=\s*["\']?([\w.:-]+)',
                          content_type, re.I)
        if found:
            # A transport-level declaration outranks the caller's hint.
            candidates.insert(0, found.group(1))

    # Declaration inside the document: both the http-equiv form
    # (content="text/html; charset=x") and <meta charset="x">.  The
    # lookbehind skips attributes such as accept-charset.
    meta_pattern = r'(?<![\w-])charset\s*=\s*["\']?([\w.:-]+)'
    if isinstance(data, bytes):
        # The pattern type must match the subject type on Python 3.
        meta_pattern = meta_pattern.encode('ascii')
    found = re.search(meta_pattern, data, re.I)
    if found:
        name = found.group(1)
        if not isinstance(name, str):
            name = name.decode('ascii')
        candidates.append(name)

    # TODO: optionally consult chardet before falling back to the defaults.
    candidates.extend(['utf-8', 'windows-1251', 'koi8-r'])

    for candidate in candidates:
        try:
            data.decode(candidate)
        except (LookupError, ValueError, TypeError):
            # Unknown codec name or bytes invalid for it: try the next one.
            continue
        return candidate
    return None