def getMeta self global detector title keywords descriptions data self

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def getMeta(self):
global detector
title = keywords = descriptions = ''
data = self.data
try:
codepage=re.findall("""charset=([a-zA-Z0-9\-]+)""", self.CONTENT_TYPE)
if codepage:
data=data.decode(codepage[0])
else:
codepage=re.findall("""charset=([a-zA-Z0-9\-]+)""", data)
if codepage:
data=data.decode(codepage[0])
else:
detector.feed(data)
detector.reset()
data=data.decode(detector.result["encoding"])
except:
try:
detector.feed(data)
detector.reset()
data=data.decode(detector.result["encoding"])
except:
cursor.execute("""delete from getmymeta where id=%s""",self.urserData['ID'])
print type(data)
try:
html = fromstring(data)
except:
print data
sys.exit()
results=html.xpath('//title')
if results:
title = results[0].text_content().title()
results=html.xpath('//meta[@name="keywords"]')
if results:
keywords = results[0].get('content')
results=html.xpath('//meta[@name="description"]')
if results:
descriptions = results[0].get('content')
title=title.encode('utf-8')
keywords=keywords.encode('utf-8')
descriptions=descriptions.encode('utf-8')
cursor.execute("update getmymeta set title=%s, keywords=%s, description=%s, iswork='1' where id=%s", (title,keywords,descriptions,self.urserData['ID']))
return
<type 'unicode'>
<type 'unicode'>
<type 'unicode'>
<type 'unicode'>
<type 'unicode'>
<type 'unicode'>
<type 'unicode'>
<type 'unicode'>
<type 'unicode'>
<type 'unicode'>
<type 'unicode'>
<type 'unicode'>
Traceback (most recent call last):
File "./parser.py", line 98, in ?
mcurl.fetch()
File "/var/www/wuoru/wuo.ru/www/mcurl.py", line 151, in fetch
c.poolData['CALLBACK'](c)
File "./parser.py", line 69, in getMeta
html = fromstring(data)
File "/usr/lib/python2.4/site-packages/lxml-2.2-py2.4-linux-i686.egg/lxml/html/__init__.py", line 603, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File "/usr/lib/python2.4/site-packages/lxml-2.2-py2.4-linux-i686.egg/lxml/html/__init__.py", line 511, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "lxml.etree.pyx", line 2534, in lxml.etree.fromstring (src/lxml/lxml.etree.c:51135)
File "parser.pxi", line 1514, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:76076)
ValueError: Unicode strings with encoding declaration are not supported.