pycurl library wrapper: grab.py

import pycurl
import urllib
import copy
import re
import signal
import os
import random
from urlparse import urlsplit
from libpy.html import detect_encoding
# TODO:
# fetching the binary content even with unicode=True fails - PIL couldn't load png file
# fetched with Grab
# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
# Comments: http://curl.haxx.se/mail/curlpython-2005-06/0004.html
try:
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass
except ValueError:
    # Ignore "ValueError: signal only works in main thread",
    # which Python 2.5 raises when this module is imported
    # from a non-main thread
    pass


class Error(pycurl.error):
    """Used to indicate a network error. The same as pycurl.error"""


class SiteError(Error):
    """
    Used to indicate an error of the remote resource.

    It is useful, for example, when we query a server whose name
    cannot be resolved.
    """


def get(url, config=None, soup=False):
    """Simple function for fetching a URL using a Grab instance"""
    curl = Grab()
    curl.setup('url', url)
    if config:
        curl.setup(config)
    curl.run()
    if soup:
        return curl.soup
    else:
        return curl.body
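
# Usage sketch (not part of the original module; the URL is a
# hypothetical example):
#
#     html = get('http://example.com/')
#     page = get('http://example.com/', config={'timeout': 10}, soup=True)
#     print page.title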


class Grab:
    """Fancy wrapper for the pycurl library"""

    def __init__(self):
        self.timeout = 20
        self.logFile = None
        self.config = {}
        self._bodyCallbacks = []
        self.debug = False
        self.lastError = None
        self.freshPostData = False
        self.cookies_map = {}
        self.oldUrl = None
        self.auto_cookies = False
        self.generate_client_profile()
        self.head = ''
        self.body = ''
        self.headers = {}
        self.cookies = {}
        self.unicode = True
        self.encoding = None
        self.use_tidy = False
        self.out_headers = None
        self.max_redirects = 5

    def generate_client_profile(self):
        self.default_headers = {
            'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Accept-Language': 'ru,en-us;q=0.%(x)d,en;q=0.3;%(lang)s' % {
                'x': random.randint(5, 9),
                'lang': random.choice(['ua', 'gb', 'uk'])},
            #'Accept-Encoding': 'gzip,compress;q=0.%(x)d,deflate;q=0' % {'x': random.randint(5, 9)},
            'Accept-Charset': 'utf-8,windows-1251;q=0.%(x)d,*;q=0.%(x)d' % {'x': random.randint(5, 9)},
        }
        #print self.default_headers
        self.default_user_agent = random.choice(useragents)
        #print self.default_user_agent

    def _bodyCallback(self, data):
        """Used to process the answer body"""
        if self.nobody:
            return 0
        else:
            self.body = self.body + data
            if self.maxsize:
                if len(self.body) > self.maxsize:
                    return 0
            if self._bodyCallbacks:
                for callback in self._bodyCallbacks:
                    if not callback(data):
                        return 0
            return len(data)

    def _headCallback(self, data):
        """Used to process answer headers"""
        if self.nohead:
            return 0
        else:
            self.head = self.head + data
            return len(data)

    def _debug_callback(self, type, data):
        # 2 is the numeric value of the CURLINFO_HEADER_OUT debug message
        # type (headers sent to the peer); recent pycurl versions expose
        # it as pycurl.INFOTYPE_HEADER_OUT
        if type == 2:
            self.out_headers = data

    def request(self):
        """Run prepared curl request"""
        self.curl.perform()
        self.curl.close()

    def setup(self, name, value=None):
        """
        Configure the curl request. Argument variants:
        1. name is an option name, value is the option value
        2. name is a dictionary of options, value is None
        """
        if isinstance(name, dict):
            for key, value in name.items():
                self.setup(key, value)
        else:
            if 'post' == name:
                self.freshPostData = True
            self.config[name] = value
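
    # Usage sketch (not part of the original module; option values are
    # hypothetical):
    #
    #     g = Grab()
    #     g.setup('url', 'http://example.com/')        # one option at a time
    #     g.setup({'timeout': 10, 'redirect': True})   # dictionary form
    #     # A body callback can abort the download by returning a falsy value:
    #     g.setup('bodyCallback', lambda chunk: len(g.body) < 100000)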

    def _changeState(self, name, value):
        """
        Configure the internal pycurl instance before a request
        """
        if isinstance(name, int):
            self.curl.setopt(name, value)
        # TODO: is it possible that a dict is passed to _changeState?
        elif isinstance(name, dict):
            for key in name:
                self.setup(key, name[key])
        elif 'post' == name:
            if value:
                self.curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(value))
            else:
                self.curl.setopt(pycurl.HTTPGET, 1)
        elif 'logfile' == name:
            self.logFile = value
        elif 'url' == name:
            self.curl.setopt(pycurl.URL, str(value))
        elif 'proxy' == name:
            if value:
                proxy = value
            else:
                proxy = ''
            self.curl.setopt(pycurl.PROXY, proxy)
        elif 'timeout' == name:
            self.curl.setopt(pycurl.TIMEOUT, value)
        elif 'connect_timeout' == name:
            self.curl.setopt(pycurl.CONNECTTIMEOUT, value)
        elif 'referer' == name:
            self.curl.setopt(pycurl.REFERER, str(value))
        elif 'cookies' == name:
            for cookie_name, cookie_value in value.items():
                self.register_cookie(cookie_name, cookie_value)
        elif 'autocookies' == name:
            pass
        elif 'nobody' == name:
            if True == value:
                self.nobody = True
        elif 'nohead' == name:
            if True == value:
                self.nohead = True
        elif 'maxsize' == name:
            self.maxsize = value
        elif 'redirect' == name:
            self.curl.setopt(pycurl.FOLLOWLOCATION, value)
        elif 'max_redirects' == name:
            self.curl.setopt(pycurl.MAXREDIRS, value)
        elif 'userpwd' == name:
            self.curl.setopt(pycurl.USERPWD, value)
        elif 'bodyCallback' == name:
            if isinstance(value, (list, tuple)):
                self._bodyCallbacks = value
            else:
                self._bodyCallbacks.append(value)
        elif 'user_agent' == name:
            self.curl.setopt(pycurl.USERAGENT, value)
        elif 'headers' == name:
            self.curl.setopt(pycurl.HTTPHEADER,
                             ['%s: %s' % (a, b) for a, b in value.iteritems()])
        elif 'autoreferer' == name:
            if not 'referer' in self.config:
                if not self.oldUrl is None:
                    self.curl.setopt(pycurl.REFERER, str(self.oldUrl))
        elif 'unicode' == name:
            self.unicode = bool(value)
        elif 'use_tidy' == name:
            self.use_tidy = bool(value)
        elif 'gzip' == name:
            self.gzip = value
        elif 'debug' == name:
            self.curl.setopt(pycurl.VERBOSE, value)
        else:
            raise Exception, "unknown option: %s" % name
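
    # Integer option names are passed straight through to curl.setopt,
    # so raw pycurl constants can be mixed with the named options above.
    # A sketch (the interface address is a hypothetical example):
    #
    #     g.setup(pycurl.INTERFACE, '192.168.0.2')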

    def _prepare(self):
        """Prepare for the request"""
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.TIMEOUT, self.timeout)
        self.curl.setopt(pycurl.CONNECTTIMEOUT, self.timeout)
        self.curl.setopt(pycurl.MAXREDIRS, self.max_redirects)
        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.WRITEFUNCTION, self._bodyCallback)
        self.curl.setopt(pycurl.HEADERFUNCTION, self._headCallback)
        self.curl.setopt(pycurl.DEBUGFUNCTION, self._debug_callback)
        #self.curl.setopt(pycurl.VERBOSE, True)
        self.head = ''
        self.body = ''
        self.headers = {}
        self.cookies = {}
        self.maxsize = 0
        self.nobody = False
        self.nohead = False
        self.lastError = ''  # pycurl.CURLE_OK
        self.encoding = None
        if not 'user_agent' in self.config:
            self.config['user_agent'] = self.default_user_agent
        # Set up default headers if they were not given explicitly
        headers = self.config.setdefault('headers', {})
        for header, value in self.default_headers.iteritems():
            if not header in headers:
                headers[header] = value
        if self.config.get('gzip'):
            if not 'Accept-Encoding' in headers:
                headers['Accept-Encoding'] = 'gzip'
        for name, value in self.config.items():
            self._changeState(name, value)
        # If autocookies mode is enabled, use all cookies registered
        # for this domain, else use the cookies given in setup calls (if any)
        cookies = ()
        if self.config.get('autocookies'):
            cookies = self.get_registered_cookies()
        elif self.config.get('cookies'):
            cookies = self.config['cookies']
        if cookies:
            parts = []
            for name, value in cookies.iteritems():
                parts.append('%s=%s;' % (urllib.quote_plus(name),
                                         urllib.quote_plus(value)))
            self.curl.setopt(pycurl.COOKIE, ''.join(parts))
        # If we query a new URL we must reset the old POST data
        # if it was not defined for the new URL, because its value
        # is still stored in self.config
        if self.oldUrl != self.config['url']:
            if not self.freshPostData:
                self.curl.setopt(pycurl.HTTPGET, 1)
        self.freshPostData = False

    def run(self):
        """Do the request"""
        self._prepare()
        try:
            self.curl.perform()
        except pycurl.error, err:
            # CURLE_WRITE_ERROR (23)
            # An error occurred when writing received data to a local file,
            # or an error was returned to libcurl from a write callback.
            # This is an expected error and we should ignore it
            if 23 == err[0]:
                pass
            else:
                self._finish()
                self.lastError = err
                # 6  - could not resolve host
                # 47 - too many redirects
                # 52 - nothing was returned from the server
                # 58 - problem with the local client certificate
                # 59 - couldn't use the specified cipher
                # 60 - problem with the CA cert (path? access rights?)
                if err[0] in (6, 47, 52, 58, 59, 60):
                    raise SiteError, err
                raise Error, err
        self._finish()
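
    # Error-handling sketch (not part of the original module; the host
    # name is a hypothetical example):
    #
    #     g = Grab()
    #     g.setup('url', 'http://no-such-host.example/')
    #     try:
    #         g.run()
    #     except SiteError:
    #         pass  # e.g. the host name could not be resolved (error 6)
    #     except Error:
    #         pass  # any other network error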

    def _finish(self):
        """Process the query result"""
        self.oldUrl = self.config['url']
        if self.maxsize:
            self.body = self.body[0:self.maxsize]
        if self.logFile:
            open(self.logFile, 'w').write(
                self.config['url'] + '\n' +
                self.curl.errstr() + '\n' +
                self.head + '\n' + self.body)
        for line in re.split('\r?\n', self.head):
            try:
                name, value = line.split(': ', 1)
                if 'Set-Cookie' == name:
                    match = re.search('^([^=]+)=([^;]*)', value)
                    if match:
                        self.cookies[match.group(1)] = match.group(2)
                else:
                    self.headers[name] = value
            except ValueError:
                pass
        for name, value in self.cookies.iteritems():
            self.register_cookie(name, value)
        if self.headers.get('Content-Encoding') == 'gzip':
            import StringIO
            import gzip
            gzipper = gzip.GzipFile(fileobj=StringIO.StringIO(self.body))
            self.body = gzipper.read()
        if self.unicode:
            self.decode_body()
        if self.use_tidy:
            if not self.unicode:
                raise Exception('the `use_tidy` option requires the `unicode` option, but it is off now')
            else:
                self.apply_tidy()
        #self.curl.close()

    def decode_body(self):
        encoding = detect_encoding(self.body, headers=self.headers)
        self.encoding = encoding
        if encoding:
            self.body = self.body.decode(encoding)
        else:
            # TODO: choose the proper way of handling the case of unknown encoding
            raise Exception('Could not determine encoding')
            #self.body = self.body.decode('utf-8', 'ignore')

    def apply_tidy(self):
        import tidy
        self.original_body = self.body
        data = self.body.encode('utf-8')
        options = dict(
            output_xhtml=1,
            show_body_only=0,
            force_output=1,
            char_encoding='utf8')
        data = str(tidy.parseString(data, **options))
        self.body = data.decode('utf-8')

    def getinfo(self, key):
        return self.curl.getinfo(getattr(pycurl, key))

    def errstr(self):
        """Get the request error text"""
        return self.curl.errstr()

    def getConfig(self, name):
        try:
            return self.config[name]
        except KeyError:
            return ''

    def code(self):
        return self.getinfo('RESPONSE_CODE')

    def get_current_host(self):
        # Note that this keeps only the last component of the domain name
        # (e.g. 'com' for 'example.com'), so cookies are registered with
        # rather coarse granularity
        domain = urlsplit(self.config['url'])[1]
        host = domain.rsplit('.', 1)[-1]
        return host

    def register_cookie(self, name, value):
        self.cookies_map.setdefault(self.get_current_host(), {})[name] = value

    def get_registered_cookies(self):
        return self.cookies_map.get(self.get_current_host(), {})

    @property
    def soup(self):
        from BeautifulSoup import BeautifulSoup
        return BeautifulSoup(self.body)
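
# End-to-end sketch (not part of the original module; URLs, form fields
# and option values are hypothetical):
#
#     g = Grab()
#     g.setup({'autocookies': True, 'gzip': True})
#     g.setup('url', 'http://example.com/login')
#     g.setup('post', {'user': 'bob', 'password': 'secret'})
#     g.run()
#     print g.code(), g.encoding
#     # Cookies set by the server were registered and are sent again
#     # on the next request to the same host:
#     g.setup('url', 'http://example.com/profile')
#     g.run()
#     print g.soup.title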


useragents = (
    'Mozilla/4.0 (compatible; MSIE 6.0; MSN 2.5; Windows 98)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 1.1.4322; Media Center PC 4.0; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Win32)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; Arcor 5.005; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; YPC 3.0.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.8) Gecko/20050511',
    'Mozilla/5.0 (X11; U; Linux i686; cs-CZ; rv:1.7.12) Gecko/20050929',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
    'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.7.8) Gecko/20050609 Firefox/1.0.4',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.9) Gecko/20050711 Firefox/1.0.5',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.10) Gecko/20050716 Firefox/1.0.6',
    'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl; rv:1.8) Gecko/20051107 Firefox/1.5',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.1) Gecko/20060111 Firefox/1.5.0.1',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.2) Gecko/20060308 Firefox/1.5.0.2',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
    'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.6) Gecko/20060808 Fedora/1.5.0.6-2.fc5 Firefox/1.5.0.6 pango-text',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.0.7) Gecko/20060909 Firefox/1.5.0.7',
    'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.8.1) Gecko/20060601 Firefox/2.0 (Ubuntu-edgy)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20070220 Firefox/2.0.0.2',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20070221 SUSE/2.0.0.2-6.1 Firefox/2.0.0.2',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9',
    'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9',
    'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.9a1) Gecko/20061204 GranParadiso/3.0a1',
    'Opera/8.0 (X11; Linux i686; U; cs)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.50',
    'Mozilla/5.0 (Windows NT 5.1; U; en) Opera 8.50',
    'Opera/8.51 (Windows NT 5.1; U; en)',
    'Opera/9.0 (Windows NT 5.1; U; en)',
    'Opera/9.01 (X11; Linux i686; U; en)',
    'Opera/9.02 (Windows NT 5.1; U; en)',
    'Opera/9.10 (Windows NT 5.1; U; en)',
    'Opera/9.23 (Windows NT 5.1; U; ru)',
)