A wrapper for pycurl

import pycurl
import urllib
import copy
import re

from libpy.html import detect_encoding

# TODO:
# WTF is the SIGPIPE ignoring?
# improve cookies support, maybe use a 3rd party library

# The following piece of code was taken from somewhere...
# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass

class Error(pycurl.error):
    """Used to indicate a network error. The same as pycurl.error"""


class SiteError(Error):
    """
    Used to indicate an error of the remote resource.
    It is useful, for example, when we query a server whose name can not
    be resolved.
    """

def get(url, config=None):
    """Simple function for fetching a url using a Grab instance"""
    curl = Grab()
    curl.setup('url', url)
    if config:
        curl.setup(config)
    curl.run()
    return curl.body
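
# Usage sketch for get() (the URL and the option dict below are placeholders;
# any option understood by Grab.setup() can be passed):
#
#     html = get('http://example.com/', {'timeout': 10})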

class Grab:
    """Fancy wrapper for the pycurl library"""

    def __init__(self):
        self.timeout = 20
        self.logFile = None
        self.config = {}
        self._bodyCallbacks = []
        self.debug = False
        self.lastError = None
        self.freshPostData = False
        self.freshCookies = False
        self.oldUrl = None
        self.autoCookies = False
        self.default_headers = [
            'Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Accept-Language: ru,en-us;q=0.7,en;q=0.3',
            #'Accept-Encoding: gzip,deflate',
            'Accept-Charset: utf-8,windows-1251;q=0.7,*;q=0.7']
        self.default_user_agent = 'Mozilla/5.0 (X11; U; Linux i686; ru; rv:1.8.1.5) Gecko/20070806 Firefox/2.0.0.5'
        self.head = ''
        self.body = ''
        self.headers = {}
        self.cookies = {}
        self.unicode = True
        self.encoding = None
        self.use_tidy = False

    def _bodyCallback(self, data):
        """Used to process the answer body"""
        if self.nobody:
            return 0
        else:
            self.body = self.body + data
            if self.maxsize:
                if len(self.body) > self.maxsize:
                    return 0
            if self._bodyCallbacks:
                for callback in self._bodyCallbacks:
                    if not callback(data):
                        return 0
            return len(data)

    def _headCallback(self, data):
        """Used to process the answer headers"""
        if self.nohead:
            return 0
        else:
            self.head = self.head + data
            return len(data)

    def request(self):
        """Run the prepared curl request"""
        self.curl.perform()
        self.curl.close()

    def setup(self, name, value=None):
        """
        Configure the curl request. Argument variants:
        1. name is an option name, value is an option value
        2. name is a dictionary, value is None
        """
        if isinstance(name, dict):
            for key, value in name.items():
                self.setup(key, value)
        else:
            if 'post' == name:
                self.freshPostData = True
            if 'cookies' == name:
                self.freshCookies = True
            self.config[name] = value

    def _changeState(self, name, value):
        """
        Configure the internal pycurl instance before the request
        """
        if isinstance(name, int):
            self.curl.setopt(name, value)
        # TODO: is it possible that a dict is passed to _changeState?
        elif isinstance(name, dict):
            for key in name:
                self.setup(key, name[key])
        elif 'post' == name:
            if value:
                self.curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(value))
            else:
                self.curl.setopt(pycurl.HTTPGET, 1)
        elif 'cookiefile' == name:
            self.curl.setopt(pycurl.COOKIEJAR, value)
            self.curl.setopt(pycurl.COOKIEFILE, value)
        elif 'logfile' == name:
            self.logFile = value
        elif 'url' == name:
            self.curl.setopt(pycurl.URL, str(value))
        elif 'proxy' == name:
            #print 'Using proxy: %s' % value
            if value:
                proxy = value
            else:
                proxy = ''
            self.curl.setopt(pycurl.PROXY, proxy)
        elif 'timeout' == name:
            self.curl.setopt(pycurl.TIMEOUT, value)
        elif 'connect_timeout' == name:
            self.curl.setopt(pycurl.CONNECTTIMEOUT, value)
        elif 'referer' == name:
            self.curl.setopt(pycurl.REFERER, str(value))
        elif 'cookies' == name:
            self.curl.setopt(pycurl.COOKIE, ''.join(
                urllib.quote_plus(a) + '=' + urllib.quote_plus(b) + ';'
                for a, b in value.items()))
        elif 'nobody' == name:
            if True == value:
                self.nobody = True
        elif 'nohead' == name:
            if True == value:
                self.nohead = True
        elif 'maxsize' == name:
            self.maxsize = value
        elif 'redirect' == name:
            self.curl.setopt(pycurl.FOLLOWLOCATION, value)
        elif 'userpwd' == name:
            self.curl.setopt(pycurl.USERPWD, value)
        elif 'bodyCallback' == name:
            if isinstance(value, (list, tuple)):
                self._bodyCallbacks = value
            else:
                self._bodyCallbacks.append(value)
        elif 'autocookies' == name:
            self.autoCookies = value
        elif 'user-agent' == name:
            self.curl.setopt(pycurl.USERAGENT, value)
        elif 'headers' == name:
            self.curl.setopt(pycurl.HTTPHEADER, value)
        elif 'autoreferer' == name:
            if 'referer' not in self.config:
                if self.oldUrl is not None:
                    self.curl.setopt(pycurl.REFERER, str(self.oldUrl))
        elif 'unicode' == name:
            self.unicode = bool(value)
        elif 'use_tidy' == name:
            self.use_tidy = bool(value)
        else:
            raise Exception, "unknown option: %s" % name

    def _prepare(self):
        """Prepare for the request"""
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.TIMEOUT, self.timeout)
        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.WRITEFUNCTION, self._bodyCallback)
        self.curl.setopt(pycurl.HEADERFUNCTION, self._headCallback)
        self.head = ''
        self.body = ''
        self.headers = {}
        self.cookies = {}
        self.maxsize = 0
        self.nobody = False
        self.nohead = False
        self.lastError = ''  # pycurl.CURLE_OK
        self.encoding = None
        if 'user-agent' not in self.config:
            self.config['user-agent'] = self.default_user_agent
        headers = self.config.setdefault('headers', [])
        for header in self.default_headers:
            # do not add the default headers twice on repeated requests
            if header not in headers:
                headers.insert(0, header)
        for name, value in self.config.items():
            self._changeState(name, value)
        # If we query a new url we must reset the old post and cookie data
        # if they were not defined for the new url, because their values
        # are still stored in self.config
        if self.oldUrl != self.config['url']:
            if not self.freshPostData:
                self.curl.setopt(pycurl.HTTPGET, 1)
            if not self.freshCookies:
                self.curl.setopt(pycurl.COOKIE, '')
        self.freshPostData = False
        self.freshCookies = False

    def run(self):
        """Do the request"""
        self._prepare()
        try:
            self.curl.perform()
        except pycurl.error, err:
            # CURLE_WRITE_ERROR
            # An error occurred when writing received data to a local file, or
            # an error was returned to libcurl from a write callback.
            # This is an expected error and we should ignore it
            if 23 == err[0]:
                pass
            else:
                self._finish()
                self.lastError = err
                # 6 - could not resolve host
                # 47 - too many redirects
                # 52 - nothing was returned from the server
                # 58 - problem with the local client certificate
                # 59 - couldn't use specified cipher
                # 60 - problem with the CA cert (path? access rights?)
                if err[0] in (6, 47, 52, 58, 59, 60):
                    raise SiteError, err
                raise Error, err
        self._finish()

    def _finish(self):
        """Process the query result"""
        self.oldUrl = self.config['url']
        if self.maxsize:
            self.body = self.body[0:self.maxsize]
        if self.logFile:
            open(self.logFile, 'w').write(
                self.config['url'] + '\n' +
                self.curl.errstr() + '\n' +
                self.head + '\n' + self.body)
        for line in re.split('\r?\n', self.head):
            try:
                name, value = line.split(': ', 1)
                if 'Set-Cookie' == name:
                    match = re.search('^([^=]+)=([^;]*)', value)
                    if match:
                        self.cookies[match.group(1)] = match.group(2)
                else:
                    self.headers[name] = value
            except ValueError:
                pass
        if self.autoCookies:
            self.setup('cookies', self.cookies)
        if self.unicode:
            self.decode_body()
        if self.use_tidy:
            if not self.unicode:
                raise Exception('The `use_tidy` option requires the `unicode` option, which is off')
            else:
                self.apply_tidy()
        #self.curl.close()

    def decode_body(self):
        """Convert the answer body to unicode using the detected encoding"""
        encoding = detect_encoding(self.body, headers=self.headers)
        self.encoding = encoding
        if encoding:
            self.body = self.body.decode(encoding)
        else:
            # TODO: choose the proper way of handling the case of unknown encoding
            raise Exception('Could not determine encoding')
            #self.body = self.body.decode('utf-8', 'ignore')

    def apply_tidy(self):
        """Pass the body through the tidy library to get well-formed XHTML"""
        import tidy
        self.original_body = self.body
        data = self.body.encode('utf-8')
        options = dict(
            output_xhtml=1,
            show_body_only=0,
            force_output=1,
            char_encoding='utf8')
        data = str(tidy.parseString(data, **options))
        self.body = data.decode('utf-8')

    def getinfo(self, key):
        """Return pycurl info identified by its constant name, e.g. 'RESPONSE_CODE'"""
        return self.curl.getinfo(getattr(pycurl, key))

    def errstr(self):
        """Get the request error text"""
        return self.curl.errstr()

    def getConfig(self, name):
        """Return a config option value, or an empty string if it is not set"""
        try:
            return self.config[name]
        except KeyError:
            return ''

    def code(self):
        """Return the HTTP response code of the last request"""
        return self.getinfo('RESPONSE_CODE')
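
A minimal usage sketch (an illustration, not part of the module: it assumes the code above is saved as grab.py next to the libpy package it imports, and uses example.com as a placeholder URL):

    from grab import Grab, Error, SiteError

    g = Grab()
    g.setup({
        'url': 'http://example.com/',
        'timeout': 10,
        'autocookies': True,
    })
    try:
        g.run()
    except SiteError:
        print 'The remote resource is not available'
    except Error:
        print 'Network error'
    else:
        print g.code(), g.headers.get('Content-Type', ''), len(g.body)

Since SiteError is a subclass of Error, it has to be caught first; both wrap the original pycurl.error with its code and message.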