"""Thin convenience wrapper around pycurl for fetching HTTP resources."""

import pycurl
import urllib
import copy
import re

from libpy.html import detect_encoding

# TODO:
# Document why SIGPIPE is being ignored below.
# Improve cookie support, maybe by using a third-party library.

# The following piece of code was copied from elsewhere.
# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    # Raised by the `from signal import ...` line when the platform's
    # signal module does not define SIGPIPE.
    pass


class Error(pycurl.error):
    """Used to indicate a network error. The same as pycurl.error."""


class SiteError(Error):
    """
    Used to indicate an error of the remote resource.

    It is useful, for example, when we query a server whose name
    can not be resolved.
    """


def get(url, config=None):
    """
    Fetch `url` with a fresh Grab instance and return the response body.

    `config`, when truthy, is a dict of extra options passed to
    Grab.setup() before the request is run.
    """
    curl = Grab()
    curl.setup('url', url)
    if config:
        curl.setup(config)
    curl.run()
    return curl.body


class Grab:
    """Fancy wrapper for the pycurl library."""

    def __init__(self):
        self.timeout = 20
        self.logFile = None
        # Options collected by setup(); applied to a new pycurl handle
        # at the start of every run().
        self.config = {}
        self._bodyCallbacks = []
        self.debug = False
        self.lastError = None
        # Flags telling _prepare() whether post data / cookies were set
        # explicitly since the previous request.
        self.freshPostData = False
        self.freshCookies = False
        self.oldUrl = None
        self.autoCookies = False
        self.default_headers = [
            'Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Accept-Language: ru,en-us;q=0.7,en;q=0.3',
            #'Accept-Encoding: gzip,deflate',
            'Accept-Charset: utf-8,windows-1251;q=0.7,*;q=0.7']
        self.default_user_agent = 'Mozilla/5.0 (X11; U; Linux i686; ru; rv:1.8.1.5) Gecko/20070806 Firefox/2.0.0.5'
        self.head = ''
        self.body = ''
        self.headers = {}
        self.cookies = {}
        self.unicode = True
        self.encoding = None
        self.use_tidy = False
        # Per-request state; (re)initialised by _prepare() as well, but
        # defined here so the callbacks never hit a missing attribute.
        self.curl = None
        self.maxsize = 0
        self.nobody = False
        self.nohead = False

    def _bodyCallback(self, data):
        """
        Used to process the answer body.

        Returns len(data) to continue the transfer. Returns 0 when the
        body is not wanted, the size limit is exceeded or a registered
        callback returns a false value; run() treats the resulting
        pycurl error 23 as expected.
        """
        if self.nobody:
            return 0
        self.body = self.body + data
        if self.maxsize and len(self.body) > self.maxsize:
            return 0
        for callback in self._bodyCallbacks:
            if not callback(data):
                return 0
        return len(data)

    def _headCallback(self, data):
        """Used to process the answer headers."""
        # NOTE(review): returning 0 here is a short write as well, so
        # `nohead` presumably aborts the whole transfer - confirm intent.
        if self.nohead:
            return 0
        self.head = self.head + data
        return len(data)

    def request(self):
        """Run the prepared curl request and close the handle."""
        self.curl.perform()
        self.curl.close()

    def setup(self, name, value=None):
        """
        Configure the curl request.

        Argument variants:
        1. name - option name, value - option value
        2. name is a dictionary of options, value is ignored
        """
        if isinstance(name, dict):
            for key, item in name.items():
                self.setup(key, item)
            return
        if 'post' == name:
            self.freshPostData = True
        if 'cookies' == name:
            self.freshCookies = True
        self.config[name] = value

    def _changeState(self, name, value):
        """
        Configure the internal pycurl instance before the request.

        `name` is either a raw pycurl option constant (int), a dict of
        options, or one of the string option names handled below.
        Raises Exception for an unknown string option.
        """
        if isinstance(name, int):
            # Raw pycurl option: pass it straight through. This is part
            # of the same chain as the named options so that it no
            # longer falls through to the "unknown option" error.
            self.curl.setopt(name, value)
        # TODO: is it possible that a dict is passed to _changeState?
        elif isinstance(name, dict):
            for key in name:
                self.setup(key, name[key])
        elif 'post' == name:
            if value:
                self.curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(value))
            else:
                self.curl.setopt(pycurl.HTTPGET, 1)
        elif 'cookiefile' == name:
            # Read cookies from and store them to the same file.
            self.curl.setopt(pycurl.COOKIEJAR, value)
            self.curl.setopt(pycurl.COOKIEFILE, value)
        elif 'logfile' == name:
            self.logFile = value
        elif 'url' == name:
            self.curl.setopt(pycurl.URL, str(value))
        elif 'proxy' == name:
            self.curl.setopt(pycurl.PROXY, value or '')
        elif 'timeout' == name:
            self.curl.setopt(pycurl.TIMEOUT, value)
        elif 'connect_timeout' == name:
            self.curl.setopt(pycurl.CONNECTTIMEOUT, value)
        elif 'referer' == name:
            self.curl.setopt(pycurl.REFERER, str(value))
        elif 'cookies' == name:
            self.curl.setopt(pycurl.COOKIE, ''.join(
                urllib.quote_plus(key) + '=' + urllib.quote_plus(val) + ';'
                for key, val in value.items()))
        elif 'nobody' == name:
            if True == value:
                self.nobody = True
        elif 'nohead' == name:
            if True == value:
                self.nohead = True
        elif 'maxsize' == name:
            self.maxsize = value
        elif 'redirect' == name:
            self.curl.setopt(pycurl.FOLLOWLOCATION, value)
        elif 'userpwd' == name:
            self.curl.setopt(pycurl.USERPWD, value)
        elif 'bodyCallback' == name:
            # Accept either a single callable or a sequence of them.
            if isinstance(value, (list, tuple)):
                self._bodyCallbacks = list(value)
            else:
                self._bodyCallbacks.append(value)
        elif 'autocookies' == name:
            self.autoCookies = value
        elif 'user-agent' == name:
            self.curl.setopt(pycurl.USERAGENT, value)
        elif 'headers' == name:
            self.curl.setopt(pycurl.HTTPHEADER, value)
        elif 'autoreferer' == name:
            # NOTE(review): the option value is ignored; the mere
            # presence of the option enables the behaviour.
            if 'referer' not in self.config:
                if self.oldUrl is not None:
                    self.curl.setopt(pycurl.REFERER, str(self.oldUrl))
        elif 'unicode' == name:
            self.unicode = bool(value)
        elif 'use_tidy' == name:
            self.use_tidy = bool(value)
        else:
            raise Exception("unknown option: %s" % name)

    def _prepare(self):
        """Create a new pycurl handle and apply the stored options."""
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.TIMEOUT, self.timeout)
        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.WRITEFUNCTION, self._bodyCallback)
        self.curl.setopt(pycurl.HEADERFUNCTION, self._headCallback)

        self.head = ''
        self.body = ''
        self.headers = {}
        self.cookies = {}
        self.maxsize = 0
        self.nobody = False
        self.nohead = False
        self.lastError = ''  # pycurl.CURLE_OK
        self.encoding = None
        # Callbacks are re-registered from self.config below; start from
        # an empty list so they do not pile up across requests.
        self._bodyCallbacks = []

        self.config.setdefault('user-agent', self.default_user_agent)

        # Build the effective header list for this request only, so the
        # defaults are not inserted into self.config['headers'] again on
        # every run(). The defaults keep the order the original
        # insert(0, ...) loop produced (reversed, before user headers).
        settings = dict(self.config)
        settings['headers'] = (list(reversed(self.default_headers)) +
                               list(self.config.get('headers') or []))

        for name, value in settings.items():
            self._changeState(name, value)

        # If we query a new url we must reset the old post and cookie
        # information if it was not defined for the new url, because
        # the old values are still stored in self.config.
        if self.oldUrl != self.config['url']:
            if not self.freshPostData:
                self.curl.setopt(pycurl.HTTPGET, 1)
            if not self.freshCookies:
                self.curl.setopt(pycurl.COOKIE, '')

        self.freshPostData = False
        self.freshCookies = False

    def run(self):
        """
        Do the request.

        Raises SiteError for failures attributed to the remote resource
        and Error for any other pycurl failure. The original
        pycurl.error instance is the single argument of the raised
        exception.
        """
        self._prepare()
        try:
            self.curl.perform()
        except pycurl.error as err:
            code = err.args[0]
            # 23 - CURLE_WRITE_ERROR
            # An error occurred when writing received data to a local
            # file, or an error was returned to libcurl from a write
            # callback. This is an expected error (our callbacks return
            # 0 on purpose) and we should ignore it.
            if code != 23:
                self._finish()
                self.lastError = err
                # 6 - could not resolve host
                # 47 - too many redirects
                # 52 - nothing was returned from the server
                # 58 - problem with the local client certificate
                # 59 - couldn't use specified cipher
                # 60 - problem with the CA cert (path? access rights?)
                if code in (6, 47, 52, 58, 59, 60):
                    raise SiteError(err)
                raise Error(err)
        self._finish()

    def _finish(self):
        """Process the query result: log, parse headers, decode body."""
        self.oldUrl = self.config['url']
        if self.maxsize:
            self.body = self.body[0:self.maxsize]
        if self.logFile:
            # Close the log file deterministically.
            with open(self.logFile, 'w') as log:
                log.write(
                    self.config['url'] + '\n' +
                    self.curl.errstr() + '\n' +
                    self.head + '\n' + self.body)
        for line in re.split(r'\r?\n', self.head):
            try:
                name, value = line.split(': ', 1)
            except ValueError:
                # Status line, blank line or malformed header.
                continue
            # HTTP header names are case-insensitive.
            if 'set-cookie' == name.lower():
                # `[^;]*` (not `([^;]+)*`) so that an empty cookie value
                # is captured as '' instead of None.
                match = re.search(r'^([^=]+)=([^;]*)', value)
                if match:
                    self.cookies[match.group(1)] = match.group(2)
            else:
                self.headers[name] = value

        if self.autoCookies:
            self.setup('cookies', self.cookies)

        if self.unicode:
            self.decode_body()
        if self.use_tidy:
            if not self.unicode:
                raise Exception('`use_tidy` options requires `unicode` option but it is off now')
            else:
                self.apply_tidy()
        # The handle is deliberately left open so that getinfo(), code()
        # and errstr() keep working after run().
        #self.curl.close()

    def decode_body(self):
        """
        Decode self.body using the encoding found by detect_encoding().

        Stores the detected encoding in self.encoding. Raises Exception
        when no encoding could be determined.
        """
        encoding = detect_encoding(self.body, headers=self.headers)
        self.encoding = encoding
        if encoding:
            self.body = self.body.decode(encoding)
        else:
            # TODO: choose the proper way for handling case of unknown encoding
            raise Exception('Could not determine encoding')
            #self.body = self.body.decode('utf-8', 'ignore')

    def apply_tidy(self):
        """
        Run the decoded body through tidy.

        The untouched body is kept in self.original_body.
        """
        import tidy
        self.original_body = self.body
        data = self.body.encode('utf-8')
        options = dict(
            output_xhtml=1,
            show_body_only=0,
            force_output=1,
            char_encoding='utf8')
        data = str(tidy.parseString(data, **options))
        self.body = data.decode('utf-8')

    def getinfo(self, key):
        """Return curl.getinfo() for the pycurl constant named `key`."""
        return self.curl.getinfo(getattr(pycurl, key))

    def errstr(self):
        """Return the error text of the last request."""
        return self.curl.errstr()

    def getConfig(self, name):
        """Return the stored option `name`, or '' when it is not set."""
        return self.config.get(name, '')

    def code(self):
        """Return the response code reported by curl for the last request."""
        return self.getinfo('RESPONSE_CODE')