import pycurl
import urllib
import copy
import re
import signal
import os
import random
from urlparse import urlsplit
from libpy.html import detect_encoding
# TODO:
# fetching the binary content even with unicode=True fails - PIL couldn't load png file
# fetched with Grab
# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
# Comments: http://curl.haxx.se/mail/curlpython-2005-06/0004.html
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    # Best-effort: stop SIGPIPE from killing the process when libcurl
    # writes to a closed connection (required with pycurl.NOSIGNAL).
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    # Platform without the signal module (or without SIGPIPE)
    pass
except ValueError:
    # Do this to ignore ValueError: signal only works in main thread
    # in python 2.5 WTF???
    pass
class Error(pycurl.error):
    """Used to indicate network error. The same as pycurl.error"""
class SiteError(Error):
    """
    Used to indicate an error of the remote resource.

    It is useful, for example, when we query a server whose name can not
    be resolved.
    """
def get(url, config=None, soup=False):
    """Fetch `url` with a throwaway Grab instance.

    `config` is an optional dict of extra Grab options applied before
    the request.  When `soup` is true the parsed BeautifulSoup document
    is returned instead of the raw response body.
    """
    grab = Grab()
    grab.setup('url', url)
    if config:
        grab.setup(config)
    grab.run()
    return grab.soup if soup else grab.body
class Grab:
"""Fancy wrapper for pycurl library"""
def __init__(self):
self.timeout = 20
self.logFile = None
self.config = {}
self._bodyCallbacks = []
self.debug = False
self.lastError = None
self.freshPostData = False
self.cookies_map = {}
self.oldUrl = None
self.debug = False
self.auto_cookies = False
self.generate_client_profile()
self.head = ''
self.body = ''
self.headers = {}
self.cookies = {}
self.unicode = True
self.encoding = None
self.use_tidy = False
self.out_headers = None
self.max_redirects = 5
def generate_client_profile(self):
self.default_headers = {
'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
'Accept-Language': 'ru,en-us;q=0.%(x)d,en;q=0.3;%(lang)s' % {'x': random.randint(5, 9),
'lang': random.choice(['ua', 'gb', 'uk'])},
#'Accept-Encoding': 'gzip,compress;q=0.%(x)d,deflate;q=0' % {'x': random.randint(5, 9)},
'Accept-Charset': 'utf-8,windows-1251;q=0.%(x)d,*;q=0.%(x)d' % {'x': random.randint(5, 9)}
}
#print self.default_headers
self.default_user_agent = random.choice(useragents)
#print self.default_user_agent
def _bodyCallback(self, data):
"""Used to process anser body"""
if self.nobody:
return 0
else:
self.body = self.body + data
if self.maxsize:
if len(self.body) > self.maxsize:
return 0
if self._bodyCallbacks:
for callback in self._bodyCallbacks:
if not callback(data):
return 0
return len(data)
def _headCallback(self, data):
"""Used to process answer headers"""
if self.nohead:
return 0
else:
self.head = self.head + data
return len(data)
def _debug_callback(self, type, data):
if type == 2: # pycurl.CURLINFO_HEADER_OUT: WTF?? pycurl.HEADER_OUT is invalid
self.out_headers = data
def request(self):
"""Run prepared curl request"""
self.curl.perform()
self.curl.close()
def setup(self, name, value = None):
"""
Configure curl request. Arguments variants:
1. name - option name, value - option value
2. name is dictionary, value is None
"""
if isinstance(name, dict):
for key, value in name.items():
self.setup(key, value)
else:
if 'post' == name:
self.freshPostData = True
self.config[name] = value
def _changeState(self, name, value):
"""
Configure internal pycurl instance before request
"""
if isinstance(name, int):
self.curl.setopt(name, value)
# TODO: is it possible that dict passed to changeState?
elif isinstance(name, dict):
for key in name:
self.setup(key, name[key])
if 'post' == name:
if value:
self.curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(value))
else:
self.curl.setopt(pycurl.HTTPGET,1)
elif 'logfile' == name:
self.logFile = value
elif 'url' == name:
self.curl.setopt(pycurl.URL, str(value))
elif 'proxy' == name:
if value:
proxy = value
else:
proxy = ''
self.curl.setopt(pycurl.PROXY, proxy)
elif 'timeout' == name:
self.curl.setopt(pycurl.TIMEOUT, value)
elif 'connect_timeout' == name:
self.curl.setopt(pycurl.CONNECTTIMEOUT, value)
elif 'referer' == name:
self.curl.setopt(pycurl.REFERER, str(value))
elif 'cookies' == name:
for name, value in value.items():
self.register_cookie(name, value)
elif 'autocookies' == name:
pass
elif 'nobody' == name:
if True == value:
self.nobody = True
elif 'nohead' == name:
if True == value:
self.nohead = True
elif 'maxsize' == name:
self.maxsize = value
elif 'redirect' == name:
self.curl.setopt(pycurl.FOLLOWLOCATION, value)
elif 'max_redirects' == name:
self.curl.setopt(pycurl.MAXREDIRS, value)
elif 'userpwd' == name:
self.curl.setopt(pycurl.USERPWD, value)
elif 'bodyCallback' == name:
if isinstance(name, (list, tuple)):
self._bodyCallbacks = value
else:
self._bodyCallbacks.append(value)
elif 'user_agent' == name:
self.curl.setopt(pycurl.USERAGENT, value)
elif 'headers' == name:
self.curl.setopt(pycurl.HTTPHEADER, ['%s: %s' % (a, b) for a, b in value.iteritems()])
elif 'autoreferer' == name:
if not 'referer' in self.config:
if not self.oldUrl is None:
self.curl.setopt(pycurl.REFERER, str(self.oldUrl))
elif 'unicode' == name:
self.unicode = bool(value)
elif 'use_tidy' == name:
self.use_tidy = bool(value)
elif 'gzip' == name:
self.gzip = value
elif 'debug' == name:
self.curl.setopt(pycurl.VERBOSE, value)
else:
raise Exception, "unknown option: %s" % name
def _prepare(self):
"""Prepare for request"""
self.curl = pycurl.Curl()
self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)
self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
self.curl.setopt(pycurl.TIMEOUT, self.timeout)
self.curl.setopt(pycurl.CONNECTTIMEOUT, self.timeout)
self.curl.setopt(pycurl.MAXREDIRS, self.max_redirects)
self.curl.setopt(pycurl.NOSIGNAL, 1)
self.curl.setopt(pycurl.WRITEFUNCTION, self._bodyCallback)
self.curl.setopt(pycurl.HEADERFUNCTION, self._headCallback)
self.curl.setopt(pycurl.DEBUGFUNCTION, self._debug_callback)
#self.curl.setopt(pycurl.VERBOSE, True)
self.head = ''
self.body = ''
self.headers = {}
self.cookies = {}
self.maxsize = 0
self.nobody = False
self.nohead = False
self.lastError = ''#pycurl.CURLE_OK
self.encoding = None
if not 'user_agent' in self.config:
self.config['user_agent'] = self.default_user_agent
# Set up default headers if they do not exist
headers = self.config.setdefault('headers', {})
for header, value in self.default_headers.iteritems():
if not header in headers:
headers[header] = value
if self.config.get('gzip'):
if not header in headers:
headers['Accept-Encoding'] = 'gzip'
for name, value in self.config.items():
self._changeState(name, value)
# If autocookies mode is enabled then use all registered cookies for this domain
# else use cookies given in setup calls (if any)
cookies = ()
if self.config.get('autocookies'):
cookies = self.get_registered_cookies()
elif self.config.get('cookies'):
cookies = self.config['cookies']
if cookies:
parts = []
for name, value in cookies.iteritems():
parts.append('%s=%s;' % (urllib.quote_plus(name),
urllib.quote_plus(value)))
self.curl.setopt(pycurl.COOKIE, ''.join(parts))
# If we query new url we must reset old post and cookes information
# if they was not defined for new url becouse their values
# are still stored in the self.config
if self.oldUrl != self.config['url']:
if not self.freshPostData:
self.curl.setopt(pycurl.HTTPGET, 1)
self.freshPostData = False
def run(self):
"""Do request"""
self._prepare()
try:
self.curl.perform()
except pycurl.error, err:
# CURLE_WRITE_ERROR
# An error occurred when writing received data to a local file, or
# an error was returned to libcurl from a write callback.
# This is expected error and we should ignore it
if 23 == err[0]:
pass
else:
self._finish()
self.lastError = err
# 6 - could not resolve host
# 47 - too many redirects
# 52 - nothing was returned from the server
# 58 - problem with the local client certificate
# 59 - couldn't use specified cipher
# 60 - problem with the CA cert (path? access rights?)
if err[0] in (6, 47, 52, 58, 59, 60):
raise SiteError, err
raise Error, err
self._finish()
def _finish(self):
"""Process query result"""
self.oldUrl = self.config['url']
if self.maxsize:
self.body = self.body[0:self.maxsize]
if self.logFile:
open(self.logFile, 'w').write(
self.config['url'] + '\n' + \
self.curl.errstr() + '\n' + \
self.head + '\n' + self.body)
for line in re.split('\r?\n', self.head):
try:
name, value = line.split(': ', 1)
if 'Set-Cookie' == name:
match = re.search('^([^=]+)=([^;]+)*', value)
if match:
self.cookies[match.group(1)] = match.group(2)
else:
self.headers[name] = value
except ValueError:
pass
for name, value in self.cookies.iteritems():
self.register_cookie(name, value)
if self.headers.get('Content-Encoding') == 'gzip':
import StringIO
import gzip
gzipper = gzip.GzipFile(fileobj=StringIO.StringIO(self.body))
self.body = gzipper.read()
if self.unicode:
self.decode_body()
if self.use_tidy:
if not self.unicode:
raise Exception('`use_tidy` options requires `unicode` option but it is off now')
else:
self.apply_tidy()
#self.curl.close()
def decode_body(self):
encoding = detect_encoding(self.body, headers=self.headers)
self.encoding = encoding
if encoding:
self.body = self.body.decode(encoding)
else:
# TODO: choose the proper way for handling case of unknown encoding
raise Exception('Could not determine encoding')
#self.body = self.body.decode('utf-8', 'ignore')
def apply_tidy(self):
print 'fuck'
import tidy
self.original_body = self.body
data = self.body.encode('utf-8')
options = dict(
output_xhtml=1,
show_body_only=0,
force_output=1,
char_encoding='utf8')
data = str(tidy.parseString(data, **options))
self.body = data.decode('utf-8')
def getinfo(self,key):
return self.curl.getinfo(getattr(pycurl, key))
def errstr(self):
"""get request error text"""
self.curl.errstr()
def getConfig(self, name):
try:
return self.config[name]
except KeyError:
return ''
def code(self):
return self.getinfo('RESPONSE_CODE')
def get_current_host(self):
domain = urlsplit(self.config['url'])[1]
host = domain.rsplit('.', 1)[-1]
return host
def register_cookie(self, name, value):
self.cookies_map.setdefault(self.get_current_host(), {})[name] = value
def get_registered_cookies(self):
return self.cookies_map.get(self.get_current_host(), {})
@property
def soup(self):
from BeautifulSoup import BeautifulSoup
return BeautifulSoup(self.body)
# Pool of real-world browser User-Agent strings; generate_client_profile()
# picks one at random for each Grab instance.
useragents = (
    'Mozilla/4.0 (compatible; MSIE 6.0; MSN 2.5; Windows 98)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 1.1.4322; Media Center PC 4.0; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Win32)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; Arcor 5.005; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; YPC 3.0.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.8) Gecko/20050511',
    'Mozilla/5.0 (X11; U; Linux i686; cs-CZ; rv:1.7.12) Gecko/20050929',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
    'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.7.8) Gecko/20050609 Firefox/1.0.4',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.9) Gecko/20050711 Firefox/1.0.5',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.10) Gecko/20050716 Firefox/1.0.6',
    'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl; rv:1.8) Gecko/20051107 Firefox/1.5',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.1) Gecko/20060111 Firefox/1.5.0.1',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.2) Gecko/20060308 Firefox/1.5.0.2',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
    'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.6) Gecko/20060808 Fedora/1.5.0.6-2.fc5 Firefox/1.5.0.6 pango-text',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.0.7) Gecko/20060909 Firefox/1.5.0.7',
    'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.8.1) Gecko/20060601 Firefox/2.0 (Ubuntu-edgy)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20070220 Firefox/2.0.0.2',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20070221 SUSE/2.0.0.2-6.1 Firefox/2.0.0.2',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9',
    'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9',
    'Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US; rv:1.9a1) Gecko/20061204 GranParadiso/3.0a1',
    'Opera/8.0 (X11; Linux i686; U; cs)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.50',
    'Mozilla/5.0 (Windows NT 5.1; U; en) Opera 8.50',
    'Opera/8.51 (Windows NT 5.1; U; en)',
    'Opera/9.0 (Windows NT 5.1; U; en)',
    'Opera/9.01 (X11; Linux i686; U; en)',
    'Opera/9.02 (Windows NT 5.1; U; en)',
    'Opera/9.10 (Windows NT 5.1; U; en)',
    'Opera/9.23 (Windows NT 5.1; U; ru)',
)