#!/usr/bin/env python
#-*-encoding:UTF-8-*-
# Default settings handed to Main.set_settings() by the script entry point.

# Exclusive upper bound of the result start offsets queued per request
# (offsets advance in steps of 10).
DEEP = 30
# Number of worker threads to start.
THREADS_COUNT = 10
# Encoding used to decode request lines and to encode the output file.
ENCODING = "UTF-8"
# Number of fetched pages a worker buffers before parsing and writing them.
PROCESS_AFTER = 1
# Input file with search requests, one per line.
REQUESTS_FILE = "requests.txt"
# Input file with proxies, one per line; each is used as "http://<line>/".
PROXYS_FILE = "proxys.txt"
class HTTP:
    """Minimal urllib2-based HTTP client with per-request cookies and proxy.

    Every request builds its own opener and sends the request through that
    opener directly.  Nothing is installed process-wide, so concurrent
    callers using different proxies or cookie jars cannot affect each other.

    Public attributes:
        ENCODING -- default encoding used to decode response bodies.
        TIMEOUT  -- default socket timeout in seconds.
        HEADERS  -- default request headers; never modified by a request.
    """

    def __init__(self):
        # Imports stay local to the constructor, as elsewhere in this file.
        import urllib2
        from urllib import urlencode
        from cookielib import CookieJar

        self.__Cookies = CookieJar
        self.__Urlencode = urlencode
        self.__ProxyHandler = urllib2.ProxyHandler
        self.__BuildOpener = urllib2.build_opener
        self.__HTTPCookieProcessor = urllib2.HTTPCookieProcessor
        self.__Request = urllib2.Request
        self.ENCODING = "UTF-8"
        self.TIMEOUT = 20
        self.HEADERS = {
            "User-Agent" : "Opera/9.64 (Windows NT 5.1; U; en) Presto/2.1.1",
            "Accept" : "text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1",
            "Accept-Language" : "ru,uk-UA;q=0.9,uk;q=0.8,en;q=0.7",
            "Accept-Charset" : "iso-8859-1, utf-8, utf-16, *;q=0.1",
            "Accept-Encoding" : "identity, *;q=0",
            "Connection" : "Keep-Alive",
        }

    def make_urlencoded(self, data):
        """Return *data* encoded as an application/x-www-form-urlencoded string."""
        return self.__Urlencode(data)

    def make_opener(self, cookies, proxy):
        """Build an opener that stores cookies in *cookies*.

        When *proxy* ("host:port") is truthy, both http and https traffic is
        routed through it; otherwise no proxy handler is added.
        """
        handlers = [self.__HTTPCookieProcessor(cookies)]
        if proxy:
            proxy_url = "http://" + proxy + "/"
            handlers.append(
                self.__ProxyHandler({"http": proxy_url, "https": proxy_url})
            )
        return self.__BuildOpener(*handlers)

    def _merge_headers(self, headers):
        """Return a new dict: the default HEADERS overridden by *headers*."""
        # Work on a copy so per-call overrides never leak into self.HEADERS.
        merged = dict(self.HEADERS)
        if headers:
            for name in headers:
                merged[name] = headers[name]
        return merged

    def GET(self, URI, cookies=None, proxy=None, headers=None, bytes=None, page_encoding=None, timeout=None):
        """Fetch *URI* and return the decoded body.

        Parameters:
            URI           -- address to request.
            cookies       -- cookie jar to use; a fresh one is created when None.
            proxy         -- optional "host:port" proxy.
            headers       -- extra headers overriding the defaults for this call.
            bytes         -- when truthy, read at most this many bytes.
            page_encoding -- body encoding; defaults to self.ENCODING.
            timeout       -- socket timeout; defaults to self.TIMEOUT.

        Returns a dict with keys "status" ("OK", or the exception raised
        while fetching/decoding), "page" (decoded text, "" on failure),
        "cookies" (the jar that was used) and "proxy".
        """
        if page_encoding is None:
            page_encoding = self.ENCODING
        if timeout is None:
            timeout = self.TIMEOUT
        # Test against None: an empty jar is falsy but must still be used.
        if cookies is None:
            cookies = self.__Cookies()
        response = {
            "status": "",
            "cookies": cookies,
            "page": "",
            "proxy": proxy,
        }
        opener = self.make_opener(cookies, proxy)
        request = self.__Request(URI, None, self._merge_headers(headers))
        try:
            stream = opener.open(request, None, timeout)
            try:
                raw = stream.read(bytes) if bytes else stream.read()
            finally:
                stream.close()
            response["page"] = raw.decode(page_encoding, "replace")
            response["status"] = "OK"
        except Exception as e:
            # Failures are reported through the result, never raised.
            response["status"] = e
        return response

    def GET_HEADERS(self, URI, data=None, cookies=None, proxy=None, headers=None, timeout=None):
        """Request *URI* and return the response headers.

        *data* may be a dict (it is urlencoded) or an already encoded body;
        when it is not None it is sent as the request body.  The remaining
        parameters behave as in GET().

        Returns a dict with keys "status" ("OK", or the exception raised),
        "HEADERS" (the object returned by the response's info(), "" on
        failure), "cookies" and "proxy".
        """
        if cookies is None:
            cookies = self.__Cookies()
        if timeout is None:
            timeout = self.TIMEOUT
        if isinstance(data, dict):
            data = self.make_urlencoded(data)
        response = {
            "status": "",
            "cookies": cookies,
            "HEADERS": "",
            "proxy": proxy,
        }
        opener = self.make_opener(cookies, proxy)
        merged = self._merge_headers(headers)
        try:
            request = self.__Request(URI, data, merged)
            stream = opener.open(request, None, timeout)
            try:
                response["HEADERS"] = stream.info()
            finally:
                stream.close()
            response["status"] = "OK"
        except Exception as e:
            response["status"] = e
        return response
class Google:
    """Fetches Google result pages through a proxy and extracts result links."""

    def __init__(self):
        import re

        self.__findall = re.findall
        self.__HTTP = HTTP()

    def valid_proxy(self, proxy_string):
        """Return True when google.com answers through *proxy_string*.

        The proxy is accepted only if the header request succeeds and the
        text "google.com" occurs in the returned headers.
        """
        data = self.__HTTP.GET_HEADERS(URI="http://www.google.com", proxy=proxy_string)
        return data["status"] == "OK" and "google.com" in str(data["HEADERS"])

    def get_page(self, request, page_number, proxy_string):
        """Return the result page for *request* starting at offset *page_number*.

        Returns the decoded page text, or None when the fetch failed.
        """
        # NOTE(review): the query is inserted into the URL without
        # URL-escaping -- confirm the requests file holds pre-escaped queries.
        empty_link = u"http://www.google.com/search?hl=ru&client=opera&rls=ru&hs=67v&q={request}&start={N}&sa=N"
        link = empty_link.format(request=request, N=page_number).encode("UTF-8", "replace")
        data = self.__HTTP.GET(URI=link, proxy=proxy_string)
        if data["status"] == "OK":
            return data["page"]
        return None

    def parse_page(self, page):
        """Extract (href, title) pairs of result links from *page*.

        Links whose href starts with "/" or contains ".google.com" are
        dropped.  Returns a list of 2-tuples in page order.
        """
        matches = self.__findall(r'''\<li\ class\=g\>\<h3\ class\=r\>\<a\ href\=\"(.*?)".*?>(.*?)\<\/a\>\<\/h3\>''', page)
        # Build a new list rather than removing from the one being iterated:
        # in-place removal skips the element after each removed one and
        # raises ValueError when a link matches both filters.
        return [
            match for match in matches
            if not match[0].startswith(u"/") and u".google.com" not in match[0]
        ]
class Main:
    """Runs worker threads that fetch result pages and write harvested links.

    The request and proxy queues are plain lists shared by all workers;
    output is appended to "collected.txt" under a lock.
    """

    def __init__(self):
        import time
        import threading

        self.__sleep = time.sleep
        self.__threading = threading
        self.__Google = Google()
        self.__REQUESTS = []
        self.__PROXYS = []
        self.__DEEP = 30
        self.__PROCESS_AFTER = 1
        self.__THREADS_COUNT = 5
        self.__ENCODING = "UTF-8"
        self.__write_lock = threading.RLock()

    def set_settings(self, DEEEP, THREADS_COUNT, ENCODING, PROCESS_AFTER, REQUESTS_FILE, PROXYS_FILE):
        """Validate the settings and load the request and proxy queues.

        Parameters:
            DEEEP         -- int; exclusive upper bound of the start offsets
                             queued per request (stepping by 10).
            THREADS_COUNT -- int; number of worker threads.
            ENCODING      -- str; encoding of the requests file and output.
            PROCESS_AFTER -- int; pages buffered before parsing/writing.
            REQUESTS_FILE -- str; path of the requests file, one per line.
            PROXYS_FILE   -- str; path of the proxies file, one per line.

        Returns None on success, or the exception describing the failure
        (it is returned, not raised).  The queues and settings are only
        updated when everything was validated and read successfully.
        """
        # The first parameter is spelled "DEEEP" in the public signature.
        # Bind it to the intended local name so the argument is honoured
        # instead of the module-level DEEP constant.
        DEEP = DEEEP
        try:
            expected = (
                ("DEEP", DEEP, int),
                ("THREADS_COUNT", THREADS_COUNT, int),
                ("ENCODING", ENCODING, str),
                ("PROCESS_AFTER", PROCESS_AFTER, int),
                ("REQUESTS_FILE", REQUESTS_FILE, str),
                ("PROXYS_FILE", PROXYS_FILE, str),
            )
            for label, value, kind in expected:
                if not isinstance(value, kind):
                    raise ValueError("%s must be of type %s" % (label, kind.__name__))
            requests = []
            proxys = []
            # Read raw bytes and decode explicitly at the I/O boundary.
            with open(REQUESTS_FILE, "rb") as requests_file:
                with open(PROXYS_FILE, "rb") as proxys_file:
                    for raw in requests_file:
                        line = raw.translate(None, b"\r\n").decode(ENCODING)
                        if not line:
                            continue
                        for offset in range(0, DEEP, 10):
                            requests.append((line, offset))
                    for raw in proxys_file:
                        proxy = raw.translate(None, b"\r\n")
                        # An empty proxy would be treated as "no proxy" and
                        # send requests directly, so blank lines are skipped.
                        if not proxy:
                            continue
                        # Keep proxies as the native string type.
                        if not isinstance(proxy, str):
                            proxy = proxy.decode(ENCODING)
                        proxys.append(proxy)
        except Exception as error:
            return error
        self.__REQUESTS.extend(requests)
        self.__PROXYS.extend(proxys)
        self.__DEEP = DEEP
        self.__THREADS_COUNT = THREADS_COUNT
        self.__PROCESS_AFTER = PROCESS_AFTER
        self.__ENCODING = ENCODING

    def write(self, data):
        """Append "href|title" lines for *data* to collected.txt.

        *data* is an iterable of (href, title) pairs; <em>, </em> and
        <b>...</b> markup is removed from each title.  An IOError is
        reported on stdout instead of being raised.
        """
        # The lock serialises writers so output lines never interleave.
        with self.__write_lock:
            try:
                with open("collected.txt", "a") as out:
                    for element in data:
                        title = element[1].replace(u"</em>", "").replace(u"<em>", "").replace(u"<b>...</b>", "")
                        out.write(u"{0}|{1}\n".format(element[0], title).encode(self.__ENCODING))
            except IOError:
                print(u"Can`t write data")

    def _process(self, pages):
        """Parse every page in *pages* and write the harvested links."""
        harvested = []
        for page in pages:
            harvested.extend(self.__Google.parse_page(page))
        self.write(harvested)

    def worker(self):
        """Thread body: pair queued requests with proxies until either runs out.

        A proxy that fails validation is dropped and the request is queued
        again; a request whose page could not be fetched is queued again
        too.  Pages are parsed and written in batches of PROCESS_AFTER, and
        whatever is still buffered is flushed before the worker exits.
        """
        collected = []
        while True:
            try:
                request, n = self.__REQUESTS.pop(0)
            except IndexError:
                break
            try:
                proxy = self.__PROXYS.pop(0)
            except IndexError:
                # No proxies left: hand the request back and stop.
                self.__REQUESTS.append((request, n))
                break
            if not self.__Google.valid_proxy(proxy):
                self.__REQUESTS.append((request, n))
                continue
            # The proxy works, so return it to the pool for reuse.
            self.__PROXYS.append(proxy)
            page = self.__Google.get_page(request, n, proxy)
            if page is None:
                self.__REQUESTS.append((request, n))
                continue
            collected.append(page)
            # ">=" rather than "==" so a threshold <= 0 still flushes.
            if len(collected) >= self.__PROCESS_AFTER:
                self._process(collected)
                collected = []
        if collected:
            # Do not lose pages fetched since the last full batch.
            self._process(collected)

    def main(self):
        """Start THREADS_COUNT workers and block until all have finished."""
        workers = []
        for _ in range(self.__THREADS_COUNT):
            thread = self.__threading.Thread(target=self.worker)
            thread.start()
            workers.append(thread)
        # Poll only our own threads (unrelated threads must not keep us
        # waiting); sleeping in short steps keeps Ctrl-C responsive.
        while any(thread.is_alive() for thread in workers):
            self.__sleep(1)
if __name__ == "__main__":
    # Script entry point: configure the harvester from the module-level
    # constants and run it; a configuration failure is re-raised.
    print(u"Started")
    harvester = Main()
    failure = harvester.set_settings(
        DEEP,
        THREADS_COUNT,
        ENCODING,
        PROCESS_AFTER,
        REQUESTS_FILE,
        PROXYS_FILE,
    )
    if failure:
        raise failure
    harvester.main()
    print(u"Finished")