#!/usr/bin/env python
#-*- encoding: UTF-8 -*-
#
#(c) login999
# uasc.org.ua
#Класс-контейнер для хранения чего-либо
class CONTAINER:
    """Lock-protected list used as a shared work queue.

    Every mutating operation runs under a re-entrant lock that is taken
    with a ``with`` statement, so the lock is released even when the
    operation raises.
    """

    def __init__(self):
        """Create an empty container with ``defaultencoding`` set to UTF-8."""
        # Imports stay local: the file has no module-level import section.
        from threading import RLock
        from random import choice
        from random import shuffle
        self.__Lock = RLock()
        self.__Container = []
        self.__Choice = choice
        self.__Shuffle = shuffle
        self.defaultencoding = "UTF-8"

    def load(self, data, data_encoding=None):
        """Append every line of *data* to the container.

        CR and LF characters are removed from each line and the result is
        decoded with *data_encoding* (``self.defaultencoding`` when None),
        replacing undecodable bytes. Unlike ``append`` this does not skip
        duplicates.
        """
        if data_encoding is None:
            data_encoding = self.defaultencoding
        # Decode outside the lock so the lock is held only for the final
        # extend and a failing line cannot leave a half-loaded container.
        decoded = [
            line.replace("\r", "").replace("\n", "").decode(data_encoding, "replace")
            for line in data
        ]
        with self.__Lock:
            self.__Container.extend(decoded)

    def append(self, some):
        """Add *some* unless an equal element is already stored."""
        with self.__Lock:
            if some not in self.__Container:
                self.__Container.append(some)

    def clear(self):
        """Drop every stored element."""
        with self.__Lock:
            self.__Container = []

    def shuffle(self):
        """Shuffle the stored elements in place."""
        with self.__Lock:
            self.__Shuffle(self.__Container)

    def remove_dupies(self):
        """Remove duplicates; element order is not preserved.

        Raises TypeError when an element is unhashable (the lock is still
        released in that case).
        """
        with self.__Lock:
            self.__Container = list(set(self.__Container))

    def __len__(self):
        """Return the number of stored elements."""
        with self.__Lock:
            return len(self.__Container)

    def get_next(self, remove=False):
        """Return the first element, or None when the container is empty.

        With ``remove=False`` the element is moved to the end of the list
        (round-robin); with ``remove=True`` it is taken out.
        """
        with self.__Lock:
            if not self.__Container:
                return None
            line = self.__Container.pop(0)
            # A stored None is never put back, matching the old behaviour.
            if not remove and line is not None:
                self.__Container.append(line)
            return line

    def get_random(self, remove=False):
        """Return a random element, or None when the container is empty.

        Choosing and removing happen under one lock acquisition so another
        thread cannot remove the chosen element in between.
        """
        with self.__Lock:
            if not self.__Container:
                return None
            line = self.__Choice(self.__Container)
            if remove and line is not None:
                self.__Container.remove(line)
            return line

    def get_all(self, remove=False):
        """Return all elements as a list; empty the container if *remove*.

        The returned list is detached from the container, so the caller
        may modify it freely.
        """
        with self.__Lock:
            if remove:
                items, self.__Container = self.__Container, []
            else:
                items = list(self.__Container)
            return items

    def dump(self, outfile, data_encoding=None, remove=False):
        """Append every element to *outfile*, one per line.

        Elements are encoded with *data_encoding* (``self.defaultencoding``
        when None), replacing unencodable characters. When *remove* is true
        the container is emptied after the file has been written.
        """
        if data_encoding is None:
            data_encoding = self.defaultencoding
        with self.__Lock:
            with open(outfile, "a") as out:
                for item in self.__Container:
                    out.write("{0}\n".format(item.encode(data_encoding, "replace")))
            if remove:
                self.__Container = []
#Класс для хранения/проверки проксей
class PROXYS:
    """Lock-protected pool of HTTP proxies with a liveness check.

    Proxies are stored as strings and are used as ``http://<proxy>/``.
    """

    def __init__(self):
        """Create an empty pool; proxies are not used until ``USE`` is set."""
        import urllib2
        from threading import RLock
        self.__ProxyHandler = urllib2.ProxyHandler
        self.__Build_Opener = urllib2.build_opener
        self.__Install_Opener = urllib2.install_opener
        self.__Open = urllib2.urlopen
        self.__Request = urllib2.Request
        self.__Lock = RLock()
        self.__Proxyz = []
        self.USE = False
        self.defaultproxy = None

    def load(self, data):
        """Replace the pool with the lines of *data*, stripped of CR/LF."""
        loaded = [line.replace("\r", "").replace("\n", "") for line in data]
        with self.__Lock:
            self.__Proxyz = loaded

    def clear(self):
        """Remove every proxy from the pool."""
        with self.__Lock:
            self.__Proxyz = []

    def remove_dupies(self):
        """Remove duplicate proxies; order is not preserved."""
        with self.__Lock:
            # The original read ``self.Proxyz``, which does not exist.
            self.__Proxyz = list(set(self.__Proxyz))

    def __len__(self):
        """Return the number of proxies currently in the pool."""
        with self.__Lock:
            return len(self.__Proxyz)

    def check(self, proxy):
        """Return True when google.com can be fetched through *proxy*.

        The request goes through a private opener, so the process-wide
        urllib2 opener used by other threads is left untouched. Any error
        while fetching counts as a dead proxy.
        """
        proxy_handler = self.__ProxyHandler({"http": "http://" + proxy + "/"})
        opener = self.__Build_Opener(proxy_handler)
        try:
            response = opener.open("http://www.google.com/")
            try:
                final_url = response.geturl()
            finally:
                response.close()
        except Exception:
            return False
        return "google" in final_url

    def check_all(self):
        """Check every loaded proxy once and keep only the live ones.

        Entries of six characters or fewer are dropped without a check.
        """
        with self.__Lock:
            pending = len(self.__Proxyz)
        for _ in range(pending):
            with self.__Lock:
                if not self.__Proxyz:
                    # Another thread drained the pool in the meantime.
                    break
                proxy = self.__Proxyz.pop(0)
            if len(proxy) > 6 and self.check(proxy):
                with self.__Lock:
                    self.__Proxyz.append(proxy)

    def get_one(self):
        """Return one live proxy.

        Returns ``self.defaultproxy`` when ``USE`` is false. Otherwise
        proxies are taken from the front of the pool until one passes
        ``check``; it is put back at the end and returned. Dead and too
        short entries are discarded. When the pool runs empty the string
        ``"NO_PROXYS_LEFT"`` is returned (the spelling ARCHITECTURE's
        worker compares against).
        """
        if not self.USE:
            return self.defaultproxy
        while True:
            with self.__Lock:
                if not self.__Proxyz:
                    return "NO_PROXYS_LEFT"
                proxy = self.__Proxyz.pop(0)
            if len(proxy) > 6 and self.check(proxy):
                with self.__Lock:
                    self.__Proxyz.append(proxy)
                return proxy
#Класс для определения Yandex Тиц, Google PR, Alexa AR
class SITE_PROPERTIES:
    """Fetches Google PageRank, Alexa Rank and Yandex TIC for a URL.

    Each ``Get*`` method returns the value as text, or the string
    ``"CONNECTION_PROBLEMS"`` when the HTTP request fails.
    """

    def __init__(self):
        """Bind the modules used by the methods and set a 20 s timeout."""
        import re
        import urllib
        import urllib2
        self.__re = re
        self.__urllib = urllib
        self.__urllib2 = urllib2
        self.timeout = 20

    def __fetch(self, url, PROXY=None):
        """Return the body of *url*, optionally fetched through *PROXY*.

        A proxied request uses its own opener rather than installing one
        globally, so concurrent threads with different proxies do not
        overwrite each other's settings. Errors propagate to the caller.
        """
        if PROXY:
            handler = self.__urllib2.ProxyHandler({"http": "http://" + PROXY + "/"})
            opener = self.__urllib2.build_opener(handler)
            response = opener.open(url, timeout=self.timeout)
        else:
            response = self.__urllib2.urlopen(url, timeout=self.timeout)
        try:
            return response.read()
        finally:
            response.close()

    # Borrowed helper used for the PageRank checksum.
    def __IntStr(self, String, Integer, Factor):
        """Fold the characters of *String* into *Integer*.

        For each character: multiply by *Factor*, mask to 32 bits, then add
        the character code (the final addition is not masked).
        """
        for char in String:
            Integer *= Factor
            Integer &= 0xFFFFFFFF
            Integer += ord(char)
        return Integer

    # Borrowed helper used for the PageRank checksum.
    def __HashURL(self, Str):
        """Return the integer hash of *Str* used in the toolbar query."""
        C1 = self.__IntStr(Str, 0x1505, 0x21)
        C2 = self.__IntStr(Str, 0, 0x1003F)
        C1 >>= 2
        C1 = ((C1 >> 4) & 0x3FFFFC0) | (C1 & 0x3F)
        C1 = ((C1 >> 4) & 0x3FFC00) | (C1 & 0x3FF)
        C1 = ((C1 >> 4) & 0x3C000) | (C1 & 0x3FFF)
        T1 = (C1 & 0x3C0) << 4
        T1 |= C1 & 0x3C
        T1 = (T1 << 2) | (C2 & 0xF0F)
        T2 = (C1 & 0xFFFFC000) << 4
        T2 |= C1 & 0x3C00
        T2 = (T2 << 0xA) | (C2 & 0xF0F0000)
        return (T1 | T2)

    # Borrowed helper used for the PageRank checksum.
    def __CheckHash(self, HashInt):
        """Return ``'7' + check digit + decimal hash`` for *HashInt*."""
        HashStr = "%u" % (HashInt)
        CheckByte = 0
        Flag = 0
        # Walk the digits right to left, doubling every second one and
        # summing the digits of the doubled value.
        for digit in reversed(HashStr):
            Byte = int(digit)
            if 1 == (Flag % 2):
                Byte *= 2
                # Explicit floor division: identical on Python 2 ints and
                # does not turn into a float on Python 3.
                Byte = Byte // 10 + Byte % 10
            CheckByte += Byte
            Flag += 1
        CheckByte %= 10
        if 0 != CheckByte:
            CheckByte = 10 - CheckByte
            if 1 == Flag % 2:
                if 1 == CheckByte % 2:
                    CheckByte += 9
                CheckByte >>= 1
        return '7' + str(CheckByte) + HashStr

    # Borrowed and extended with proxy support.
    def GetGooglePR(self, URL, PROXY=None):
        """Return the Google PageRank of *URL* as text.

        The rank is whatever follows the first nine characters of the
        stripped response; "0" is returned when that is not an integer.
        """
        google_hash = self.__CheckHash(self.__HashURL(URL))
        google_url = 'http://www.google.com/search?client=navclient-auto&features=Rank:&q=info:%s&ch=%s' % (self.__urllib.quote(URL), google_hash)
        try:
            page = self.__fetch(google_url, PROXY)
        except Exception:
            return "CONNECTION_PROBLEMS"
        try:
            PR = int(page.strip()[9:])
        except ValueError:
            PR = 0
        return u"{0}".format(PR)

    def GetAlexaRank(self, URL, PROXY=None):
        """Return the Alexa Rank of *URL* as text ("0" when not present)."""
        alexa_url = 'http://data.alexa.com/data?cli=10&dat=snbamz&url={URL}'.format(URL=self.__urllib.quote(URL))
        try:
            page = self.__fetch(alexa_url, PROXY)
        except Exception:
            return "CONNECTION_PROBLEMS"
        found = self.__re.findall(r'\<POPULARITY URL\=\".*?\" TEXT\=\"(\d+)\"\/\>', page)
        AR = found[0] if found else 0
        return u"{0}".format(AR)

    def GetYandexTIC(self, URL, PROXY=None):
        """Return the Yandex TIC of *URL* as text ("0" when not present).

        ``http://`` is prepended when *URL* has no http/https scheme.
        """
        # Prefix test, not substring test: an https URL must not become
        # "http://https://...".
        if not URL.startswith(("http://", "https://")):
            URL = "http://{URL}".format(URL=URL)
        yandex_url = 'http://bar-navig.yandex.ru/u?ver=2&show=32&url={URL}'.format(URL=self.__urllib.quote(URL))
        try:
            page = self.__fetch(yandex_url, PROXY)
        except Exception:
            return "CONNECTION_PROBLEMS"
        found = self.__re.findall(r'\<tcy rang\=\".*?\" value\=\"(\d+)\"\/\>', page)
        TIC = found[0] if found else 0
        return u"{0}".format(TIC)
#Класс для загрузки данных из файла формата DBM Apache
class DBM_LOADER:
    """Loads url / local-path records from a DBM database file."""

    def __init__(self):
        """Bind ``anydbm.open`` and default the text encoding to UTF-8."""
        import anydbm
        self.__open_database = anydbm.open
        self.defaultencoding = "UTF-8"

    def load(self, db_file, parse=True, to_=None):
        """Read every key/value pair of *db_file* into record dicts.

        Each record has the keys ``url`` (decoded key), ``local_path``
        (decoded value) and empty ``PR``, ``AR`` and ``TIC``. With
        ``parse=True`` only records accepted by ``martein_parsed`` are
        kept.

        Records are appended to *to_* (anything with an ``append`` method)
        or to a new list when *to_* is None. The target is returned in
        both cases.
        """
        data_list = [] if to_ is None else to_
        # NOTE(review): flag "c" creates the database when it is missing,
        # which looks unintended for a loader; kept as in the original.
        database = self.__open_database(db_file.encode(self.defaultencoding), "c")
        try:
            # keys() plus indexing is used instead of iteritems(), which
            # not every dbm backend object provides.
            for key in database.keys():
                record = {
                    "url": key.decode(self.defaultencoding),
                    "local_path": database[key].decode(self.defaultencoding),
                    "PR": "",
                    "AR": "",
                    "TIC": "",
                }
                if not parse or self.martein_parsed(record):
                    data_list.append(record)
        finally:
            # The handle used to be left open.
            database.close()
        return data_list

    def martein_parsed(self, some_seq):
        """Return True when the last path component equals the url.

        Record filter written specifically for M@rtein databases.
        """
        return some_seq["url"] == some_seq["local_path"].split("/")[-1]
#Класс для загрузки данных из текстового файла
class TEXT_LOADER:
    """Loads one url per line from a plain text file."""

    def __init__(self):
        """Default the text encoding to UTF-8."""
        self.defaultencoding = "UTF-8"

    def load(self, file_, to_=None):
        """Read *file_* and build one record dict per non-empty line.

        Each record has ``url`` (the line without CR/LF, decoded with
        ``self.defaultencoding``) and empty ``local_path``, ``PR``, ``AR``
        and ``TIC``. Blank lines are skipped.

        Records are appended to *to_* (anything with an ``append`` method)
        or to a new list when *to_* is None. The target is returned in
        both cases.

        Raises UnicodeDecodeError when a line is not valid in the
        configured encoding.
        """
        data_list = [] if to_ is None else to_
        # Binary mode: the lines are decoded explicitly below, so no
        # platform text-mode translation should touch the bytes first.
        with open(file_, "rb") as input_data:
            for raw_line in input_data:
                url = raw_line.replace(b"\r", b"").replace(b"\n", b"").decode(self.defaultencoding)
                if not url:
                    continue
                data_list.append({
                    "url": url,
                    "local_path": "",
                    "PR": "",
                    "AR": "",
                    "TIC": "",
                })
        return data_list
#Класс для отображения Gui
class GUI:
    """Tkinter window with settings, Start/Pause/Stop buttons and a log.

    The owner assigns callables to ``start``, ``pause`` and ``stop``; the
    button handlers run them on new threads.
    """

    def __init__(self):
        """Create the main window; widgets are built later by ``create``."""
        import Tkinter
        import tkMessageBox
        import ScrolledText
        import tkFileDialog
        import threading
        self.__tkinter = Tkinter
        self.__messagebox = tkMessageBox
        self.__scrolledtext = ScrolledText
        self.__tkfiledialog = tkFileDialog
        self.__threading = threading
        self.__main_window = Tkinter.Tk()
        self.__main_window.resizable(width=False, height=False)
        self.__main_window.title(u"GooglePageRank|AlexaRank|Тиц")
        self.__main_window["bd"] = 5
        self.__Lock = threading.RLock()
        self.__PAUSE = False
        self.__db_filename = None
        self.__proxys_filename = None
        self.__out_filename = None
        # Callbacks supplied by the owner of the GUI.
        self.start = None
        self.pause = None
        self.stop = None

    def __create_settings(self):
        """Build the settings spin boxes and the three file buttons."""
        tk = self.__tkinter

        def load_db():
            # Handler of the "load database" button.
            self.__db_filename = self.__tkfiledialog.askopenfilename()

        def upload():
            # Handler of the "output file" button.
            self.__out_filename = self.__tkfiledialog.asksaveasfilename(filetypes=[("Excel",".csv")], defaultextension=".csv")

        def load_proxys():
            # Handler of the "load proxy" button.
            self.__proxys_filename = self.__tkfiledialog.askopenfilename()

        def add_label(text, row):
            label = tk.Label(self.__settings_frame, text=text, width=12, font="system6", anchor="w")
            label.grid(row=row, column=0)
            return label

        def add_button(text, command, row):
            button = tk.Button(self.__butt_frame, width=15, text=text, command=command)
            button.grid(row=row, column=0)
            return button

        self.__settings_frame = tk.Frame(self.__main_window)
        self.__settings_frame.grid(row=0, column=0)
        self.__butt_frame = tk.Frame(self.__main_window)
        self.__butt_frame.grid(row=0, column=1)

        self.__TypeLabel = add_label(u"Тип данных :", 0)
        self.__TypeChoose = tk.Spinbox(self.__settings_frame, state="readonly", wrap=True, width=8)
        self.__TypeChoose["values"] = [u"DBM", u"TXT"]
        self.__TypeChoose.grid(row=0, column=1)

        self.__EncodingLabel = add_label(u"Кодировка :", 1)
        self.__EncodingChoose = tk.Spinbox(self.__settings_frame, state="readonly", wrap=True, width=8)
        self.__EncodingChoose["values"] = [u"CP1251", u"UTF-8"]
        self.__EncodingChoose.grid(row=1, column=1)

        self.__ThreadsLabel = add_label(u"Потоков :", 2)
        self.__ThreadsChoose = tk.Spinbox(self.__settings_frame, state="readonly", wrap=True, from_=1, to_=350, width=8)
        self.__ThreadsChoose.grid(row=2, column=1)

        self.__DBButton = add_button(u"Загрузить БД", load_db, 0)
        self.__ProxysButton = add_button(u"Загрузить proxy", load_proxys, 1)
        self.__OUTButton = add_button(u"Выходной файл", upload, 2)

    def __create_buttons(self):
        """Build the Start, Pause and Stop buttons and their handlers."""
        tk = self.__tkinter

        def start():
            # Start handler: validate the inputs, collect the settings and
            # run the ``start`` callback on its own thread.
            if not self.__db_filename:
                self.__messagebox.showerror(u"Ошибка", u"Файл БД не указан!")
                return
            if not self.__out_filename:
                self.__messagebox.showerror(u"Ошибка", u"Выходной файл не указан!")
                return
            if not self.__proxys_filename:
                self.__messagebox.showinfo(u"Внимание", u"Проверка будет происходить без proxy!")
            settings = {
                "threads_count": int(self.__ThreadsChoose.get()),
                "datatype": self.__TypeChoose.get(),
                "encoding": self.__EncodingChoose.get(),
                "bdname": self.__db_filename,
                "proxysname": self.__proxys_filename,
                "outfilename": self.__out_filename,
            }
            self.__threading.Thread(target=self.start, args=[settings]).start()

        def pause():
            # Pause handler: toggle the flag, relabel the button and pass
            # the new state to the ``pause`` callback on its own thread.
            self.__PAUSE = not self.__PAUSE
            self.__PauseButton["text"] = u"Продолжить" if self.__PAUSE else u"Пауза"
            self.__threading.Thread(target=self.pause, args=[self.__PAUSE]).start()

        def stop():
            # Stop handler: run the ``stop`` callback on its own thread.
            self.__threading.Thread(target=self.stop).start()

        self.__buttons_frame = tk.Frame(self.__main_window)
        self.__buttons_frame.grid(row=0, column=2)
        self.__StartButton = tk.Button(self.__buttons_frame, width=12, text=u"Старт", command=start)
        self.__StartButton.grid(row=0, column=0)
        self.__PauseButton = tk.Button(self.__buttons_frame, width=12, text=u"Пауза", command=pause, state="disabled")
        self.__PauseButton.grid(row=1, column=0)
        self.__StopButton = tk.Button(self.__buttons_frame, width=12, text=u"Стоп", command=stop, state="disabled")
        self.__StopButton.grid(row=2, column=0)

    def insert_into_log(self, line):
        """Append *line* to the log widget.

        The lock is released even when formatting or inserting raises, so
        one failing call cannot block every other thread that logs.
        """
        # NOTE(review): ARCHITECTURE calls this from worker threads, while
        # Tkinter widgets are normally touched from the main thread only -
        # confirm whether this needs to be routed through the event loop.
        with self.__Lock:
            self.__Log.insert("end", u"{line}".format(line=line))

    def __create_log(self):
        """Build the scrolled log widget below the controls."""
        self.__Log = self.__scrolledtext.ScrolledText(self.__main_window, font="system6", width=35, height=12, exportselection=True)
        self.__Log.grid(row=2, column=0, columnspan=3, sticky="we")

    def create(self):
        """Build all widgets and return the main window."""
        self.__create_settings()
        self.__create_buttons()
        self.__create_log()
        return self.__main_window

    def __set_states(self, start, pause, stop, file_buttons, spinboxes):
        """Set the ``state`` option of every control in one place."""
        self.__StartButton["state"] = start
        self.__PauseButton["state"] = pause
        self.__StopButton["state"] = stop
        self.__DBButton["state"] = file_buttons
        self.__OUTButton["state"] = file_buttons
        self.__ProxysButton["state"] = file_buttons
        self.__TypeChoose["state"] = spinboxes
        self.__ThreadsChoose["state"] = spinboxes
        self.__EncodingChoose["state"] = spinboxes

    def make_working(self):
        """Running state: only Pause and Stop are enabled."""
        self.__set_states("disabled", "normal", "normal", "disabled", "disabled")

    def make_reset(self):
        """Initial state: forget the chosen files and re-enable the inputs."""
        self.__db_filename = None
        self.__proxys_filename = None
        self.__out_filename = None
        self.__PauseButton["text"] = u"Пауза"
        self.__set_states("normal", "disabled", "disabled", "normal", "readonly")

    def make_unactive(self):
        """Inactive state: every control is disabled."""
        self.__set_states("disabled", "disabled", "disabled", "disabled", "disabled")
#Класс для записи данных в несколько файлов
class WRITER:
    """Appends result records to CSV files, one lock per output file.

    A record is a mapping with the keys ``url``, ``local_path``, ``PR``,
    ``AR`` and ``TIC``; it is written as one ``;``-separated line.
    """

    def __init__(self):
        """Set the default encoding (UTF-8) and output id ("dump")."""
        from threading import RLock
        self.Lock = RLock
        self.Locks = {}
        self.defaultencoding = "UTF-8"
        self.defaultout = "dump"

    def __lock_for(self, id_):
        """Return the lock of output *id_*, creating it on first use."""
        # setdefault avoids the check-then-assign race in which two threads
        # could each create a different lock for the same id.
        return self.Locks.setdefault(id_, self.Lock())

    def __path_for(self, id_):
        """Return the encoded file path of output *id_*."""
        name = u"{0}".format(id_)
        # An id that already names a .csv file (such as the path picked in
        # the save dialog) must not become "name.csv.csv".
        if not name.lower().endswith(u".csv"):
            name += u".csv"
        return name.encode(self.defaultencoding)

    def __format_row(self, item):
        """Turn one record mapping into its ``;``-separated text line."""
        return u"{url};{local_path};{PR};{AR};{TIC}".format(url=item["url"], local_path=item["local_path"], PR=item["PR"], AR=item["AR"], TIC=item["TIC"])

    def write_one(self, data, id_=None, data_encoding=None):
        """Append the single record *data*; see ``write_many``."""
        self.write_many([data], id_, data_encoding)

    def write_many(self, data, id_=None, data_encoding=None):
        """Append every record of *data* to the file of output *id_*.

        *id_* defaults to ``self.defaultout`` and ".csv" is appended when
        the id does not already end with it. Text is encoded with
        *data_encoding* (``self.defaultencoding`` when None), replacing
        unencodable characters. The per-file lock is released even when
        writing fails.
        """
        import io
        if id_ is None:
            id_ = self.defaultout
        # The original bound ``encoding`` only in the None branch, so any
        # explicit data_encoding ended in a NameError.
        encoding = self.defaultencoding if data_encoding is None else data_encoding
        with self.__lock_for(id_):
            with io.open(self.__path_for(id_), "a", encoding=encoding, errors="replace") as out:
                for item in data:
                    out.write(self.__format_row(item) + u"\n")
#Класс-архитектура чекера
class ARCHITECTURE:
    """Wires the GUI, loaders, proxy pool, checker and writer together."""

    # Results that mean a lookup failed and the link must be retried.
    __FAILURES = ("FORBIDDEN", "CONNECTION_PROBLEMS")
    # Accept both spellings: PROXYS.get_one historically returned
    # "NO_PROXIS_LEFT" while this class compared against "NO_PROXYS_LEFT".
    __NO_PROXY_MARKS = ("NO_PROXYS_LEFT", "NO_PROXIS_LEFT")

    def __init__(self):
        """Create all collaborating objects and reset the run flags."""
        import sys
        import time
        import threading
        self.__exit = sys.exit
        self.__sleep = time.sleep
        self.__threading = threading
        self.__lock = threading.RLock()
        self.__gui = GUI()
        self.__proxys = PROXYS()
        self.__writer = WRITER()
        self.__links = CONTAINER()
        self.__dbmloader = DBM_LOADER()
        self.__textloader = TEXT_LOADER()
        self.__properties = SITE_PROPERTIES()
        self.__PAUSE = False
        self.__STOP = False
        self.__current_proxy = None

    def __dump_unchecked(self):
        """Move every queued link into the "UNCHECKED" output."""
        unchecked = self.__links.get_all(remove=True)
        self.__writer.write_many(unchecked, "UNCHECKED")

    def __rotate_proxy(self):
        """Ask the proxy pool for a proxy and make it the current one."""
        with self.__lock:
            self.__current_proxy = self.__proxys.get_one()

    def __worker(self):
        """Body of one worker thread.

        Repeatedly takes a link from the queue and looks up PR, AR and TIC
        in that order. A fully checked link is written and logged; when a
        lookup fails the link is queued again and the proxy is rotated.
        The thread ends when a stop is requested or the queue is empty
        (in which case it triggers ``stop`` itself).
        """
        lookups = (
            ("PR", self.__properties.GetGooglePR),
            ("AR", self.__properties.GetAlexaRank),
            ("TIC", self.__properties.GetYandexTIC),
        )
        while True:
            if self.__STOP:
                self.__dump_unchecked()
                return
            if self.__PAUSE:
                self.__sleep(1)
                # Re-check the flags instead of falling through; otherwise
                # a "paused" worker still processes one link per second.
                continue
            if self.__proxys.USE and not self.__current_proxy:
                self.__rotate_proxy()
            if self.__current_proxy in self.__NO_PROXY_MARKS:
                # No proxies left: flush the queue, so the next get_next
                # returns nothing and the run is stopped below.
                self.__dump_unchecked()
            current_seq = self.__links.get_next(remove=True)
            if not current_seq:
                if not self.__STOP:
                    self.stop()
                return
            for field, lookup in lookups:
                value = lookup(current_seq["url"], self.__current_proxy)
                if value in self.__FAILURES:
                    # Retry this link later, possibly through another proxy.
                    self.__links.append(current_seq)
                    self.__rotate_proxy()
                    break
                current_seq[field] = value
            else:
                self.__writer.write_one(current_seq)
                self.__gui.insert_into_log(u"{url}|{PR}|{AR}|{TIC}\n".format(url=current_seq["url"], PR=current_seq["PR"], AR=current_seq["AR"], TIC=current_seq["TIC"]))

    def start(self, settings):
        """Apply *settings*, load links and proxies, start the workers.

        *settings* is the dict built by the GUI with the keys
        ``threads_count``, ``datatype``, ``encoding``, ``bdname``,
        ``proxysname`` and ``outfilename``.
        """
        self.__gui.make_working()
        encoding = settings["encoding"]
        out_name = settings["outfilename"]
        # WRITER appends ".csv" itself; drop the extension added by the
        # save dialog so the output is not named "name.csv.csv".
        if out_name.lower().endswith(".csv"):
            out_name = out_name[:-4]
        self.__writer.defaultencoding = encoding
        self.__writer.defaultout = out_name
        self.__links.defaultencoding = encoding
        self.__dbmloader.defaultencoding = encoding
        self.__textloader.defaultencoding = encoding
        if settings["datatype"] == u"DBM":
            self.__dbmloader.load(settings["bdname"], to_=self.__links)
        if settings["datatype"] == u"TXT":
            self.__textloader.load(settings["bdname"], to_=self.__links)
        # Truthiness test: a cancelled file dialog yields an empty value
        # rather than None, and opening "" would raise.
        if settings["proxysname"]:
            with open(u"{filename}".format(filename=settings["proxysname"]).encode(settings["encoding"])) as prx:
                self.__proxys.load(prx)
            if len(self.__proxys) > 0:
                self.__proxys.USE = True
        for _ in range(settings["threads_count"]):
            self.__threading.Thread(target=self.__worker).start()

    def pause(self, value):
        """Set the pause flag that the workers poll."""
        self.__PAUSE = value

    def stop(self):
        """Request a stop, wait for the workers and reset all state.

        Waits until at most two threads are alive before resetting.
        """
        self.__gui.make_unactive()
        self.__STOP = True
        while self.__threading.active_count() > 2:
            self.__sleep(1)
        self.__PAUSE = False
        self.__STOP = False
        self.__proxys.USE = False
        self.__current_proxy = None
        self.__writer.defaultout = "dump"
        self.__gui.make_reset()

    def start_process(self):
        """Hook the callbacks into the GUI and run the Tk main loop."""
        self.__gui.start = self.start
        self.__gui.pause = self.pause
        self.__gui.stop = self.stop
        gui_window = self.__gui.create()
        gui_window.protocol("WM_DELETE_WINDOW", self.__exit)
        gui_window.mainloop()
if __name__ == "__main__":
    # Script entry point: build the application object and hand control
    # over to it (start_process runs the GUI main loop).
    application = ARCHITECTURE()
    application.start_process()
# Script for bulk checking of Google PageRank, Alexa Rank and Yandex TIC.