#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#
# (c) login999
# uasc.org.ua
#
# Multi-threaded checker that collects Google PageRank, Alexa Rank and
# Yandex TIC values for a list of URLs and writes them to CSV files.
# The code targets Python 2 (urllib2, Tkinter, anydbm, xrange).


def _new_record(url, local_path):
    """Build the per-URL record dict shared by the loaders, worker and writer."""
    return {"url": url, "local_path": local_path, "PR": "", "AR": "", "TIC": ""}


# Container class for storing arbitrary items
class CONTAINER:
    """Lock-protected list used as a rotating work queue.

    Every operation that touches the internal list runs under one RLock.
    """

    # Initialisation
    def __init__(self):
        from threading import RLock
        from random import choice
        from random import shuffle
        self.__Lock = RLock()
        self.__Container = []
        self.__Choice = choice
        self.__Shuffle = shuffle
        # Encoding used by load()/dump() when the caller passes none.
        self.defaultencoding = "UTF-8"

    # Load items into the container from a file (any iterable of lines)
    def load(self, data, data_encoding=None):
        """Append every line of *data*, stripped of CR/LF and decoded.

        data          -- iterable of byte strings (e.g. an open file).
        data_encoding -- encoding of the lines; defaults to
                         ``self.defaultencoding``.
        Undecodable bytes are replaced rather than raising.
        """
        if data_encoding is None:
            data_encoding = self.defaultencoding
        with self.__Lock:
            for line in data:
                cleaned = line.replace("\r", "").replace("\n", "")
                self.__Container.append(cleaned.decode(data_encoding, "replace"))

    # Add a single item to the container
    def append(self, some):
        """Append *some* unless an equal item is already stored."""
        with self.__Lock:
            if some not in self.__Container:
                self.__Container.append(some)

    # Empty the container
    def clear(self):
        """Drop all stored items."""
        with self.__Lock:
            self.__Container = []

    # Shuffle the items of the container
    def shuffle(self):
        """Shuffle the stored items in place."""
        with self.__Lock:
            self.__Shuffle(self.__Container)

    # Remove duplicates
    def remove_dupies(self):
        """Remove duplicate items (order is not preserved).

        Items must be hashable because the de-duplication goes through set().
        """
        with self.__Lock:
            self.__Container = list(set(self.__Container))

    # Number of loaded items
    def __len__(self):
        return len(self.__Container)

    # Get the next item
    def get_next(self, remove=False):
        """Return the item at the head of the queue, or None when empty.

        With ``remove=False`` the item is moved to the tail (rotation);
        with ``remove=True`` it is taken out of the container.
        """
        with self.__Lock:
            try:
                line = self.__Container.pop(0)
            except IndexError:
                return None
            if not remove and line is not None:
                self.__Container.append(line)
            return line

    # Get one random item
    def get_random(self, remove=False):
        """Return a random item (None when empty), optionally removing it.

        The pick and the removal happen under the same lock so another
        thread cannot remove the chosen item in between.
        """
        with self.__Lock:
            try:
                line = self.__Choice(self.__Container)
            except IndexError:
                return None
            if remove and line is not None:
                self.__Container.remove(line)
            return line

    # Get all items
    def get_all(self, remove=False):
        """Return all items as a list.

        With ``remove=True`` the container is emptied and the caller gets
        the former contents. With ``remove=False`` a snapshot copy is
        returned so the caller never iterates a list that other threads
        are mutating.
        """
        with self.__Lock:
            if remove:
                items, self.__Container = self.__Container, []
                return items
            return list(self.__Container)

    # Dump all items into a file
    def dump(self, outfile, data_encoding=None, remove=False):
        """Append every item to *outfile*, one per line.

        outfile       -- path of the file, opened in append mode.
        data_encoding -- encoding for the items; defaults to
                         ``self.defaultencoding``.
        remove        -- when true, empty the container afterwards.
        """
        if data_encoding is None:
            data_encoding = self.defaultencoding
        with self.__Lock:
            with open(outfile, "a") as out:
                for item in self.__Container:
                    out.write("{0}\n".format(item.encode(data_encoding, "replace")))
            if remove:
                self.__Container = []


# Class for storing / checking proxies
class PROXYS:
    """Lock-protected pool of HTTP proxies with liveness checking."""

    # Value get_one() returns when the pool is exhausted. The spelling is
    # kept exactly as get_one() has always returned it; compare against this
    # constant instead of re-typing the literal.
    NO_PROXYS_LEFT = "NO_PROXIS_LEFT"

    # Initialisation
    def __init__(self):
        import urllib2
        from threading import RLock
        self.__ProxyHandler = urllib2.ProxyHandler
        self.__Build_Opener = urllib2.build_opener
        self.__Install_Opener = urllib2.install_opener
        self.__Open = urllib2.urlopen
        self.__Request = urllib2.Request
        self.__Lock = RLock()
        self.__Proxyz = []
        # When False, get_one() hands out ``defaultproxy`` without checking.
        self.USE = False
        self.defaultproxy = None

    # Load proxies (from a sequence / iterator)
    def load(self, data):
        """Replace the pool with the lines of *data*, stripped of CR/LF."""
        with self.__Lock:
            self.__Proxyz = [
                line.replace("\r", "").replace("\n", "") for line in data
            ]

    # Empty the pool
    def clear(self):
        """Drop all stored proxies."""
        with self.__Lock:
            self.__Proxyz = []

    # Remove duplicates
    def remove_dupies(self):
        """Remove duplicate proxies (order is not preserved)."""
        with self.__Lock:
            self.__Proxyz = list(set(self.__Proxyz))

    # Number of loaded proxies
    def __len__(self):
        return len(self.__Proxyz)

    # Check one given proxy
    def check(self, proxy):
        """Return True when google.com can be fetched through *proxy*.

        proxy -- "host:port" string.
        Side effect: installs the proxy opener globally via
        urllib2.install_opener. Any error while fetching counts as "dead".
        """
        proxy_handler = self.__ProxyHandler({"http": "http://" + proxy + "/"})
        opener = self.__Build_Opener(proxy_handler)
        self.__Install_Opener(opener)
        try:
            google_url = self.__Open("http://www.google.com/").geturl()
        except Exception:
            return False
        return "google" in google_url

    # Check all loaded proxies
    def check_all(self):
        """Check every stored proxy once, keeping only the live ones.

        Entries of 6 characters or fewer are discarded without checking.
        """
        for _ in xrange(len(self.__Proxyz)):
            with self.__Lock:
                try:
                    proxy = self.__Proxyz.pop(0)
                except IndexError:
                    # Another thread drained the pool in the meantime.
                    break
            if len(proxy) > 6 and self.check(proxy):
                with self.__Lock:
                    self.__Proxyz.append(proxy)

    # Get one live proxy
    def get_one(self):
        """Return a live proxy, rotating it to the tail of the pool.

        Returns ``self.defaultproxy`` when ``self.USE`` is false, and
        ``PROXYS.NO_PROXYS_LEFT`` when the pool runs out. Dead or too-short
        entries are dropped from the pool on the way.
        """
        if not self.USE:
            return self.defaultproxy
        while True:
            with self.__Lock:
                try:
                    proxy = self.__Proxyz.pop(0)
                except IndexError:
                    return self.NO_PROXYS_LEFT
            if len(proxy) > 6 and self.check(proxy):
                with self.__Lock:
                    self.__Proxyz.append(proxy)
                return proxy


# Class for determining Yandex TIC, Google PR, Alexa AR
class SITE_PROPERTIES:
    """Queries Google PageRank, Alexa Rank and Yandex TIC for a URL.

    Each public getter returns the value as a unicode string, or the string
    "CONNECTION_PROBLEMS" when the HTTP request fails.
    """

    # Initialisation
    def __init__(self):
        import re
        import urllib
        import urllib2
        self.__re = re
        self.__urllib = urllib
        self.__urllib2 = urllib2
        # Timeout in seconds passed to every urlopen call.
        self.timeout = 20

    # Install a global urllib2 opener routing HTTP through PROXY (if given)
    def __use_proxy(self, PROXY):
        if not PROXY:
            return
        proxy_handler = self.__urllib2.ProxyHandler(
            {"http": "http://" + PROXY + "/"}
        )
        opener = self.__urllib2.build_opener(proxy_handler)
        self.__urllib2.install_opener(opener)

    # Fetch a URL body; None signals any connection problem
    def __fetch(self, url):
        try:
            return self.__urllib2.urlopen(url, timeout=self.timeout).read()
        except Exception:
            return None

    # Borrowed third-party helper, needed to compute PR
    def __IntStr(self, String, Integer, Factor):
        """Fold *String* into *Integer* with a multiply-mask-add step."""
        for char in String:
            Integer *= Factor
            Integer &= 0xFFFFFFFF
            Integer += ord(char)
        return Integer

    # Borrowed third-party helper, needed to compute PR
    def __HashURL(self, Str):
        """Return the integer hash of *Str* used in the PageRank query."""
        C1 = self.__IntStr(Str, 0x1505, 0x21)
        C2 = self.__IntStr(Str, 0, 0x1003F)
        C1 >>= 2
        C1 = ((C1 >> 4) & 0x3FFFFC0) | (C1 & 0x3F)
        C1 = ((C1 >> 4) & 0x3FFC00) | (C1 & 0x3FF)
        C1 = ((C1 >> 4) & 0x3C000) | (C1 & 0x3FFF)
        T1 = (C1 & 0x3C0) << 4
        T1 |= C1 & 0x3C
        T1 = (T1 << 2) | (C2 & 0xF0F)
        T2 = (C1 & 0xFFFFC000) << 4
        T2 |= C1 & 0x3C00
        T2 = (T2 << 0xA) | (C2 & 0xF0F0000)
        return (T1 | T2)

    # Borrowed third-party helper, needed to compute PR
    def __CheckHash(self, HashInt):
        """Return '7' + check digit + decimal hash, as sent in the ``ch`` field.

        NOTE(review): the original indentation was lost; the nesting of the
        parity adjustment inside ``if 0 != CheckByte`` follows the usual
        form of this checksum -- confirm against the original file.
        """
        HashStr = "%u" % (HashInt)
        Flag = 0
        CheckByte = 0
        # Walk the digits right to left, doubling every second one and
        # summing the digits of the doubled value.
        for digit in reversed(HashStr):
            Byte = int(digit)
            if 1 == (Flag % 2):
                Byte *= 2
                Byte = Byte // 10 + Byte % 10
            CheckByte += Byte
            Flag += 1
        CheckByte %= 10
        if 0 != CheckByte:
            CheckByte = 10 - CheckByte
            if 1 == Flag % 2:
                if 1 == CheckByte % 2:
                    CheckByte += 9
                CheckByte >>= 1
        return '7' + str(CheckByte) + HashStr

    # Borrowed and edited (proxy support) - get Google PR
    def GetGooglePR(self, URL, PROXY=None):
        """Return the Google PageRank of *URL* as a unicode string.

        PROXY -- optional "host:port" HTTP proxy.
        Returns u"0" when the response carries no parsable rank and
        "CONNECTION_PROBLEMS" when the request fails.
        """
        self.__use_proxy(PROXY)
        google_hash = self.__CheckHash(self.__HashURL(URL))
        google_url = 'http://www.google.com/search?client=navclient-auto&features=Rank:&q=info:%s&ch=%s' % (self.__urllib.quote(URL), google_hash)
        page = self.__fetch(google_url)
        if page is None:
            return "CONNECTION_PROBLEMS"
        # The rank is whatever follows the first 9 characters of the
        # stripped response body.
        PR = page.strip()[9:]
        try:
            PR = int(PR)
        except ValueError:
            PR = 0
        return str(PR).decode("UTF-8")

    # Get Alexa Rank
    def GetAlexaRank(self, URL, PROXY=None):
        """Return the Alexa Rank of *URL* as a unicode string.

        PROXY -- optional "host:port" HTTP proxy.
        Returns u"0" when no rank is found in the response and
        "CONNECTION_PROBLEMS" when the request fails.
        """
        self.__use_proxy(PROXY)
        alexa_url = 'http://data.alexa.com/data?cli=10&dat=snbamz&url={URL}'.format(URL=self.__urllib.quote(URL))
        page = self.__fetch(alexa_url)
        if page is None:
            return "CONNECTION_PROBLEMS"
        # NOTE(review): the original pattern was lost (only r'\' survived);
        # this one assumes the response carries the rank as
        # <POPULARITY ... TEXT="N" .../> -- confirm.
        try:
            AR = self.__re.findall(r'<POPULARITY[^>]*?TEXT="(\d+)"', page)[0]
        except IndexError:
            AR = 0
        return str(AR).decode("UTF-8")

    # Get Yandex TIC
    def GetYandexTIC(self, URL, PROXY=None):
        """Return the Yandex TIC of *URL* as a unicode string.

        "http://" is prepended when the URL does not contain it.
        PROXY -- optional "host:port" HTTP proxy.
        Returns u"0" when no value is found in the response and
        "CONNECTION_PROBLEMS" when the request fails.
        """
        if "http://" not in URL:
            URL = "http://{URL}".format(URL=URL)
        self.__use_proxy(PROXY)
        yandex_url = 'http://bar-navig.yandex.ru/u?ver=2&show=32&url={URL}'.format(URL=self.__urllib.quote(URL))
        page = self.__fetch(yandex_url)
        if page is None:
            return "CONNECTION_PROBLEMS"
        # NOTE(review): the original pattern was lost (only r'\' survived);
        # this one assumes the response carries the value as
        # <tcy ... value="N"/> -- confirm.
        try:
            TIC = self.__re.findall(r'<tcy[^>]*?value="(\d+)"', page)[0]
        except IndexError:
            TIC = 0
        return str(TIC).decode("UTF-8")


# Class for loading data from an Apache DBM-format file
class DBM_LOADER:
    """Loads url -> local_path pairs from a DBM database into record dicts."""

    # Initialisation
    def __init__(self):
        import anydbm
        self.__open_database = anydbm.open
        # Used for the file name and for decoding keys and values.
        self.defaultencoding = "UTF-8"

    # Load from the database
    def load(self, db_file, parse=True, to_=None):
        """Read every key/value pair of *db_file* into record dicts.

        db_file -- path of the database (opened with flag "c").
        parse   -- when true, keep only records accepted by martein_parsed().
        to_     -- optional target with an ``append`` method; when given the
                   records go there and None is returned, otherwise a new
                   list is returned.
        """
        data_list = [] if to_ is None else to_
        database = self.__open_database(db_file.encode(self.defaultencoding), "c")
        try:
            for key, value in database.iteritems():
                some_seq = _new_record(
                    key.decode(self.defaultencoding),
                    value.decode(self.defaultencoding),
                )
                if not parse or self.martein_parsed(some_seq):
                    data_list.append(some_seq)
        finally:
            # Release the database handle even when decoding fails.
            database.close()
        if to_ is None:
            return data_list
        # Otherwise the records are already in the caller's target.

    # Filter for database records, made specifically for M@rtein
    def martein_parsed(self, some_seq):
        """Return True when the url equals the last segment of local_path."""
        key = some_seq["url"]
        value = some_seq["local_path"].split("/")
        return key == value[-1]


# Class for loading data from a text file
class TEXT_LOADER:
    """Loads one URL per line from a text file into record dicts."""

    # Initialisation
    def __init__(self):
        self.defaultencoding = "UTF-8"

    # Load from the file
    def load(self, file_, to_=None):
        """Read every line of *file_* into a record dict.

        file_ -- path of the text file.
        to_   -- optional target with an ``append`` method; when given the
                 records go there and None is returned, otherwise a new
                 list is returned.
        """
        data_list = [] if to_ is None else to_
        with open(file_) as input_data:
            for line in input_data:
                url = line.replace("\r", "").replace("\n", "")
                data_list.append(_new_record(url.decode(self.defaultencoding), ""))
        if to_ is None:
            return data_list
        # Otherwise the records are already in the caller's target.


# Class for displaying the GUI
class GUI:
    """Tkinter front end: settings, file pickers, control buttons and a log.

    The owner assigns callables to ``start``, ``pause`` and ``stop``; the
    button handlers run them on separate threads.
    """

    # Initialisation
    def __init__(self):
        import Tkinter
        import tkMessageBox
        import ScrolledText
        import tkFileDialog
        import threading
        self.__tkinter = Tkinter
        self.__messagebox = tkMessageBox
        self.__scrolledtext = ScrolledText
        self.__tkfiledialog = tkFileDialog
        self.__threading = threading
        self.__main_window = Tkinter.Tk()
        self.__main_window.resizable(width=False, height=False)
        self.__main_window.title(u"GooglePageRank|AlexaRank|Тиц")
        self.__main_window["bd"] = 5
        self.__Lock = threading.RLock()
        self.__PAUSE = False
        self.__db_filename = None
        self.__proxys_filename = None
        self.__out_filename = None
        # Callbacks supplied by the owner:
        # start(settings), pause(value), stop().
        self.start = None
        self.pause = None
        self.stop = None

    # Create the settings widgets
    def __create_settings(self):
        tk = self.__tkinter

        # Handler of the "load database" button
        def load_db():
            self.__db_filename = self.__tkfiledialog.askopenfilename()

        # Handler of the "output file" button
        def upload():
            self.__out_filename = self.__tkfiledialog.asksaveasfilename(filetypes=[("Excel", ".csv")], defaultextension=".csv")

        # Handler of the "load proxies" button
        def load_proxys():
            self.__proxys_filename = self.__tkfiledialog.askopenfilename()

        def make_label(text, row):
            label = tk.Label(self.__settings_frame, text=text, width=12, font="system6", anchor="w")
            label.grid(row=row, column=0)
            return label

        def make_button(text, command, row):
            button = tk.Button(self.__butt_frame, width=15, text=text, command=command)
            button.grid(row=row, column=0)
            return button

        self.__settings_frame = tk.Frame(self.__main_window)
        self.__settings_frame.grid(row=0, column=0)
        self.__butt_frame = tk.Frame(self.__main_window)
        self.__butt_frame.grid(row=0, column=1)

        self.__TypeLabel = make_label(u"Тип данных :", 0)
        self.__TypeChoose = tk.Spinbox(self.__settings_frame, state="readonly", wrap=True, width=8)
        self.__TypeChoose["values"] = [u"DBM", u"TXT"]
        self.__TypeChoose.grid(row=0, column=1)

        self.__EncodingLabel = make_label(u"Кодировка :", 1)
        self.__EncodingChoose = tk.Spinbox(self.__settings_frame, state="readonly", wrap=True, width=8)
        self.__EncodingChoose["values"] = [u"CP1251", u"UTF-8"]
        self.__EncodingChoose.grid(row=1, column=1)

        self.__ThreadsLabel = make_label(u"Потоков :", 2)
        self.__ThreadsChoose = tk.Spinbox(self.__settings_frame, state="readonly", wrap=True, from_=1, to_=350, width=8)
        self.__ThreadsChoose.grid(row=2, column=1)

        self.__DBButton = make_button(u"Загрузить БД", load_db, 0)
        self.__ProxysButton = make_button(u"Загрузить proxy", load_proxys, 1)
        self.__OUTButton = make_button(u"Выходной файл", upload, 2)

    # Create the control buttons
    def __create_buttons(self):
        tk = self.__tkinter

        # Handler of the Start button
        def start():
            if not self.__db_filename:
                self.__messagebox.showerror(u"Ошибка", u"Файл БД не указан!")
                return
            if not self.__out_filename:
                self.__messagebox.showerror(u"Ошибка", u"Выходной файл не указан!")
                return
            if not self.__proxys_filename:
                self.__messagebox.showinfo(u"Внимание", u"Проверка будет происходить без proxy!")
            settings = {
                "threads_count": int(self.__ThreadsChoose.get()),
                "datatype": self.__TypeChoose.get(),
                "encoding": self.__EncodingChoose.get(),
                "bdname": self.__db_filename,
                "proxysname": self.__proxys_filename,
                "outfilename": self.__out_filename,
            }
            self.__threading.Thread(target=self.start, args=[settings]).start()

        # Handler of the Pause button (toggles between pause and resume)
        def pause():
            self.__PAUSE = not self.__PAUSE
            self.__PauseButton["text"] = u"Продолжить" if self.__PAUSE else u"Пауза"
            self.__threading.Thread(target=self.pause, args=[self.__PAUSE]).start()

        # Handler of the Stop button
        def stop():
            self.__threading.Thread(target=self.stop).start()

        self.__buttons_frame = tk.Frame(self.__main_window)
        self.__buttons_frame.grid(row=0, column=2)
        self.__StartButton = tk.Button(self.__buttons_frame, width=12, text=u"Старт", command=start)
        self.__StartButton.grid(row=0, column=0)
        self.__PauseButton = tk.Button(self.__buttons_frame, width=12, text=u"Пауза", command=pause, state="disabled")
        self.__PauseButton.grid(row=1, column=0)
        self.__StopButton = tk.Button(self.__buttons_frame, width=12, text=u"Стоп", command=stop, state="disabled")
        self.__StopButton.grid(row=2, column=0)

    # Insert a line into the log
    def insert_into_log(self, line):
        """Append *line* to the end of the log widget."""
        with self.__Lock:
            self.__Log.insert("end", u"{line}".format(line=line))

    # Create the log widget
    def __create_log(self):
        self.__Log = self.__scrolledtext.ScrolledText(self.__main_window, font="system6", width=35, height=12, exportselection=True)
        self.__Log.grid(row=2, column=0, columnspan=3, sticky="we")

    # Build the whole window
    def create(self):
        """Create all widgets and return the main window."""
        self.__create_settings()
        self.__create_buttons()
        self.__create_log()
        return self.__main_window

    # Apply widget states for one of the GUI modes
    def __apply_states(self, start, pause, stop, pickers, spinboxes):
        self.__StartButton["state"] = start
        self.__PauseButton["state"] = pause
        self.__StopButton["state"] = stop
        for button in (self.__DBButton, self.__OUTButton, self.__ProxysButton):
            button["state"] = pickers
        for spinbox in (self.__TypeChoose, self.__ThreadsChoose, self.__EncodingChoose):
            spinbox["state"] = spinboxes

    # Put the GUI into the "working" state
    def make_working(self):
        """Enable Pause/Stop and disable everything else."""
        self.__apply_states("disabled", "normal", "normal", "disabled", "disabled")

    # Put the GUI back into the "default" state
    def make_reset(self):
        """Forget the chosen files and restore the initial widget states."""
        self.__db_filename = None
        self.__proxys_filename = None
        self.__out_filename = None
        self.__PauseButton["text"] = u"Пауза"
        self.__apply_states("normal", "disabled", "disabled", "normal", "readonly")

    # Put the GUI into the "inactive" state
    def make_unactive(self):
        """Disable every control."""
        self.__apply_states("disabled", "disabled", "disabled", "disabled", "disabled")


# Class for writing data into several files
class WRITER:
    """Appends record dicts as ';'-separated rows to "<id>.csv" files.

    One lock per output id serialises the writers of the same file.
    """

    # Initialisation
    def __init__(self):
        from threading import RLock
        self.Lock = RLock  # lock factory, one instance is created per id
        self.Locks = {}
        self.defaultencoding = "UTF-8"
        self.defaultout = "dump"

    # Write a single record
    def write_one(self, data, id_=None, data_encoding=None):
        """Append one record dict; see write_many() for the parameters."""
        self.write_many([data], id_, data_encoding)

    # Write a sequence of records
    def write_many(self, data, id_=None, data_encoding=None):
        """Append every record of *data* to the file "<id_>.csv".

        data          -- iterable of dicts with the keys url, local_path,
                         PR, AR and TIC.
        id_           -- output name without extension; defaults to
                         ``self.defaultout``. ".csv" is always appended.
        data_encoding -- encoding of the written rows; defaults to
                         ``self.defaultencoding``.
        """
        if id_ is None:
            id_ = self.defaultout
        encoding = self.defaultencoding if data_encoding is None else data_encoding
        lock = self.Locks.get(id_)
        if lock is None:
            # setdefault keeps a single lock per id even if two threads
            # get here at the same time.
            lock = self.Locks.setdefault(id_, self.Lock())
        with lock:
            with open(u"{0}.csv".format(id_).encode(self.defaultencoding), "a") as out:
                for item in data:
                    # Turn the record dict into one text row.
                    row = u"{url};{local_path};{PR};{AR};{TIC}".format(url=item["url"], local_path=item["local_path"], PR=item["PR"], AR=item["AR"], TIC=item["TIC"])
                    out.write("{0}\n".format(row.encode(encoding, "replace")))


# Class holding the architecture of the checker
class ARCHITECTURE:
    """Wires GUI, loaders, proxy pool, checker and writer together."""

    # Results that mean "this request did not produce a value".
    __FAILURES = ("FORBIDDEN", "CONNECTION_PROBLEMS")

    # Initialisation
    def __init__(self):
        import sys
        import time
        import threading
        self.__exit = sys.exit
        self.__sleep = time.sleep
        self.__threading = threading
        self.__lock = threading.RLock()
        # Separate lock so stop() never waits behind a slow proxy check.
        self.__stop_lock = threading.Lock()
        self.__gui = GUI()
        self.__proxys = PROXYS()
        self.__writer = WRITER()
        self.__links = CONTAINER()
        self.__dbmloader = DBM_LOADER()
        self.__textloader = TEXT_LOADER()
        self.__properties = SITE_PROPERTIES()
        self.__PAUSE = False
        self.__STOP = False
        self.__current_proxy = None

    # Write everything still queued into the UNCHECKED file
    def __dump_unchecked(self):
        unchecked = self.__links.get_all(remove=True)
        self.__writer.write_many(unchecked, "UNCHECKED")

    # Ask the proxy pool for the next proxy
    def __rotate_proxy(self):
        with self.__lock:
            self.__current_proxy = self.__proxys.get_one()

    # Body of one worker thread
    def __worker(self):
        """Take records off the queue and check them until told to stop.

        A record whose PR, AR or TIC request fails is put back on the queue
        and the proxy is rotated; a fully checked record is written out and
        logged.
        """
        checks = (
            ("PR", self.__properties.GetGooglePR),
            ("AR", self.__properties.GetAlexaRank),
            ("TIC", self.__properties.GetYandexTIC),
        )
        while True:
            if self.__STOP:
                self.__dump_unchecked()
                return
            if self.__PAUSE:
                # Do no work while paused; re-check the flags every second.
                self.__sleep(1)
                continue
            if self.__proxys.USE and not self.__current_proxy:
                self.__rotate_proxy()
            if self.__current_proxy == PROXYS.NO_PROXYS_LEFT:
                # Nothing can be checked without a proxy: save the rest.
                self.__dump_unchecked()
                self.stop()
                return
            current_seq = self.__links.get_next(remove=True)
            if not current_seq:
                self.stop()
                return
            for field, getter in checks:
                result = getter(current_seq["url"], self.__current_proxy)
                if result in self.__FAILURES:
                    self.__links.append(current_seq)
                    self.__rotate_proxy()
                    break
                current_seq[field] = result
            else:
                self.__writer.write_one(current_seq)
                self.__gui.insert_into_log(u"{url}|{PR}|{AR}|{TIC}\n".format(url=current_seq["url"], PR=current_seq["PR"], AR=current_seq["AR"], TIC=current_seq["TIC"]))

    # Start
    def start(self, settings):
        """Load the input data and launch the worker threads.

        settings -- dict with the keys threads_count, datatype, encoding,
                    bdname, proxysname and outfilename.
        """
        self.__gui.make_working()
        self.__writer.defaultencoding = settings["encoding"]
        self.__writer.defaultout = settings["outfilename"]
        self.__links.defaultencoding = settings["encoding"]
        self.__dbmloader.defaultencoding = settings["encoding"]
        self.__textloader.defaultencoding = settings["encoding"]
        if settings["datatype"] == u"DBM":
            self.__dbmloader.load(settings["bdname"], to_=self.__links)
        if settings["datatype"] == u"TXT":
            self.__textloader.load(settings["bdname"], to_=self.__links)
        # A cancelled file dialog yields an empty string, so test truthiness
        # rather than ``is not None``.
        if settings["proxysname"]:
            with open(u"{filename}".format(filename=settings["proxysname"]).encode(settings["encoding"])) as prx:
                self.__proxys.load(prx)
            if len(self.__proxys) > 0:
                self.__proxys.USE = True
        for _ in xrange(settings["threads_count"]):
            self.__threading.Thread(target=self.__worker).start()

    # Pause
    def pause(self, value):
        """Set (True) or clear (False) the pause flag read by the workers."""
        self.__PAUSE = value

    # Stop
    def stop(self):
        """Stop all workers, then reset state and GUI for the next run.

        Only the first caller does the work; a call made while a stop is
        already in progress returns immediately, so several workers hitting
        the end of the queue together cannot wait on each other forever.
        """
        with self.__stop_lock:
            if self.__STOP:
                return
            self.__STOP = True
        self.__gui.make_unactive()
        # Wait until only the main thread and the calling thread remain.
        while self.__threading.active_count() > 2:
            self.__sleep(1)
        self.__PAUSE = False
        self.__STOP = False
        self.__proxys.USE = False
        self.__current_proxy = None
        self.__writer.defaultout = "dump"
        self.__gui.make_reset()

    # Launch everything :)
    def start_process(self):
        """Hook the GUI callbacks up and run the Tk main loop."""
        self.__gui.start = self.start
        self.__gui.pause = self.pause
        self.__gui.stop = self.stop
        gui_window = self.__gui.create()
        gui_window.protocol("WM_DELETE_WINDOW", self.__exit)
        gui_window.mainloop()


if __name__ == "__main__":
    Architecture = ARCHITECTURE()
    Architecture.start_process()