# -*- coding: utf-8 -*-
"""
Crawl Yandex Market categories.
"""
from datetime import datetime, timedelta
from random import randint
from time import gmtime, strftime
import logging

from grab.spider import Task

from config import Session
from models.base_category import BaseCategory
from spiders.base import BaseSpider
import models.market as yandex

# Minimum number of seconds before a proxy may be reused.
PROXY_TIMEOUT = 360

logger = logging.getLogger('grab.spider.base_category')


class MarketCategorySpider(BaseSpider):

    def prepare(self):
        # Maps proxy address -> datetime of its last use, so change_proxy()
        # can avoid reusing a proxy before PROXY_TIMEOUT expires.
        self.used_proxies = {}
        self.proxy = None

    def task_generator(self):
        logger.debug('CREATING SESSION FOR BASE CATEGORIES')
        session = Session()
        yield Task('parse_categories_1', url=yandex.MARKET_URL,
                   session=session)

    def task_parse_categories_1(self, grab, task):
        logger.info(strftime("%Y-%m-%d %H:%M:%S", gmtime()) +
                    ' / YANDEX MARKET / ' + task.url.encode('utf8'))
        # Parse level-1 categories.
        categories = None
        try:
            categories = grab.doc.select(yandex.MARKET_XPATH_CATEGORY_1)
        except IndexError:
            logger.error(strftime("%Y-%m-%d %H:%M:%S", gmtime()) +
                         ' / NO CATEGORIES_1 IN ' + task.url.encode('utf8'))
        except AttributeError:
            logger.error(strftime("%Y-%m-%d %H:%M:%S", gmtime()) +
                         ' / NO RULE MARKET_XPATH_CATEGORY_1')
        if categories:
            for cat in categories:
                try:
                    name = cat.select(
                        yandex.MARKET_XPATH_CATEGORY_1_TEXT).text().strip()
                except IndexError:
                    # A category without a name cannot be hashed or saved.
                    continue
                try:
                    cat.select(
                        yandex.MARKET_XPATH_CATEGORY_1_LINK).text().strip()
                except IndexError:
                    # No link: descend into level-2 categories without
                    # saving this node as a parent.
                    for new_task in self.parse_categories(
                            grab, cat, None, 2, task.session, task.url):
                        yield new_task
                else:
                    hash_id = unicode(name) + ' / '
                    data = {
                        'name': name,
                        'hash': hash_id,
                    }
                    self.save(data, BaseCategory, task.session, task.url)
                    for new_task in self.parse_categories(
                            grab, cat, hash_id, 2, task.session, task.url):
                        yield new_task

    def parse_categories(self, grab, node, parent_hash, level, session, url):
        """Yield tasks for level-2 or level-3 subcategories of the node."""
        xpath = yandex.MARKET_XPATH_CATEGORY_2
        xpath_text = yandex.MARKET_XPATH_CATEGORY_2_TEXT
        xpath_link = yandex.MARKET_XPATH_CATEGORY_2_LINK
        if level == 3:
            xpath = yandex.MARKET_XPATH_CATEGORY_3
            xpath_text = yandex.MARKET_XPATH_CATEGORY_3_TEXT
            xpath_link = yandex.MARKET_XPATH_CATEGORY_3_LINK
        categories = None
        try:
            categories = node.select(xpath)
        except IndexError:
            logger.error(strftime("%Y-%m-%d %H:%M:%S", gmtime()) +
                         ' / NO CATEGORIES_%d IN ' % level +
                         url.encode('utf8'))
        except AttributeError:
            logger.error(strftime("%Y-%m-%d %H:%M:%S", gmtime()) +
                         ' / NO RULE MARKET_XPATH_CATEGORY_%d' % level)
        if categories:
            for cat in categories:
                try:
                    name = cat.select(xpath_text).text().strip()
                    link = cat.select(xpath_link).text().strip()
                except IndexError:
                    continue
                hash_id = unicode(name) + ' / ' + unicode(link)
                data = {
                    'name': name,
                    'link': link,
                    'hash': hash_id,
                    'parent_hash': parent_hash,
                }
                yield Task('parse_category_models',
                           url=grab.make_url_absolute(link),
                           session=session,
                           data=data,
                           delay=randint(10, 50),
                           raw=True)

    def task_parse_category_models(self, grab, task):
        # Remember when the current proxy was last used; self.proxy is still
        # None if the proxylist is disabled or no proxy was assigned yet.
        try:
            proxy, proxy_userpwd, proxy_type = self.proxy
            self.used_proxies[proxy] = datetime.now()
        except (TypeError, AttributeError):
            pass
        if grab.response.code == 302:
            # A redirect means the proxy was blocked: retry the same URL
            # later, from another proxy.
            self.tasks_failed += 1
            yield Task('parse_category_models',
                       url=task.url,
                       session=task.session,
                       data=task.data,
                       delay=randint(10, 50),
                       raw=True)
        else:
            logger.info(
                strftime("%Y-%m-%d %H:%M:%S", gmtime()) + ' / (' +
                str(self.items_total) + ') / parse_category_models / ' +
                task.url.encode('utf8'))
            try:
                task.data['link'] = grab.doc.select(
                    yandex.MARKET_XPATH_CATEGORY_CONTENT).text().strip()
            except IndexError:
                # No model listing on this page: treat it as an intermediate
                # node and descend into its level-3 subcategories.
                task.data['link'] = None
                self.save(task.data, BaseCategory, task.session, task.url)
                for new_task in self.parse_categories(
                        grab, grab.doc, task.data['hash'], 3,
                        task.session, task.url):
                    yield new_task
            else:
                self.save(task.data, BaseCategory, task.session, task.url)

    def task_parse_category_models_fallback(self, task):
        self.tasks_failed += 1
        logger.error('FAILED: ' + task.url)

    def shutdown(self):
        # Log total counts after the crawl finishes.
        logger.info('TOTAL INSERTED: ' + str(self.items_total))
        logger.debug('TOTAL FAILED: ' + str(self.tasks_failed))

    def change_proxy(self, task, grab):
        """
        Assign a new proxy from the proxylist to the task, skipping any
        proxy used less than PROXY_TIMEOUT seconds ago.
        """
        if task.use_proxylist and self.proxylist_enabled:
            if self.proxy_auto_change:
                while True:
                    proxy_item = self.proxylist.get_next()
                    proxy, proxy_userpwd, proxy_type = proxy_item
                    if proxy not in self.used_proxies:
                        break
                    if (datetime.now() - self.used_proxies[proxy] >
                            timedelta(seconds=PROXY_TIMEOUT)):
                        break
                self.proxy = proxy_item
            if self.proxy:
                proxy, proxy_userpwd, proxy_type = self.proxy
                grab.setup(proxy=proxy, proxy_userpwd=proxy_userpwd,
                           proxy_type=proxy_type)
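

# A minimal entry-point sketch for running this spider. Assumptions not taken
# from the original module: the 'proxies.txt' path, the 'text_file' source
# type, and the thread count are illustrative only; BaseSpider is assumed to
# accept the standard grab.spider.Spider constructor arguments.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    bot = MarketCategorySpider(thread_number=2)
    # load_proxylist() enables self.proxylist_enabled and
    # self.proxy_auto_change, which the change_proxy() override relies on.
    bot.load_proxylist('proxies.txt', 'text_file')
    bot.run()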