# -*- coding: utf-8 -*-
"""
Crawl market categories
"""
import logging
from datetime import datetime, timedelta
from random import randint
from time import gmtime, strftime

from grab.spider import Task

import models.market as yandex
from config import Session
from models.base_category import BaseCategory
from spiders.base import BaseSpider
# Seconds a proxy stays "cooling down" after use before it may be reassigned.
PROXY_TIMEOUT = 360
logger = logging.getLogger('grab.spider.base_category')
class MarketCategorySpider(BaseSpider):
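    """
    Walk the Yandex.Market category tree: level 1 from the main page,
    levels 2 and 3 from category pages, saving each category as a
    BaseCategory record keyed by a name/link hash.
    """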
def prepare(self):
self.used_proxies = {}
def task_generator(self):
logger.debug('CREATING SESSION FOR BASE CATEGORIES')
session = Session()
yield Task('parse_categories_1', url=yandex.MARKET_URL, session=session)
def task_parse_categories_1(self, grab, task):
logger.info(strftime("%Y-%m-%d %H:%M:%S", gmtime()) + ' / YANDEX MARKET / ' + task.url.encode('utf8'))
# Parse level_1 categories
categories = None
try:
categories = grab.doc.select(yandex.MARKET_XPATH_CATEGORY_1)
except IndexError:
logger.error(strftime("%Y-%m-%d %H:%M:%S",
gmtime()) + ' / NO CATEGORIES_1 IN ' + task.url.encode('utf8'))
except AttributeError:
logger.error(strftime("%Y-%m-%d %H:%M:%S",
gmtime()) + ' / NO RULE MARKET_XPATH_CATEGORY_1')
if categories:
for cat in categories:
                try:
                    name = cat.select(yandex.MARKET_XPATH_CATEGORY_1_TEXT).text().strip()
                except IndexError:
                    # A nameless node cannot be hashed or saved, skip it.
                    continue
                try:
                    # The link is only probed to tell headline text apart
                    # from real category links; its value is unused here.
                    cat.select(yandex.MARKET_XPATH_CATEGORY_1_LINK).text().strip()
                except IndexError:
                    # No link: recurse into level 2 without a parent hash.
                    # Use a fresh loop variable so `task` is not shadowed.
                    for new_task in self.parse_categories(grab, cat, None, 2, task.session, task.url):
                        yield new_task
else:
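                    # The level-1 link is not stored, so the hash keeps an
                    # empty link part after the separator.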
hash_id = unicode(name) + ' / '
data = {
'name': name,
'hash': hash_id,
}
self.save(data, BaseCategory, task.session, task.url)
for new_task in self.parse_categories(grab, cat, hash_id, 2, task.session, task.url):
yield new_task
def parse_categories(self, grab, node, parent_hash, level, session, url):
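        """
        Yield tasks for level 2 or level 3 subcategories found under `node`,
        chaining each saved category to its parent via `parent_hash`.
        """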
xpath = yandex.MARKET_XPATH_CATEGORY_2
xpath_text = yandex.MARKET_XPATH_CATEGORY_2_TEXT
xpath_link = yandex.MARKET_XPATH_CATEGORY_2_LINK
if level == 3:
xpath = yandex.MARKET_XPATH_CATEGORY_3
xpath_text = yandex.MARKET_XPATH_CATEGORY_3_TEXT
xpath_link = yandex.MARKET_XPATH_CATEGORY_3_LINK
categories = None
try:
categories = node.select(xpath)
        except IndexError:
            logger.error(strftime("%Y-%m-%d %H:%M:%S", gmtime()) +
                         ' / NO CATEGORIES_%d IN ' % level + url.encode('utf8'))
        except AttributeError:
            logger.error(strftime("%Y-%m-%d %H:%M:%S", gmtime()) +
                         ' / NO RULE MARKET_XPATH_CATEGORY_%d' % level)
if categories:
for cat in categories:
try:
name = cat.select(xpath_text).text().strip()
link = cat.select(xpath_link).text().strip()
                except IndexError:
                    # Skip nodes missing a name or link; yielding None
                    # would poison the spider's task queue.
                    continue
else:
hash_id = unicode(name) + ' / ' + unicode(link)
data = {
'name': name,
'link': link,
'hash': hash_id,
'parent_hash': parent_hash
}
yield Task('parse_category_models',
url=grab.make_url_absolute(link),
session=session,
data=data,
delay=randint(10, 50),
# delay=PROXY_TIMEOUT,
raw=True)
def task_parse_category_models(self, grab, task):
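        """
        Handle one category page: requeue on a 302 response (blocked proxy),
        otherwise save the category and descend into level 3.
        """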
        try:
            # Record when the current proxy was used so change_proxy()
            # will not hand it out again until PROXY_TIMEOUT passes.
            proxy, proxy_userpwd, proxy_type = self.proxy
            self.used_proxies[proxy] = datetime.now()
        except (AttributeError, TypeError):
            # No proxy assigned to this request yet.
            pass
        if grab.response.code == 302:
            # A redirect usually means the proxy was blocked; count the
            # failure and requeue the same URL (raw=True keeps the 302
            # from being followed automatically).
            self.tasks_failed += 1
yield Task('parse_category_models',
url=task.url,
session=task.session,
data=task.data,
delay=randint(10, 50),
# delay=PROXY_TIMEOUT,
raw=True)
else:
logger.info(
strftime("%Y-%m-%d %H:%M:%S", gmtime()) + ' / ('+
str(self.items_total) + ') / parse_category_models / ' + task.url.encode('utf8'))
try:
task.data['link'] = grab.doc.select(yandex.MARKET_XPATH_CATEGORY_CONTENT).text().strip()
            except IndexError:
                # No content block: an intermediate category page. Save it
                # and descend into its level-3 subcategories.
                task.data['link'] = None
                self.save(task.data, BaseCategory, task.session, task.url)
                for new_task in self.parse_categories(grab, grab.doc, task.data['hash'], 3, task.session, task.url):
                    yield new_task
            else:
                # Content block found: a leaf category, just save it.
                self.save(task.data, BaseCategory, task.session, task.url)
def task_parse_category_models_fallback(self, task):
self.tasks_failed += 1
logger.error('FAILED: ' + task.url)
    # Log totals once the crawl finishes.
def shutdown(self):
logger.info('TOTAL INSERTED: ' + str(self.items_total))
logger.debug('TOTAL FAILED: ' + str(self.tasks_failed))
def change_proxy(self, task, grab):
"""
Assign new proxy from proxylist to the task.
"""
if task.use_proxylist and self.proxylist_enabled:
if self.proxy_auto_change:
                proxy_item = None
                while True:
                    # NOTE: this blocks until some proxy has "cooled down",
                    # i.e. was last used more than PROXY_TIMEOUT seconds ago.
                    proxy_item = self.proxylist.get_next()
                    proxy, proxy_userpwd, proxy_type = proxy_item
                    if proxy not in self.used_proxies:
                        break
                    elif datetime.now() - self.used_proxies[proxy] > timedelta(seconds=PROXY_TIMEOUT):
                        break
                self.proxy = proxy_item
if self.proxy:
proxy, proxy_userpwd, proxy_type = self.proxy
grab.setup(proxy=proxy, proxy_userpwd=proxy_userpwd,
proxy_type=proxy_type)
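

# A minimal launch sketch, not part of the spider itself. Assumptions:
# BaseSpider accepts Grab's usual Spider kwargs (e.g. thread_number), and a
# plain-text proxy list file named proxies.txt exists; adjust both as needed.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    bot = MarketCategorySpider(thread_number=2)
    # load_proxylist() is Grab's stock helper for text-file proxy sources;
    # auto_change stays on so change_proxy() above rotates proxies per task.
    bot.load_proxylist('proxies.txt', 'text_file', proxy_type='http')
    bot.run()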