# -*- coding: utf-8 -*-
"""
Crawl market categories
"""
from models.base_category import BaseCategory
from grab.spider import Spider, Task
from spiders.base import BaseSpider
from config import Session
from datetime import datetime, timedelta
from time import gmtime, strftime
import models.market as yandex
import logging
from random import randint
from grab.proxylist import ProxyList, ProxySource
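
# models.market (imported as "yandex") is expected to provide MARKET_URL and the
# MARKET_XPATH_CATEGORY_* / MARKET_XPATH_CATEGORY_CONTENT selectors used below.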
PROXY_TIMEOUT = 360

logger = logging.getLogger('grab.spider.base_category')

class MarketCategorySpider(BaseSpider):

    def prepare(self):
        # Recently used proxies, keyed by proxy address, with the time of last use.
        self.used_proxies = {}

    def task_generator(self):
        logger.debug('CREATING SESSION FOR BASE CATEGORIES')
        session = Session()
        yield Task('parse_categories_1', url=yandex.MARKET_URL, session=session)
    def task_parse_categories_1(self, grab, task):
        logger.info(strftime("%Y-%m-%d %H:%M:%S", gmtime()) +
                    ' / YANDEX MARKET / ' + task.url.encode('utf8'))
        # Parse level-1 categories
        categories = None
        try:
            categories = grab.doc.select(yandex.MARKET_XPATH_CATEGORY_1)
        except IndexError:
            logger.error(strftime("%Y-%m-%d %H:%M:%S", gmtime()) +
                         ' / NO CATEGORIES_1 IN ' + task.url.encode('utf8'))
        except AttributeError:
            logger.error(strftime("%Y-%m-%d %H:%M:%S", gmtime()) +
                         ' / NO RULE MARKET_XPATH_CATEGORY_1')
        if categories:
            for cat in categories:
                try:
                    name = cat.select(yandex.MARKET_XPATH_CATEGORY_1_TEXT).text().strip()
                except IndexError:
                    # A category without a name cannot be hashed or saved, skip it.
                    continue
                try:
                    cat.select(yandex.MARKET_XPATH_CATEGORY_1_LINK).text().strip()
                except IndexError:
                    # No own link: descend straight into the level-2 categories.
                    for new_task in self.parse_categories(grab, cat, None, 2,
                                                          task.session, task.url):
                        yield new_task
                else:
                    hash_id = unicode(name) + ' / '
                    data = {
                        'name': name,
                        'hash': hash_id,
                    }
                    self.save(data, BaseCategory, task.session, task.url)
                    for new_task in self.parse_categories(grab, cat, hash_id, 2,
                                                          task.session, task.url):
                        yield new_task
    def parse_categories(self, grab, node, parent_hash, level, session, url):
        xpath = yandex.MARKET_XPATH_CATEGORY_2
        xpath_text = yandex.MARKET_XPATH_CATEGORY_2_TEXT
        xpath_link = yandex.MARKET_XPATH_CATEGORY_2_LINK
        if level == 3:
            xpath = yandex.MARKET_XPATH_CATEGORY_3
            xpath_text = yandex.MARKET_XPATH_CATEGORY_3_TEXT
            xpath_link = yandex.MARKET_XPATH_CATEGORY_3_LINK
        categories = None
        try:
            categories = node.select(xpath)
        except IndexError:
            logger.error(strftime("%Y-%m-%d %H:%M:%S", gmtime()) +
                         ' / NO CATEGORIES_2 IN ' + url.encode('utf8'))
        except AttributeError:
            logger.error(strftime("%Y-%m-%d %H:%M:%S", gmtime()) +
                         ' / NO RULE MARKET_XPATH_CATEGORY_2')
        if categories:
            for cat in categories:
                try:
                    name = cat.select(xpath_text).text().strip()
                    link = cat.select(xpath_link).text().strip()
                except IndexError:
                    yield None
                else:
                    hash_id = unicode(name) + ' / ' + unicode(link)
                    data = {
                        'name': name,
                        'link': link,
                        'hash': hash_id,
                        'parent_hash': parent_hash
                    }
                    yield Task('parse_category_models',
                               url=grab.make_url_absolute(link),
                               session=session,
                               data=data,
                               delay=randint(10, 50),
                               # delay=PROXY_TIMEOUT,
                               raw=True)
        else:
            yield None
    def task_parse_category_models(self, grab, task):
        # Remember which proxy served this request so change_proxy() does not
        # pick it again before PROXY_TIMEOUT expires.
        try:
            proxy, proxy_userpwd, proxy_type = self.proxy
            self.used_proxies[proxy] = datetime.now()
        except Exception:
            pass
        if grab.response.code == 302:
            # A redirect is treated as a failed fetch; retry the same URL later.
            self.tasks_failed += 1
            # logger.error('FAILED: ' + task.url)
            yield Task('parse_category_models',
                       url=task.url,
                       session=task.session,
                       data=task.data,
                       delay=randint(10, 50),
                       # delay=PROXY_TIMEOUT,
                       raw=True)
        else:
            logger.info(
                strftime("%Y-%m-%d %H:%M:%S", gmtime()) + ' / (' +
                str(self.items_total) + ') / parse_category_models / ' + task.url.encode('utf8'))
            try:
                task.data['link'] = grab.doc.select(
                    yandex.MARKET_XPATH_CATEGORY_CONTENT).text().strip()
            except IndexError:
                # No content block: an intermediate category, save it and
                # descend into level-3 subcategories.
                task.data['link'] = None
                self.save(task.data, BaseCategory, task.session, task.url)
                for new_task in self.parse_categories(grab, grab.doc, task.data['hash'],
                                                      3, task.session, task.url):
                    yield new_task
            else:
                self.save(task.data, BaseCategory, task.session, task.url)
    def task_parse_category_models_fallback(self, task):
        self.tasks_failed += 1
        logger.error('FAILED: ' + task.url)

    # Log total count after finish
    def shutdown(self):
        logger.info('TOTAL INSERTED: ' + str(self.items_total))
        logger.debug('TOTAL FAILED: ' + str(self.tasks_failed))
    def change_proxy(self, task, grab):
        """
        Assign a new proxy from the proxylist to the task.

        A proxy already recorded in used_proxies is picked again only after
        PROXY_TIMEOUT seconds have passed since its last use.
        """
        if task.use_proxylist and self.proxylist_enabled:
            if self.proxy_auto_change:
                proxy_item = None
                while True:
                    proxy_item = self.proxylist.get_next()
                    proxy, proxy_userpwd, proxy_type = proxy_item
                    if proxy not in self.used_proxies:
                        break
                    elif datetime.now() - self.used_proxies[proxy] > timedelta(seconds=PROXY_TIMEOUT):
                        break
                self.proxy = proxy_item
            if self.proxy:
                proxy, proxy_userpwd, proxy_type = self.proxy
                grab.setup(proxy=proxy, proxy_userpwd=proxy_userpwd,
                           proxy_type=proxy_type)
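

# Minimal runner sketch: it assumes that BaseSpider (spiders.base) forwards
# keyword arguments such as thread_number to grab.spider.Spider and that
# database setup happens inside config.Session; the proxy file path below is
# purely illustrative.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    bot = MarketCategorySpider(thread_number=2)
    # Optionally enable proxy rotation via grab's proxylist support
    # (plain text file, one proxy per line); the path is an assumption.
    # bot.load_proxylist('var/proxies.txt', 'text_file')
    bot.run()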