usr bin python coding utf-8 from optparse import OptionParser from gra

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/python
# -*- coding: utf-8 -*-
from optparse import OptionParser
from grab import Grab
from grab.spider import Spider, Task
from grab.tools.logs import default_logging
from grab.spider.base import logger_verbose
import logging
import pycurl
from spiders.categories import CategorySpider
from spiders.items import ItemSpider
from spiders.details import DetailSpider
from spiders.market_categories import MarketCategorySpider
from config import spider_params, Session
def build_option_parser():
    """Return the command-line parser used to choose which spider to run.

    Three boolean flags are registered, each defaulting to ``False``:
    ``-i/--items``, ``-d/--details`` and ``-y/--yandex_categories``.
    """
    parser = OptionParser()
    flag_specs = (
        ("-i", "--items", "items"),
        ("-d", "--details", "details"),
        ("-y", "--yandex_categories", "yandex_categories"),
    )
    for short_flag, long_flag, dest in flag_specs:
        parser.add_option(short_flag, long_flag, action="store_true",
                          dest=dest, default=False)
    return parser


def main(argv=None):
    """Configure logging, build the selected spider, run it and print stats.

    :param argv: list of command-line arguments to parse; ``None`` (the
        default) makes ``optparse`` read ``sys.argv[1:]``.

    The spider is chosen by the first flag that is set, checked in the order
    items, details, yandex_categories; with no flag the category spider runs.
    """
    # logging.INFO == 20, the numeric level this script has always used.
    default_logging(grab_log='/tmp/grab.log', level=logging.INFO, mode='a',
                    propagate_network_logger=True,
                    network_log='/tmp/grab.network.log')

    # Positional arguments are accepted by optparse but not used here.
    options, _ignored_args = build_option_parser().parse_args(argv)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    if options.items:
        print("Scrape items")
        bot = ItemSpider(**spider_params())
    elif options.details:
        print("Scrape details")
        bot = DetailSpider(**spider_params())
    elif options.yandex_categories:
        print("Scrape market categories")
        bot = MarketCategorySpider(**spider_params())
    else:
        print("Scrape categories")
        bot = CategorySpider(**spider_params())

    # Left disabled by the original author:
    # bot.config['body_maxsize'] = 5000
    bot.load_proxylist('proxy_16_03_2015.txt', 'text_file', auto_change=True)
    bot.setup_grab(
        timeout=100,
        connect_timeout=20,
        debug=True,
        encoding='gzip',
        log_dir='/tmp/responses',
        reuse_cookies=False,
        user_agent_file='user-agent.txt',
        headers={
            'Proxy-Connection': '',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Keep-Alive': '',
        },
    )

    try:
        bot.run()
    except KeyboardInterrupt:
        # Deliberate: Ctrl-C stops the crawl, and the statistics gathered so
        # far are still printed below instead of ending with a traceback.
        pass
    print(bot.render_stats())


if __name__ == '__main__':
    main()