"""
To install ioweb use command: "pip install -U ioweb"
To run this crawler put it into any file inside
"./crawler/" directory.
Make "./crawlers" directory package i.e. put "__init__.py" inside it.
Then run crawler: "crawl HostCrawler -t100"
"""
from ioweb import Crawler, Request

class HostCrawler(Crawler):
    def run_hook(self):
        # Any custom setup goes here
        self.retry_limit = 30
        self.stat.speed_keys += ['foo']
        # Cap on how much of each response body to read; the value
        # here is an assumed example, adjust it to your needs
        self.content_read_limit = 1024 * 1024

    def submit_task_hook(self, req):
        # Here you can customize every request
        # before it is put into the task queue
        req.setup(
            timeout=30,
            connect_timeout=10,
        )
        # Or something like this: pin the request to one of the
        # pre-resolved IP addresses stored in its meta data.
        # To enable, uncomment the lines below and add
        # "from random import choice" to the imports.
        if req.meta.get('ips'):
            #ip = choice(req.meta['ips'])
            #req.setup(
            #    resolve={
            #        req.meta['host']: ip,
            #    },
            #)
            #req.meta['ip_used'] = ip
            pass

    def handler_page(self, req, res):
        # Called for successful responses to requests named "page"
        self.stat.inc('host-ok')

    def rejected_page(self, req, res):
        self.stat.inc('host-rejected')
        # Here you can handle a request that was
        # rejected, i.e. the number of retries
        # reached the limit

    def task_generator(self):
        # "urls" is a placeholder: supply your own iterable of seed URLs
        urls = ['https://example.com/']
        for url in urls:
            yield Request(
                name='page',
                url=url,
                content_read_limit=self.content_read_limit,
                meta={
                    # Any meta data
                },
            )
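
For illustration, here is a variant of the task_generator method above that fills in the meta keys read by the commented-out IP pinning code in submit_task_hook ("host" and "ips"). The seed URL and the address list are placeholders, not part of the original example:

from urllib.parse import urlsplit

def task_generator(self):
    for url in ['https://example.com/']:  # placeholder seed list
        yield Request(
            name='page',
            url=url,
            content_read_limit=self.content_read_limit,
            meta={
                # Hostname used as the key of the resolve= mapping
                'host': urlsplit(url).hostname,
                # Pre-resolved addresses (TEST-NET placeholders);
                # submit_task_hook would pick one with choice()
                'ips': ['203.0.113.10', '203.0.113.11'],
            },
        )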