class ProxyChecker Spider def setup self from_list to_list proxy_type

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class ProxyChecker(Spider):
    """Filter a proxy list down to the proxies that can load yelp.com.

    Reads proxy lines from ``from_list``, requests ``http://yelp.com``
    through each one, and writes the proxies whose response passed the
    checks in ``task_yelp`` to ``to_list``, one per line.

    NOTE(review): ``prepare``, ``task_generator``, ``task_yelp`` and
    ``shutdown`` are presumably hooks invoked by the ``Spider`` base class
    (defined outside this file) -- confirm against the Grab documentation.
    """

    def setup(self, from_list, to_list, proxy_type='http'):
        """Store the checker configuration on the instance.

        :param from_list: path of the text file to read proxy lines from.
        :param to_list: path of the text file the working proxies are
            written to (opened with mode ``'w'``, so it is overwritten).
        :param proxy_type: proxy type passed to
            ``Grab.setup_with_proxyline`` for every proxy.
        """
        self.proxy_type = proxy_type
        self.from_list = from_list
        self.to_list = to_list

    def prepare(self):
        """Initialise the list that collects working proxies."""
        self.proxies = []

    def shutdown(self):
        """Write every collected proxy to ``to_list``, one per line."""
        with open(self.to_list, 'w') as f:
            for proxy in self.proxies:
                f.write(proxy + '\n')

    def task_generator(self):
        """Yield one ``'yelp'`` task per non-blank line of ``from_list``.

        Each task carries its own ``Grab`` instance configured with the
        proxy from that line and pointed at ``http://yelp.com``.
        """
        with open(self.from_list, 'r') as f:
            for line in f:
                proxyline = line.strip()
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no proxy; skip them instead of configuring a
                # Grab instance with an empty proxy string.
                if not proxyline:
                    continue
                g = Grab()
                g.setup_with_proxyline(proxyline, proxy_type=self.proxy_type)
                g.setup(reuse_cookies=True)
                g.setup(url='http://yelp.com')
                yield Task('yelp', grab=g)

    def task_yelp(self, grab, task):
        """Record the proxy used by ``grab`` if the response looks valid.

        A response is accepted when its status code is 200 and its body
        contains the text ``'About Yelp'``. Accepted proxies are appended
        to ``self.proxies`` as ``proxy:userpwd`` when ``proxy_userpwd`` is
        set in the grab config, otherwise as ``proxy`` alone.

        :param grab: the ``Grab`` instance that performed the request.
        :param task: the task that produced this response (unused).
        """
        if grab.response.code != 200:
            return
        # Explicit check rather than ``assert``: assertions are stripped
        # when Python runs with -O, which would accept every 200 response.
        if 'About Yelp' not in grab.response.body:
            return
        proxy = grab.config['proxy']
        userpwd = grab.config['proxy_userpwd']
        if userpwd:
            self.proxies.append('%s:%s' % (proxy, userpwd))
        else:
            self.proxies.append('%s' % (proxy,))