# -*- coding: utf-8 -*-
from spiders.base import BaseSpider
from grab.spider import Task
import logging
import random
import re
import json

from database import Candidate


class Parser(BaseSpider):
    """Spider that logs in to recruiter.totaljobs.com and collects candidate profiles."""

    initial_urls = ['https://recruiter.totaljobs.com/login']

    def task_initial(self, grab, task):
        # Dump POST bodies and responses to the log directory for debugging.
        grab.setup(debug_post=True)
        grab.setup(log_dir='log')
        # Collect the hidden ASP.NET fields from the login form.
        fields = grab.form_fields()
        url = 'https://recruiter.totaljobs.com/login'
        data = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': fields['__VIEWSTATE'],
            'pagebody_0$left_0$txtUsername': self.settings.login,
            'pagebody_0$left_0$txtPassword': self.settings.password,
            'pagebody_0$left_0$btnSubmit': '',
            'elqFormName': 'RecruiterLoginAndRegister',
            'elqSiteID': fields['elqSiteID'],
            'elqDefaultTargetURL': '',
            'elqPost': '',
            'elqCustomerGUID': '',
            'elqCookieWrite': '0',
            'pagebody_0$right_0$chkThirdPartyMarketing': 'on',
        }
        grab.setup(post=data, url=url)
        yield Task('check', grab=grab)

    def task_check(self, grab, task):
        # A logged-in page contains a link to FormsLogout.aspx.
        if not grab.search(u'FormsLogout.aspx'):
            logging.error('Cannot log in!')
        url = 'https://recruiter.totaljobs.com/Recruitment/CandidateSearch/CandidateSearch.aspx'
        grab.setup(url=url)
        yield Task('advanced_search', grab=grab)

    def task_advanced_search(self, grab, task):
        settings = self.settings
        # Restrict the search to profiles updated within the configured period.
        grab.set_input('ctl00$cphCentralPanel$ucSearchPart$ddlUpdated',
                       str(settings.time_limit))
        grab.submit(make_request=False)
        yield Task('candidates', grab=grab)

    def task_candidates(self, grab, task):
        xpath = '//div[@class="row card-row"]'
        for s in grab.doc.select(xpath):
            name_xpath = './/a[@class="candidate-lnk"]'
            email_xpath = './/a[contains(@class,"email-candidate")]'
            name = s.select(name_xpath).text()
            email = s.select(email_xpath).text()
            # Skip anonymous profiles such as "Candidate 1234567".
            if name.startswith('Candidate'):
                logging.debug('Noname candidate: %s' % name)
                continue
            if Candidate.select().where(Candidate.name == name).exists():
                logging.debug('Candidate %s already exists' % name)
            else:
                # self.num is expected to be initialised by BaseSpider (not shown here).
                self.num += 1
                c = Candidate(name=name, email=email)
                c.save()
            limit = self.settings.limit
            if limit and self.num > limit:
                self.stop()
                logging.debug('Limit %s has been reached. Script is stopping' % limit)
        # Follow pagination while the "forward" link is present.
        next_page = grab.doc.select('//li[@class="paging-forward"]')
        if next_page:
            link = next_page.select('./a/@href').text()
            # The href ends with an ASP.NET postback call such as "...,'3')".
            next_page_num = re.search(r"(\d+)'\)$", link)
            fields = grab.form_fields()
            data = {
                '__EVENTTARGET': 'ctl00$cphCentralPanel$ucSearchResults$pgrPager',
                '__EVENTARGUMENT': str(next_page_num.group(1)),
            }
            grab.submit(make_request=False, extra_post=data)
            yield Task('candidates', grab=grab)
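

if __name__ == '__main__':
    # Minimal run sketch, assuming BaseSpider (from spiders.base, not shown
    # here) provides self.settings (login, password, time_limit, limit) and
    # self.num. grab.spider subclasses are started by instantiating the
    # class and calling run().
    logging.basicConfig(level=logging.DEBUG)
    bot = Parser(thread_number=2)
    bot.run()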