coding utf-8 from spiders base import BaseSpider from grab spider impo

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# -*- coding: utf-8 -*-
from spiders.base import BaseSpider
from grab.spider import Task
import logging
import random
from database import Candidate
class Parser(BaseSpider):
    """Collect candidate e-mail addresses from recruiter.totaljobs.com.

    Task chain: ``initial`` (log in) -> ``check`` (verify the login) ->
    ``search`` (switch to the advanced form) -> ``advanced_search`` (fill in
    and submit the criteria) -> ``candidates`` (walk the result pages) ->
    ``candidate`` (store one candidate's e-mail).

    Credentials and search criteria are read from ``self.settings``. Stored
    candidates are counted in ``self.num``, which this class never
    initialises (presumably ``BaseSpider`` does -- TODO confirm).
    """

    LOGIN_URL = 'https://recruiter.totaljobs.com/login'
    SEARCH_URL = ('https://recruiter.totaljobs.com/Recruitment/'
                  'CandidateSearch/CandidateSearch.aspx')

    initial_urls = [LOGIN_URL]

    def task_initial(self, grab, task):
        """Build the login POST from the login page and schedule ``check``.

        ``__VIEWSTATE`` and ``elqSiteID`` are copied from the fetched form;
        a ``KeyError`` is raised if the page does not contain them.
        """
        #grab.setup(debug_post=True)
        #grab.setup(log_dir='log')
        fields = grab.doc.form_fields()
        data = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': fields['__VIEWSTATE'],
            'pagebody_0$left_0$txtUsername': self.settings.login,
            'pagebody_0$left_0$txtPassword': self.settings.password,
            'pagebody_0$left_0$btnSubmit': '',
            'elqFormName': 'RecruiterLoginAndRegister',
            'elqSiteID': fields['elqSiteID'],
            'elqDefaultTargetURL': '',
            'elqPost': '',
            'elqCustomerGUID': '',
            'elqCookieWrite': '0',
            'pagebody_0$right_0$chkThirdPartyMarketing': 'on',
        }
        grab.setup(post=data, url=self.LOGIN_URL)
        yield Task('check', grab=grab)

    def task_check(self, grab, task):
        """Verify the login succeeded and schedule the candidate search page.

        The login is considered successful when the response mentions
        ``FormsLogout.aspx``; otherwise an error is logged and no further
        task is scheduled.
        """
        if not grab.search(u'FormsLogout.aspx'):
            logging.error('Can not log in!')
            return
        grab.setup(url=self.SEARCH_URL)
        yield Task('search', grab=grab)

    def task_search(self, grab, task):
        """Post back the ``btnAdvancedSearch`` control and schedule
        ``advanced_search`` with the prepared request."""
        grab.doc.set_input(
            '__EVENTTARGET',
            'ctl00$cphCentralPanel$ucCandidateSearchQuickSearchPart$btnAdvancedSearch')
        grab.doc.submit(make_request=False)
        yield Task('advanced_search', grab=grab)

    def task_advanced_search(self, grab, task):
        """Fill in the advanced search form from settings and schedule
        ``candidates`` with the prepared request."""
        settings = self.settings
        # (form field, setting) pairs, applied in this order.
        form_values = (
            ('ctl00$cphCentralPanel$ucSearchBuilder$rblSearchBy',
             settings.search_by),
            ('ctl00$cphCentralPanel$ucSearchBuilder$ddlLocations',
             settings.location),
            ('ctl00$cphCentralPanel$ucSearchBuilder$txtBoolean',
             settings.keywords),
            ('ctl00$cphCentralPanel$ucSearchBuilder$ddlIndustries',
             settings.industry),
            ('ctl00$cphCentralPanel$ucSearchBuilder$ddlResidence',
             settings.country),
            ('ctl00$cphCentralPanel$ucSearchBuilder$ddlUpdated',
             settings.time_limit),
            ('ctl00$cphCentralPanel$ucSearchBuilder$rptJobTypesHours$ctl00$ddlJobTypeHours',
             settings.job_type),
            ('ctl00$cphCentralPanel$ucSearchBuilder$rptJobTypesHours$ctl01$ddlJobTypeHours',
             settings.job_part),
        )
        for field, value in form_values:
            grab.doc.set_input(field, str(value))

        # Salary fields are only sent when a rate other than '0' is selected.
        extra_post = {}
        if settings.salary != '0':
            extra_post['ctl00$cphCentralPanel$ucSearchBuilder$ddlRate'] = settings.salary
            extra_post['ctl00$cphCentralPanel$ucSearchBuilder$ddlMinValue'] = settings.salary_min
            extra_post['ctl00$cphCentralPanel$ucSearchBuilder$ddlMaxValue'] = settings.salary_max
        grab.doc.submit(make_request=False, extra_post=extra_post)

        if not settings.search_200:
            # Drop the "limit search" checkbox from the prepared POST data.
            # pop() instead of del: the key is absent when the form did not
            # submit the checkbox, and that must not abort the search.
            post = dict(grab.config['post'])
            post.pop('ctl00$cphCentralPanel$ucSearchBuilder$chkLimitSearch', None)
            grab.setup(post=post)
        yield Task('candidates', grab=grab)

    def task_candidates(self, grab, task):
        """Schedule a ``candidate`` task for every new, named candidate on
        the result page, then schedule the next result page if there is one.

        Candidates whose link text starts with ``'Candidate'`` and
        candidates already present in the ``Candidate`` table are skipped.
        """
        for link in grab.doc.select('//td[@class="cellCandidate"]/a'):
            name = link.text()
            if name.startswith('Candidate'):
                logging.debug('Noname candidate: %s', name)
                continue
            if Candidate.select().where(Candidate.name == name).exists():
                logging.debug('Candidate: %s already exists', name)
                continue
            # The postback target is the link's id with '_' replaced by '$'.
            event_target = link.select('@id').text().replace('_', '$')
            grab.doc.set_input('__EVENTTARGET', event_target)
            grab.doc.submit(make_request=False)
            yield Task('candidate', grab=grab, c_name=name)

        if grab.search(u'/Recruitment/img/next.gif'):
            # Random values for the btnForward ".x"/".y" fields (presumably
            # the image button's click coordinates).
            x = str(random.randint(1, 90))
            y = str(random.randint(1, 90))
            data = {
                'ctl00$cphCentralPanel$ucSearchResults$pgrPager$btnForward.x': x,
                'ctl00$cphCentralPanel$ucSearchResults$pgrPager$btnForward.y': y,
            }
            grab.doc.submit(make_request=False, extra_post=data)
            yield Task('candidates', grab=grab)

    def task_candidate(self, grab, task):
        """Store the candidate's e-mail, if the page shows one, and stop the
        spider once ``settings.limit`` candidates have been stored.

        A falsy ``settings.limit`` disables the limit.
        """
        xpath = '//a[@target="winSendEmail" and contains(text(), "@")]'
        match = grab.doc.select(xpath)
        if not match.exists():
            return

        self.num += 1
        Candidate(name=task.c_name, email=match.text()).save()

        limit = self.settings.limit
        # ">=" so the spider stops at the limit itself rather than one
        # candidate past it (assumes self.num starts at 0 -- TODO confirm).
        if limit and self.num >= limit:
            self.stop()
            logging.debug('Limit %s has been reached. Script is stopping', limit)