coding utf-8 from spiders base import BaseSpider from grab spider impo

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
# -*- coding: utf-8 -*-
from spiders.base import BaseSpider
from grab.spider import Task
import logging
import random
import re
from database import Candidate
class Parser(BaseSpider):
    """Spider that logs into recruiter.totaljobs.com, runs an advanced
    candidate search, stores new candidates in the database and follows
    result-page pagination until a configured limit is reached.

    Credentials and search parameters come from ``self.settings``
    (login, password, industry, time_limit, limit) — provided by BaseSpider.
    """

    initial_urls = ['https://recruiter.totaljobs.com/login']

    def task_initial(self, grab, task):
        """Submit the login form using credentials from ``self.settings``."""
        grab.setup(debug_post=True)
        grab.setup(log_dir='log')
        # Harvest the hidden ASP.NET / Eloqua fields from the login page so
        # the POST matches what a real browser would send.
        fields = grab.form_fields()
        url = 'https://recruiter.totaljobs.com/login'
        data = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': fields['__VIEWSTATE'],
            'pagebody_0$left_0$txtUsername': self.settings.login,
            'pagebody_0$left_0$txtPassword': self.settings.password,
            'pagebody_0$left_0$btnSubmit': '',
            'elqFormName': 'RecruiterLoginAndRegister',
            'elqSiteID': fields['elqSiteID'],
            'elqDefaultTargetURL': '',
            'elqPost': '',
            'elqCustomerGUID': '',
            'elqCookieWrite': '0',
            'pagebody_0$right_0$chkThirdPartyMarketing': 'on',
        }
        grab.setup(post=data, url=url)
        yield Task('check', grab=grab)

    def task_check(self, grab, task):
        """Verify the login succeeded, then open the candidate search page."""
        # The logout link only appears for authenticated sessions.
        if not grab.search(u'FormsLogout.aspx'):
            logging.error('Can not log in!')
            # NOTE(review): the crawl still continues after a failed login;
            # confirm whether it should abort here instead.
        url = 'https://recruiter.totaljobs.com/Recruitment/CandidateSearch/CandidateSearch.aspx'
        grab.setup(url=url)
        yield Task('advanced_search', grab=grab)

    def task_advanced_search(self, grab, task):
        """Fill in the search form (industry + recency) and submit it."""
        settings = self.settings
        data = {
            'ctl00$cphCentralPanel$ucSearchPart$ddlIndustries': str(settings.industry),
            'ctl00$cphCentralPanel$ucSearchPart$ddlUpdated': str(settings.time_limit),
        }
        # make_request=False: prepare the POST body only; the Task below
        # performs the actual request.
        grab.submit(make_request=False, extra_post=data)
        yield Task('candidates', grab=grab)

    def task_candidates(self, grab, task):
        """Scrape candidate cards from a results page, save new candidates,
        and schedule the next results page while one exists."""
        for card in grab.doc.select('//div[@class="row card-row"]'):
            name = card.select('.//a[@class="candidate-lnk"]').text()
            email = card.select('.//a[contains(@class,"email-candidate")]').text()
            # Anonymous profiles are displayed as "Candidate ..." — skip them.
            if name.startswith('Candidate'):
                logging.debug('Noname candidate: %s' % name)
                continue
            if Candidate.select().where(Candidate.name == name).exists():
                logging.debug('Candidate: %s already exists' % name)
            else:
                # NOTE(review): self.num is assumed to be initialised by
                # BaseSpider — confirm, otherwise this raises AttributeError.
                self.num += 1
                c = Candidate(name=name, email=email)
                c.save()
                limit = self.settings.limit
                if limit and self.num > limit:
                    self.stop()
                    logging.debug('Limit %s has been reached. Script is stopping' % limit)
        next_page = grab.doc.select('//li[@class="paging-forward"]')
        if next_page:
            link = next_page.select('./a/@href').text()
            # The href is a javascript __doPostBack(...) call whose argument
            # ends with the target page number, e.g. "...'Page$3')".
            match = re.search(r"(\d+)'\)$", link)
            if match is None:
                # Bug fix: the original dereferenced a None match and crashed
                # with AttributeError when the link format changed; stop
                # paginating gracefully instead.
                logging.error('Can not extract next page number from: %s' % link)
                return
            data = {
                '__EVENTTARGET': 'ctl00$cphCentralPanel$ucSearchResults$pgrPager',
                '__EVENTARGUMENT': str(match.group(1)),
                'ctl00$cphCentralPanel$NewOrExistingSavedSearch':
                    'rdoNewSavedSearch',
            }
            # These refine-search fields break the server-side postback, so
            # strip them from the prepared POST body after submit().
            removed_fields = (
                'ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlIndustries',
                'ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlMaxValue',
                'ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlMinValue',
                'ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlResidence',
                'ctl00$cphCentralPanel$ddlExistingSearches',
                'ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlLanguages',
                'ctl00$cphCentralPanel$btnSave',
            )
            grab.submit(make_request=False, extra_post=data)
            # Rebuild the post list without the removed fields (the original
            # shadowed the builtin `tuple` as its loop variable).
            grab.config['post'] = [
                item for item in grab.config['post']
                if item[0] not in removed_fields
            ]
            yield Task('candidates', grab=grab)