# -*- coding: utf-8 -*-
from spiders.base import BaseSpider
from grab.spider import Task
import logging
import random
import re
from database import Candidate
class Parser(BaseSpider):
    """Spider that logs in to recruiter.totaljobs.com, runs an advanced
    candidate search, stores previously-unseen candidates in the database
    and follows result pagination until exhausted or a limit is hit.

    Relies on attributes provided by BaseSpider / runtime config:
    ``self.settings`` (login, password, time_limit, industry, limit) and
    ``self.num`` (counter of newly saved candidates) — TODO confirm both
    are initialised by BaseSpider.
    """

    initial_urls = ['https://recruiter.totaljobs.com/login']

    def task_initial(self, grab, task):
        """Submit the ASP.NET login form using credentials from settings."""
        grab.setup(debug_post=True)
        grab.setup(log_dir='log')
        fields = grab.form_fields()
        url = 'https://recruiter.totaljobs.com/login'
        # ASP.NET WebForms postback: __VIEWSTATE and elqSiteID must be
        # echoed back exactly as received in the login page.
        data = {'__EVENTTARGET': '',
                '__EVENTARGUMENT': '',
                '__VIEWSTATE': fields['__VIEWSTATE'],
                'pagebody_0$left_0$txtUsername': self.settings.login,
                'pagebody_0$left_0$txtPassword': self.settings.password,
                'pagebody_0$left_0$btnSubmit': '',
                'elqFormName': 'RecruiterLoginAndRegister',
                'elqSiteID': fields['elqSiteID'],
                'elqDefaultTargetURL': '',
                'elqPost': '',
                'elqCustomerGUID': '',
                'elqCookieWrite': '0',
                'pagebody_0$right_0$chkThirdPartyMarketing': 'on'
                }
        grab.setup(post=data, url=url)
        yield Task('check', grab=grab)

    def task_check(self, grab, task):
        """Verify login succeeded, then open the candidate search page.

        A logged-in page contains a link to FormsLogout.aspx; its absence
        means authentication failed.
        """
        if not grab.search(u'FormsLogout.aspx'):
            # BUG FIX: the original logged the error but carried on and
            # issued the search request unauthenticated. Abort instead.
            logging.error('Can not log in!')
            return
        url = 'https://recruiter.totaljobs.com/Recruitment/CandidateSearch/CandidateSearch.aspx'
        grab.setup(url=url)
        yield Task('advanced_search', grab=grab)

    def task_advanced_search(self, grab, task):
        """Fill the search filters (update period, industry) and submit."""
        settings = self.settings
        grab.set_input('ctl00$cphCentralPanel$ucSearchPart$ddlUpdated', str(settings.time_limit))
        grab.set_input('ctl00$cphCentralPanel$ucSearchPart$ddlIndustries', str(settings.industry))
        # make_request=False: only build the POST; the Task issues it.
        grab.submit(make_request=False)
        yield Task('candidates', grab=grab)

    def task_candidates(self, grab, task):
        """Scrape candidate cards from one result page and paginate.

        New candidates are saved to the database; anonymous profiles
        (names starting with 'Candidate') and already-known names are
        skipped. Stops the spider once ``settings.limit`` new rows have
        been saved.
        """
        for card in grab.doc.select('//div[@class="row card-row"]'):
            name = card.select('.//a[@class="candidate-lnk"]').text()
            email = card.select('.//a[contains(@class,"email-candidate")]').text()
            if name.startswith('Candidate'):
                # Anonymous profile placeholder — nothing useful to store.
                logging.debug('Noname candidate: %s', name)
                continue
            if Candidate.select().where(Candidate.name == name).exists():
                logging.debug('Candidate: %s already exists', name)
            else:
                self.num += 1
                Candidate(name=name, email=email).save()
            limit = self.settings.limit
            if limit and self.num > limit:
                self.stop()
                logging.debug('Limit %s has been reached. Script is stopping', limit)
        next_page = grab.doc.select('//li[@class="paging-forward"]')
        if next_page:
            # The forward link ends with the target page number, e.g.
            # "javascript:__doPostBack('...','7')" — presumably; verify
            # against a live page if the pattern stops matching.
            link = next_page.select('./a/@href').text()
            match = re.search(r"(\d+)'\)$", link)
            if match is None:
                # BUG FIX: the original dereferenced the match object
                # unconditionally and crashed with AttributeError on an
                # unexpected href format. Log and stop paginating instead.
                logging.error('Can not extract next page number from %r', link)
                return
            data = {
                '__EVENTTARGET': 'ctl00$cphCentralPanel$ucSearchResults$pgrPager',
                '__EVENTARGUMENT': match.group(1),
                'ctl00$cphCentralPanel$NewOrExistingSavedSearch':
                    'rdoNewSavedSearch'
            }
            # Fields that must not be re-posted with the pagination
            # postback (kept identical to the original behavior).
            removed_fields = ('ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlIndustries',
                              'ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlMaxValue',
                              'ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlMinValue',
                              'ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlResidence',
                              'ctl00$cphCentralPanel$ddlExistingSearches',
                              'ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlLanguages',
                              'ctl00$cphCentralPanel$btnSave')
            grab.submit(make_request=False, extra_post=data)
            # Strip the unwanted (field, value) pairs from the prepared post.
            grab.config['post'] = [pair for pair in grab.config['post']
                                   if pair[0] not in removed_fields]
            yield Task('candidates', grab=grab)