# -*- coding: utf-8 -*-
"""Spider that logs in to the totaljobs recruiter site and stores candidates."""
import logging
import random

from grab.spider import Task

from spiders.base import BaseSpider
from database import Candidate

LOGIN_URL = 'https://recruiter.totaljobs.com/login'
SEARCH_URL = ('https://recruiter.totaljobs.com'
              '/Recruitment/CandidateSearch/CandidateSearch.aspx')

# Control-name prefixes shared by the form fields used below.
_CENTRAL_PANEL = 'ctl00$cphCentralPanel$'
_SEARCH_BUILDER = _CENTRAL_PANEL + 'ucSearchBuilder$'
_ADVANCED_SEARCH_BUTTON = (
    _CENTRAL_PANEL + 'ucCandidateSearchQuickSearchPart$btnAdvancedSearch')
_LIMIT_SEARCH_FIELD = _SEARCH_BUILDER + 'chkLimitSearch'
_PAGER_FORWARD = _CENTRAL_PANEL + 'ucSearchResults$pgrPager$btnForward'


class Parser(BaseSpider):
    """Walk login -> search form -> result pages -> candidate pages.

    Reads ``self.settings`` (credentials, search criteria, ``limit``) and
    ``self.num`` (counter of saved candidates); both are expected to be
    provided outside this class, presumably by ``BaseSpider`` -- TODO confirm.
    """

    initial_urls = [LOGIN_URL]

    def task_initial(self, grab, task):
        """Build the login POST from the login page and schedule 'check'."""
        fields = grab.doc.form_fields()
        data = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            # Hidden values are echoed back exactly as the page supplied them.
            '__VIEWSTATE': fields['__VIEWSTATE'],
            'pagebody_0$left_0$txtUsername': self.settings.login,
            'pagebody_0$left_0$txtPassword': self.settings.password,
            'pagebody_0$left_0$btnSubmit': '',
            'elqFormName': 'RecruiterLoginAndRegister',
            'elqSiteID': fields['elqSiteID'],
            'elqDefaultTargetURL': '',
            'elqPost': '',
            'elqCustomerGUID': '',
            'elqCookieWrite': '0',
            'pagebody_0$right_0$chkThirdPartyMarketing': 'on',
        }
        grab.setup(post=data, url=LOGIN_URL)
        yield Task('check', grab=grab)

    def task_check(self, grab, task):
        """Verify the login response and move on to the candidate search page.

        The response must contain 'FormsLogout.aspx'; otherwise an error is
        logged and no further task is scheduled.
        """
        if not grab.search(u'FormsLogout.aspx'):
            logging.error('Can not log in!')
            return
        grab.setup(url=SEARCH_URL)
        yield Task('search', grab=grab)

    def task_search(self, grab, task):
        """Post back the advanced-search button and schedule 'advanced_search'."""
        grab.doc.set_input('__EVENTTARGET', _ADVANCED_SEARCH_BUTTON)
        # make_request=False: only prepare the request, the Task performs it.
        grab.doc.submit(make_request=False)
        yield Task('advanced_search', grab=grab)

    def task_advanced_search(self, grab, task):
        """Fill the advanced search form from settings and schedule 'candidates'."""
        settings = self.settings
        form_values = (
            ('rblSearchBy', settings.search_by),
            ('ddlLocations', settings.location),
            ('txtBoolean', settings.keywords),
            ('ddlIndustries', settings.industry),
            ('ddlResidence', settings.country),
            ('ddlUpdated', settings.time_limit),
            ('rptJobTypesHours$ctl00$ddlJobTypeHours', settings.job_type),
            ('rptJobTypesHours$ctl01$ddlJobTypeHours', settings.job_part),
        )
        for suffix, value in form_values:
            grab.doc.set_input(_SEARCH_BUILDER + suffix, str(value))

        # Salary fields are only posted when a salary option other than '0'
        # is configured.
        extra_post = {}
        if settings.salary != '0':
            extra_post[_SEARCH_BUILDER + 'ddlRate'] = settings.salary
            extra_post[_SEARCH_BUILDER + 'ddlMinValue'] = settings.salary_min
            extra_post[_SEARCH_BUILDER + 'ddlMaxValue'] = settings.salary_max
        grab.doc.submit(make_request=False, extra_post=extra_post)

        if not settings.search_200:
            post = dict(grab.config['post'])
            # pop() instead of del: the field may be missing from the
            # collected form data, and that must not abort the search.
            post.pop(_LIMIT_SEARCH_FIELD, None)
            grab.setup(post=post)
        yield Task('candidates', grab=grab)

    def task_candidates(self, grab, task):
        """Schedule a 'candidate' task per new result row, then the next page.

        Rows whose link text starts with 'Candidate' are skipped as unnamed,
        as are names already present in the ``Candidate`` table.
        """
        for link in grab.doc.select('//td[@class="cellCandidate"]/a'):
            name = link.text()
            if name.startswith('Candidate'):
                logging.debug('Noname candidate: %s', name)
                continue
            if Candidate.select().where(Candidate.name == name).exists():
                logging.debug('Candidate: %s already exists', name)
                continue
            # The link id with '_' replaced by '$' is used as the postback
            # target for opening the candidate page.
            event_target = link.select('@id').text().replace('_', '$')
            grab.doc.set_input('__EVENTTARGET', event_target)
            grab.doc.submit(make_request=False)
            yield Task('candidate', grab=grab, c_name=name)

        if grab.search(u'/Recruitment/img/next.gif'):
            # The forward button is posted with random .x/.y coordinates in
            # the range 1..90.
            x = str(random.randint(1, 90))
            y = str(random.randint(1, 90))
            data = {
                _PAGER_FORWARD + '.x': x,
                _PAGER_FORWARD + '.y': y,
            }
            grab.doc.submit(make_request=False, extra_post=data)
            yield Task('candidates', grab=grab)

    def task_candidate(self, grab, task):
        """Save the candidate's e-mail and stop the spider at the limit.

        Nothing is stored when the page has no e-mail link. ``task.c_name``
        carries the name taken from the result list.
        """
        xpath = '//a[@target="winSendEmail" and contains(text(), "@")]'
        found = grab.doc.select(xpath)
        if not found.exists():
            return

        self.num += 1
        Candidate(name=task.c_name, email=found.text()).save()

        limit = self.settings.limit
        # '>=' so the spider stops as soon as `limit` candidates are saved;
        # '>' would let one extra candidate through. A falsy limit disables
        # the check.
        if limit and self.num >= limit:
            self.stop()
            logging.debug(
                'Limit %s has been reached. Script is stopping', limit)