# -*- coding: utf-8 -*-
"""Spider that logs in to recruiter.totaljobs.com and harvests candidate
names/emails from the candidate-search result pages into the ``Candidate``
table, following pagination until exhausted or ``settings.limit`` is hit."""
from spiders.base import BaseSpider
from grab.spider import Task
from grab import Grab
import logging
import random
import re
import json
from database import Candidate

# Extracts the target page number from the paginator link's
# javascript __doPostBack href, e.g. "...,'Page$3')" -> "3".
# NOTE: raw string — the original used '(\d+)\'\)$', which emits a
# DeprecationWarning/SyntaxWarning for the invalid escape on modern Python.
_PAGE_NUM_RE = re.compile(r"(\d+)'\)$")


class Parser(BaseSpider):
    # Entry point: the ASP.NET login form page.
    initial_urls = ['https://recruiter.totaljobs.com/login']

    def task_initial(self, grab, task):
        """Fill and submit the login form.

        Reads the hidden ASP.NET fields (__VIEWSTATE, elqSiteID) from the
        fetched form and posts them back together with the credentials from
        ``self.settings``. Yields a 'check' task with the prepared POST.
        """
        grab.setup(debug_post=True)
        grab.setup(log_dir='log')
        fields = grab.doc.form_fields()
        url = 'https://recruiter.totaljobs.com/login'
        data = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': fields['__VIEWSTATE'],
            'pagebody_0$left_0$txtUsername': self.settings.login,
            'pagebody_0$left_0$txtPassword': self.settings.password,
            'pagebody_0$left_0$btnSubmit': '',
            'elqFormName': 'RecruiterLoginAndRegister',
            'elqSiteID': fields['elqSiteID'],
            'elqDefaultTargetURL': '',
            'elqPost': '',
            'elqCustomerGUID': '',
            'elqCookieWrite': '0',
            'pagebody_0$right_0$chkThirdPartyMarketing': 'on',
        }
        grab.setup(post=data, url=url)
        yield Task('check', grab=grab)

    def task_check(self, grab, task):
        """Verify the login succeeded, then move on to the search page.

        The logout link ('FormsLogout.aspx') is only present when we are
        authenticated; if it is missing, log the failure and abort instead
        of crawling on unauthenticated (the original continued regardless,
        which could only produce garbage results).
        """
        if not grab.doc.text_search(u'FormsLogout.aspx'):
            logging.error('Can not log in!')
            return  # bug fix: do not proceed to the search page unauthenticated
        url = ('https://recruiter.totaljobs.com/Recruitment/'
               'CandidateSearch/CandidateSearch.aspx')
        grab.setup(url=url)
        yield Task('advanced_search', grab=grab)

    def task_advanced_search(self, grab, task):
        """Submit the advanced-search form restricted by the configured
        'updated within' period (``settings.time_limit``)."""
        settings = self.settings
        grab.doc.set_input(
            'ctl00$cphCentralPanel$ucSearchPart$ddlUpdated',
            str(settings.time_limit))
        grab.doc.submit(make_request=False)
        yield Task('candidates', grab=grab)

    def task_candidates(self, grab, task):
        """Scrape one result page of candidate cards, then paginate.

        For each card: skip anonymous 'Candidate…' entries, skip names
        already stored, otherwise save a new ``Candidate`` row.  Stops the
        spider once ``settings.limit`` newly-saved candidates is exceeded.
        Pagination is done by re-posting the ASP.NET form with the
        __doPostBack event target/argument taken from the 'next' link.
        """
        xpath = '//div[@class="row card-row"]'
        for s in grab.doc.select(xpath):
            name_xpath = './/a[@class="candidate-lnk"]'
            email_xpath = './/a[contains(@class,"email-candidate")]'
            name = s.select(name_xpath).text()
            email = s.select(email_xpath).text()
            if name.startswith('Candidate'):
                # Anonymised profile — no usable name.
                logging.debug('Noname candidate: %s' % name)
                continue
            if Candidate.select().where(Candidate.name == name).exists():
                logging.debug('Candidate: %s already exists' % name)
            else:
                # NOTE(review): assumes self.num is initialised by BaseSpider
                # before the first 'candidates' task — confirm.
                self.num += 1
                c = Candidate(name=name, email=email)
                c.save()
                limit = self.settings.limit
                if limit and self.num > limit:
                    self.stop()
                    logging.debug(
                        'Limit %s has been reached. Script is stopping' % limit)
                    # bug fix: the original kept iterating and still yielded
                    # the next pagination task after stop().
                    return

        next_page = grab.doc.select('//li[@class="paging-forward"]')
        if next_page:
            link = next_page.select('./a/@href').text()
            next_page_num = _PAGE_NUM_RE.search(link)
            if next_page_num is None:
                # bug fix: the original dereferenced .group(1) unguarded and
                # would crash the whole crawl on an unexpected href format.
                logging.error('Can not parse next page link: %s' % link)
                return
            data = {
                '__EVENTTARGET':
                    'ctl00$cphCentralPanel$ucSearchResults$pgrPager',
                '__EVENTARGUMENT': str(next_page_num.group(1)),
                'ctl00$cphCentralPanel$NewOrExistingSavedSearch':
                    'rdoNewSavedSearch',
            }
            # Fields the server rejects when echoed back verbatim.
            removed_fields = (
                'ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlIndustries',
                'ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlMaxValue',
                'ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlMinValue',
                'ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlResidence',
                'ctl00$cphCentralPanel$ddlExistingSearches',
                'ctl00$cphCentralPanel$ucSearchResults$ucRefineSearch$ddlLanguages',
                'ctl00$cphCentralPanel$btnSave')
            grab.doc.submit(make_request=False, extra_post=data)
            # Strip the unwanted fields from the prepared post payload.
            # (renamed loop variable: the original shadowed builtin `tuple`)
            grab.config['post'] = [
                pair for pair in grab.config['post']
                if pair[0] not in removed_fields
            ]
            yield Task('candidates', grab=grab)