from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy_webdriver.http import WebdriverRequest
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from workopolis.items import Node
from scrapy import log
import time
import re
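
# Scrapes job postings from the brassring.com search-openings page: PhantomJS
# clicks the search button, the rendered results table is read row by row,
# and an apply URL is built from each row's jobId.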
class sjobs(Spider):
    name = "sjobs"
    driver = webdriver.PhantomJS('phantomjs')
    start_urls = [
        'https://sjobs.brassring.com/TGWebHost/searchopenings.aspx?partnerid=25957&siteid=5141'
    ]
    url_app = 'https://sjobs.brassring.com/TGWebHost/jobdetails.aspx?jobId=jobcode&type=search&JobReqLang=1&recordstart=1&JobSiteId=5141&JobSiteInfo=71789_5141&GQId=349'
    def parse(self, response):
        driver = self.driver
        driver.get(response.url)
        # Click the search button so the results table gets rendered.
        driver.find_element_by_id("ctl00_MainContent_submit1").click()
        items = []
        try:
            try:
                # Give the results table up to 5 seconds to appear.
                WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.ID, "MTable"))
                )
            except Exception as e:
                log.msg("results table not found yet, still loading", level=log.INFO)
            finally:
                rows = driver.find_elements_by_xpath(
                    '//*[@id="ctl00_MainContent_GridFormatter_datatable"]/table/tbody[2]//tr')
                for tr in rows:
                    item = Node()
                    # The second cell holds the title and a link whose jobId
                    # query parameter identifies the posting.
                    item['title'] = tr.find_element_by_xpath(".//td[2]").text
                    url = tr.find_element_by_xpath(".//td[2]//a").get_attribute('href')
                    item['job_id'] = re.search('jobId=([0-9]+)', url).groups()[0]
                    item['apply_url'] = self.url_app.replace('jobcode', item['job_id'])
                    print item['apply_url']
                    d_url = item['apply_url']
                    #request = WebdriverRequest(d_url, callback=self.parse_details)
                    #request.meta['item'] = item
                    #items.append(request)
        except Exception as e:
            print e
        for item in items:
            yield item
        driver.close()
    def parse_details(self, response):
        tr = Selector(response)
        item = response.meta['item']
        # The detail page labels each field with an id containing the field
        # name, so match on a substring of the id.
        item['title'] = tr.xpath("string(.//*[contains(@id, 'Job Title')])").extract()[0]
        item['department'] = tr.xpath("string(.//*[contains(@id, 'Department')])").extract()[0]
        item['id'] = tr.xpath("string(.//*[contains(@id, 'Auto req ID')])").extract()[0]
        item['update'] = tr.xpath("string(.//*[contains(@id, 'Date updated')])").extract()[0]
        location = tr.xpath("string(.//*[contains(@id, 'Work Location')])").extract()[0]
        if len(location) > 0:
            item['location'] = location
            # The field reads "country - state - city"; split and strip it.
            location = [part.strip() for part in location.split('-')]
            if len(location) == 3:
                item['city'] = location[2]
            if len(location) > 1:
                item['state'] = location[1]
            if len(location) > 0:
                item['country'] = location[0]
        return item
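
# For reference, a minimal sketch of how the jobId extraction and apply-URL
# substitution in parse() behave in isolation. The href below is a made-up
# example shaped like the anchors in the results table, not a real posting.
# (Note that defining the class above already starts a PhantomJS instance.)
if __name__ == '__main__':
    sample_href = ('https://sjobs.brassring.com/TGWebHost/jobdetails.aspx?'
                   'jobId=123456&type=search')
    job_id = re.search('jobId=([0-9]+)', sample_href).groups()[0]
    print sjobs.url_app.replace('jobcode', job_id)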