from grab spider import Spider Task from grab import Grab from pprint

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from grab.spider import Spider, Task
from grab import Grab
from pprint import pprint
from project.database import db
class BaseBookingSpider(object):
    """URL helpers for booking.com country, city and district pages.

    The ``parse_*_id`` methods cut an identifier out of a page URL and the
    ``build_*_url`` methods produce a page URL from such an identifier.
    None of the methods read instance state.
    """

    def _extract_id(self, url, marker):
        """Return the text that follows *marker* in *url*, up to the first dot.

        For example, with marker ``'/country/'`` the URL
        ``'http://www.booking.com/country/us.en-us.html'`` gives ``'us'``.

        Raises:
            IndexError: if *marker* does not occur in *url*.
        """
        return url.split(marker)[1].split('.')[0]

    def parse_country_id(self, url):
        """Return the identifier found after ``/country/`` in *url*."""
        return self._extract_id(url, '/country/')

    def parse_city_id(self, url):
        """Return the identifier found after ``/city/`` in *url*.

        Everything between ``/city/`` and the first dot is returned, so any
        intermediate path segments are part of the identifier.
        """
        return self._extract_id(url, '/city/')

    def build_country_url(self, cid):
        """Return the country page URL for country identifier *cid*."""
        return 'http://www.booking.com/country/%s.html' % cid

    def strip_url_query(self, url):
        """Return *url* without its query string (text from the first ``?``)."""
        return url.split('?')[0]

    def build_city_url(self, city_id, country_id=None):
        """Return the city page URL for *city_id*.

        Args:
            city_id: identifier as returned by ``parse_city_id``.
            country_id: optional country identifier. When given and
                *city_id* does not already start with ``country_id + '/'``,
                it is prepended as a path segment. The parameter exists
                because callers may pass the country id as a second
                argument; without it such a call raises ``TypeError``.
                NOTE(review): assumes the country segment of a city URL
                equals the country id -- confirm against real URLs.
        """
        if country_id is not None:
            prefix = '%s/' % country_id
            if not city_id.startswith(prefix):
                city_id = prefix + city_id
        return 'http://www.booking.com/city/%s.html' % city_id

    def parse_district_id(self, url):
        """Return the identifier found after ``/district/`` in *url*."""
        return self._extract_id(url, '/district/')
class GeoSpider(BaseBookingSpider, Spider):
    """Spider that follows booking.com destinations -> country -> city pages.

    Country documents are saved to ``db.country`` and city documents
    (including the ids of their districts) to ``db.city``.
    """

    base_url = 'http://www.booking.com/'

    def task_generator(self):
        """Yield the initial task for the destinations index page."""
        yield Task('destinations',
                   url='http://www.booking.com/destination.en-us.html')

    def prepare(self):
        """Drop the ``country`` and ``city`` collections.

        NOTE(review): presumably called by ``Spider`` before crawling
        starts, so every run begins with empty collections -- confirm.
        """
        db.country.drop()
        db.city.drop()

    def task_destinations(self, grab, task):
        """Save a country found on the destinations page and schedule it.

        Yields a ``country`` task carrying the saved country document.
        """
        for elem in grab.doc('//h4/following-sibling::'
                             'div[@class="flatList"]/a'):
            url = self.strip_url_query(elem.attr('href'))
            country = {
                '_id': self.parse_country_id(url),
                'name': elem.text(),
            }
            country['url'] = self.build_country_url(country['_id'])
            db.country.save(country)
            yield Task('country', url=url, country=country)
            # NOTE(review): stops after the first matched country; looks
            # like a debugging limit -- confirm before removing.
            break

    def task_country(self, grab, task):
        """Schedule a ``city`` task for a city linked from a country page."""
        for elem in grab.doc('//h3[contains(text(), "Cities in")]'
                             '/following-sibling::table[1]'
                             '//a[contains(@href, "/city/")]'):
            # parse_city_id() returns everything between "/city/" and the
            # first dot, so the id alone is enough to rebuild the URL.
            # Passing the country id as a second positional argument does
            # not match build_city_url(self, city_id) and raises TypeError.
            city_id = self.parse_city_id(elem.attr('href'))
            url = self.build_city_url(city_id)
            yield Task('city', url=url, country=task.country)
            # NOTE(review): stops after the first matched city; looks like
            # a debugging limit -- confirm before removing.
            break

    def parse_districts(self, grab):
        """Return a list of ``{'_id': district_id}`` dicts.

        One entry is produced per district link found in the first table
        after the "Districts" heading of the current page.
        """
        result = []
        for elem in grab.doc('//h3[contains(text(), "Districts")]'
                             '/following-sibling::table[1]'
                             '//a[contains(@href, "/district/")]'):
            district = {
                '_id': self.parse_district_id(elem.attr('href')),
            }
            result.append(district)
        return result

    def task_city(self, grab, task):
        """Build the city document from a city page, save and print it."""
        city = {
            '_id': self.parse_city_id(task.url),
            'country_id': task.country['_id'],
            'name': grab.doc('//div[@id="breadcrumb"]'
                             '/div[last()]/text()').text(),
            'booking_id': int(grab.doc.rex_text("b_ufi : '([^']+)',")),
            # Raw string: "\." is an invalid escape in a normal literal.
            # The value is stored as the captured text, not as a number.
            'lat': grab.doc.rex_text(r'booking\.env\.b_map_center_latitude'
                                     ' = ([^;]+)'),
            'districts': self.parse_districts(grab),
        }
        db.city.save(city)
        pprint(city)