#!.env/bin/python
# -*- coding: utf-8 -*-
import re
from datetime import datetime, timedelta
from urlparse import urljoin
from urlparse import urlsplit
from django.conf import settings
from parser.util import log_errors
from parser.models import Error, ErrorType
from parser.base import BaseParser
class Parser(BaseParser):
    """
    Scraper for deals published on vigoda.ru.

    All page access goes through ``self.g`` (the scraping client supplied by
    the base class): ``iter_cities`` lists city sub-sites, ``iter_city_deals``
    lists deal URLs for one city, and ``process_deal`` fills in the details
    of a single deal.
    """

    base_url = 'http://vigoda.ru/'
    source_name = 'vigoda.ru'

    # Inline JavaScript countdown on the deal page; group 1 is the number
    # passed to ``new Date (``, which process_deal treats as a Unix timestamp.
    _EXPIRY_RE = re.compile(r'var dthen = new Date \((\d+)')

    # "<N> rub. instead of <M> rub." price note embedded in the deal heading.
    # Written as a plain u'' literal with doubled backslashes: the pattern
    # text is the same as the equivalent raw literal, but this spelling is
    # accepted by both Python 2 and Python 3.
    _PRICE_NOTE_RE = re.compile(
        u'\\d+\\sруб\\.\\sвместо\\s\\d+ руб\\.', re.U)

    def iter_cities(self):
        """
        Iterate over available cities.

        Loads the Moscow home page and yields one dict per link found in the
        "otherCity" block, with keys ``url`` (made absolute via
        ``self.absolute_url``) and ``title`` (stripped link text).
        """
        self.g.go('http://moscow.vigoda.ru')
        for elem in self.g.xpath('//div[@id="otherCity"]//div[@class="city"]/a'):
            yield {
                'url': self.absolute_url(elem.get('href')),
                'title': elem.text_content().strip(),
            }

    def iter_city_deals(self, city_url):
        """
        Iterate over deals available in the city.

        Algo:
        1) Go to the city's home page at ``city_url``.
        2) If the page has a buy button, yield the main deal; its URL is
           built from the button's ``rel`` attribute. It is possible that
           this button does not exist, in which case no main deal is yielded.
        3) Yield every extra deal listed in the "past-action two-action"
           blocks.

        Every yielded dict has keys ``url`` and ``small_image_url`` (always
        None). URLs are formed by plain concatenation onto ``city_url``.
        """
        self.g.go(city_url)

        # Main deal of the day (optional).
        buy_buttons = self.g.tree.xpath(u'//a[@class="buy-btn"]')
        if buy_buttons:
            yield {
                'url': city_url + ('/?offer_id=%s' % buy_buttons[0].get('rel')),
                'small_image_url': None,
            }

        # Process extra deals
        for action in self.g.tree.xpath('//div[@class="past-action two-action"]'):
            # Raises IndexError if a block has no direct <a> child.
            link = action.xpath('./a')[0]
            yield {
                'url': city_url + link.get('href'),
                'small_image_url': None,
            }

    def process_deal(self, deal):
        """
        Parse deal's details.

        Loads ``deal['url']`` and updates ``deal`` in place. ``active`` is
        always set: True when the page contains a ``.buy-btn`` element,
        False otherwise. Only for active deals the following keys are added:

            expiry, title, big_image_url, price, price_real,
            discount_percent, discount, sold_count

        ``price_real`` falls back to 0 on AttributeError and ``sold_count``
        falls back to 0 on IndexError or ValueError.

        Returns the same ``deal`` dict.

        Raises AttributeError if an active deal page does not contain the
        countdown timestamp.
        """
        self.g.go(deal['url'])
        base_url = 'http://%s' % urlsplit(deal['url']).hostname

        # list() keeps the emptiness check valid whether itercss hands back
        # a list or a lazy iterator (len() on an iterator raises TypeError).
        buy_buttons = list(self.g.itercss('.buy-btn'))
        deal['active'] = bool(buy_buttons)

        # Process other properties only if deal is active
        if not deal['active']:
            return deal

        stamp = self._EXPIRY_RE.search(self.g.response.body).group(1)
        deal['expiry'] = datetime.fromtimestamp(int(stamp))

        # Drop the price note from the heading, keeping only the deal name.
        heading = self.g.css_text('.discount-today-right-in h2')
        deal['title'] = self._PRICE_NOTE_RE.sub('', heading).strip()

        image_src = self.g.css('.large-photo img').get('src')
        deal['big_image_url'] = base_url + image_src

        deal['price'] = self.g.css_number('.price a strong')
        try:
            deal['price_real'] = self.g.css_number('.discount-today .value strong')
        except AttributeError:
            deal['price_real'] = 0

        deal['discount_percent'] = self.g.css_number(
            '.discount-today .your-discount strong')
        deal['discount'] = self.g.css_number('.discount-today .profit strong')

        try:
            deal['sold_count'] = self.g.css_number('.discount-today-left-5 span')
        except (IndexError, ValueError):
            # A tuple is required here: "except IndexError, ValueError" would
            # catch only IndexError and bind it to the name ValueError.
            deal['sold_count'] = 0

        return deal

# Yandex.Metrica (web-page footer text captured with the paste; not part of the module)