Python
16 May 2011
 

bigbuzzy.ru parser

 
 
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!.env/bin/python
# -*- coding: utf-8 -*-
import re
from datetime import datetime, timedelta
from urlparse import urljoin
from urlparse import urlsplit
from django.conf import settings
from parser.util import log_errors
from parser.models import Error, ErrorType
from parser.base import BaseParser, DealNotFound
class Parser(BaseParser):
    """
    Parser for deals published on bigbuzzy.ru.

    All page access goes through ``self.g``, the fetch/query object that
    is expected to be provided by ``BaseParser``.
    """

    base_url = 'http://bigbuzzy.ru/'
    source_name = 'bigbuzzy.ru'

    # CSS selector matching the link to a deal on a listing page.
    _deal_link_selector = '.homepage-offer .offer-meta h2 a'

    # It seems that all pages can be parsed
    # without logging in to the site, so the login step is disabled:
    #def __init__(self, *args, **kwargs):
        #super(Parser, self).__init__(*args, **kwargs)
        #self.g.go(self.base_url)
        #self.g.set_input('email', settings.BIGBUZZY_USERNAME)
        #self.g.set_input('password', settings.BIGBUZZY_PASSWORD)
        #self.g.submit()

    def iter_cities(self):
        """
        Iterate over available cities.

        Yields dicts with two keys: ``url`` (absolute URL of the city
        page) and ``title`` (stripped link text).
        """
        self.g.go('http://bigbuzzy.ru/moscow/')

        # The current city is listed first because its link is
        # outside the other cities' links.
        links = [self.g.css('a.current_chooser')]
        links.extend(self.g.itercss('#city_chooser li a'))

        # Build every record before yielding: the consumer may navigate
        # self.g between yields, which would change the queried document.
        cities = [
            {
                'url': self.absolute_url(link.get('href')),
                'title': link.text_content().strip(),
            }
            for link in links
        ]
        for city in cities:
            yield city

    def _page_deal_urls(self):
        """Return the absolute deal URLs found on the currently loaded page."""
        return [self.absolute_url(link.get('href'))
                for link in self.g.itercss(self._deal_link_selector)]

    def iter_city_deals(self, city_url):
        """
        Iterate over deals available in the city.

        Loads ``city_url``, then every page referenced from its
        pagination list, and yields one ``{'url': ...}`` dict per deal
        link found.
        """
        self.g.go(city_url)

        # Read everything needed from the first page up front: the
        # consumer may navigate self.g between yields, which would
        # change the queried document.
        deal_urls = self._page_deal_urls()
        page_hrefs = [link.get('href')
                      for link in self.g.itercss('.page-paging-list li a')]

        for deal_url in deal_urls:
            yield {'url': deal_url}

        for href in page_hrefs:
            # NOTE(review): pagination hrefs are appended to the city URL
            # as-is, which presumes they are relative (e.g. "?page=2") --
            # confirm against the live markup.
            self.g.go(city_url + href)
            for deal_url in self._page_deal_urls():
                yield {'url': deal_url}

    def process_deal(self, deal):
        """
        Parse deal's details.

        ``deal`` must contain a ``url`` key. The dict is updated in place
        and returned. ``active`` is always set; for an active deal the
        following keys are set as well: ``expiry``, ``title``,
        ``big_image_url``, ``price``, ``price_real``, ``discount_percent``,
        ``discount`` and ``sold_count``.

        Raises DealNotFound if the deal page responds with HTTP 404.
        """
        now = datetime.now()
        self.g.go(deal['url'])
        if self.g.response.code == 404:
            raise DealNotFound()

        # A deal is considered active when the page has a "buy" button.
        # The matches are materialised so that the check works whether
        # itercss returns a list or a lazy iterator.
        buy_block = list(self.g.itercss('.button-buy-offer'))
        if not buy_block:
            deal['active'] = False
            # Other properties are processed only if the deal is active.
            return deal

        deal['active'] = True

        # Time left is read from the countdown timer on the page.
        try:
            seconds = (int(self.g.css_number('.timer-seconds')) +
                       60 * int(self.g.css_number('.timer-minutes')) +
                       60 * 60 * int(self.g.css_number('.timer-hours')) +
                       60 * 60 * 24 * int(self.g.css_number('.timer-days')))
        except IndexError:
            # If there is no expiry date then just assume two days.
            seconds = 60 * 60 * 24 * 2
        deal['expiry'] = now + timedelta(seconds=seconds)

        deal['title'] = self.g.css_text('.offer-meta h2')
        deal['big_image_url'] = self.absolute_url(
            self.g.css('.offer-picture img').get('src'))

        # Each numeric field falls back to 0 when it cannot be read.
        try:
            deal['price_real'] = self.g.css_number('.offer-price-initial')
        except IndexError:
            deal['price_real'] = 0

        # NOTE(review): the source this was restored from had lost its
        # indentation, so it is unclear whether this assignment belonged
        # to the except branch above. It is set unconditionally here so
        # that the key is always present -- confirm.
        deal['discount'] = 0

        try:
            deal['discount_percent'] = self.g.css_number('.offer-price-discount')
        except IndexError:
            deal['discount_percent'] = 0

        try:
            deal['price'] = self.g.css_number('.offer-price-value')
        except IndexError:
            deal['price'] = 0

        try:
            deal['sold_count'] = self.g.css_number('.offer-coupons-text')
        except (IndexError, ValueError):
            # A tuple is required to catch both exception types.
            deal['sold_count'] = 0

        return deal