#!.env/bin/python
# -*- coding: utf-8 -*-
import re
from datetime import datetime, timedelta
from urlparse import urljoin
from urlparse import urlsplit
from django.conf import settings
from parser.util import log_errors
from parser.models import Error, ErrorType
from parser.base import BaseParser, DealNotFound
class Parser(BaseParser):
    """
    Parser for deals published on bigbuzzy.ru.

    Page fetching and CSS queries go through ``self.g`` and URL
    normalisation through ``self.absolute_url``; both are presumably
    provided by ``BaseParser`` (not visible in this file).
    """

    base_url = 'http://bigbuzzy.ru/'
    source_name = 'bigbuzzy.ru'

    # CSS selector matching the link to every deal shown on a listing page.
    deal_link_selector = '.homepage-offer .offer-meta h2 a'

    # It seems that all pages can be parsed
    # without logging in to the site.
    #def __init__(self, *args, **kwargs):
        #super(Parser, self).__init__(*args, **kwargs)
        #self.g.go(self.base_url)
        #self.g.set_input('email', settings.BIGBUZZY_USERNAME)
        #self.g.set_input('password', settings.BIGBUZZY_PASSWORD)
        #self.g.submit()

    def iter_cities(self):
        """
        Iterate over available cities.

        Yields dicts with the keys ``url`` (absolute city URL) and
        ``title`` (city name with surrounding whitespace stripped).
        """
        self.g.go('http://bigbuzzy.ru/moscow/')

        # The current city's link lives outside the list of the other
        # cities, so it is picked up separately and emitted first.
        links = [self.g.css('a.current_chooser')]
        # Snapshot the remaining links before yielding anything, in case
        # the consumer navigates self.g between two yields.
        links.extend(self.g.itercss('#city_chooser li a'))

        for link in links:
            yield {'url': self.absolute_url(link.get('href')),
                   'title': link.text_content().strip(),
                   }

    def iter_city_deals(self, city_url):
        """
        Iterate over deals available in the city.

        Yields ``{'url': <absolute deal url>}`` for every deal found on
        the city page and on each page linked from its pagination block.
        """
        self.g.go(city_url)

        # Collect everything needed from the first page up front, in case
        # the consumer navigates self.g between two yields.
        deal_urls = self._page_deal_urls()
        page_hrefs = [elem.get('href')
                      for elem in self.g.itercss('.page-paging-list li a')]

        for deal_url in deal_urls:
            yield {'url': deal_url}

        for href in page_hrefs:
            if not href:
                # Anchor without an href: nothing to follow.
                continue
            # NOTE(review): pagination hrefs are appended to the city URL
            # as-is, which assumes they are relative to it -- confirm.
            self.g.go(city_url + href)
            for deal_url in self._page_deal_urls():
                yield {'url': deal_url}

    def _page_deal_urls(self):
        "Return absolute URLs of all deals on the currently loaded page."
        return [self.absolute_url(elem.get('href'))
                for elem in self.g.itercss(self.deal_link_selector)]

    def _css_number_or_zero(self, selector, errors=(IndexError,)):
        "Return the number found at `selector`, or 0 if `errors` is raised."
        try:
            return self.g.css_number(selector)
        except errors:
            return 0

    def process_deal(self, deal):
        """
        Parse deal's details.

        Loads ``deal['url']`` and fills the ``deal`` dict in place.
        ``active`` is always set. For an active deal (one with a buy
        button) the following keys are set as well:

            expiry
            title
            big_image_url
            price
            price_real
            discount_percent
            discount
            sold_count

        Returns the same ``deal`` dict.
        Raises ``DealNotFound`` if the deal page responds with 404.
        """
        now = datetime.now()
        self.g.go(deal['url'])
        if self.g.response.code == 404:
            raise DealNotFound()

        # Materialise the result so the emptiness check works whether
        # itercss returns a list or a lazy iterator.
        buy_block = list(self.g.itercss('.button-buy-offer'))
        deal['active'] = bool(buy_block)

        # Process other properties only if deal is active
        if not deal['active']:
            return deal

        try:
            seconds = (int(self.g.css_number('.timer-seconds')) +
                       60 * int(self.g.css_number('.timer-minutes')) +
                       60 * 60 * int(self.g.css_number('.timer-hours')) +
                       60 * 60 * 24 * int(self.g.css_number('.timer-days')))
        except IndexError:
            # If no expiry date then
            # just set two days!
            seconds = 60 * 60 * 24 * 2
        deal['expiry'] = now + timedelta(seconds=seconds)

        deal['title'] = self.g.css_text('.offer-meta h2')
        deal['big_image_url'] = self.absolute_url(
            self.g.css('.offer-picture img').get('src'))

        deal['price_real'] = self._css_number_or_zero('.offer-price-initial')
        # NOTE(review): no absolute discount is read from the page; the
        # value is always stored as 0 -- confirm this is intended.
        deal['discount'] = 0
        deal['discount_percent'] = self._css_number_or_zero(
            '.offer-price-discount')
        deal['price'] = self._css_number_or_zero('.offer-price-value')
        # The exception tuple must be parenthesised: the Python 2 form
        # `except IndexError, ValueError:` catches IndexError only.
        deal['sold_count'] = self._css_number_or_zero(
            '.offer-coupons-text', errors=(IndexError, ValueError))

        return deal
# bigbuzzy.ru parser