# coding: utf8
# Python 2 scraper: walks the tiu.ru B2B category tree and prints it to stdout.
import requests
import requests_cache
from urlparse import urlparse  # NOTE(review): imported but never used in this chunk — confirm before removing
import re
from bs4 import BeautifulSoup
# Cache every HTTP response in a local 'tiu' sqlite cache so repeated runs
# (and the recursive crawl) do not re-fetch pages from the network.
requests_cache.install_cache('tiu')
def parseCategory(url, depth = 0, parent_id = 0):
ignore = [u'Производитель', u'Страна производитель']
r = requests.get(url)
soup = BeautifulSoup(r.text, "lxml")
for category in soup.findAll('a', 'b-categories-top__item-name'):
categoryUrl = 'http://tiu.ru' + category['href'].strip()
count = category.findPreviousSibling('span', 'b-categories-top__item-count')
if count:
count = int(re.sub('\D', '', count.text.strip()))
else:
count = 0
print "%s%s\t%d" % ('\t' * depth, category.text.strip(), count)
parseCategory(categoryUrl, depth + 1, parent_id)
categories = soup.find('div', 'b-facet__body_type_categories')
if categories:
for category in categories.findAll('a', 'b-facet__value-link'):
categoryUrl = 'http://tiu.ru' + category['href'].strip()
count = category.findNextSibling('span', 'b-facet__value-count')
if count:
count = int(re.sub('\D', '', count.text.strip()))
else:
count = 0
print "%s%s\t%d" % ('\t' * depth, category.text.strip(), count)
parseCategory(categoryUrl, depth + 1, parent_id)
params = soup.findAll('div', 'b-facet__body_type_attributes')
if not params:
return
for param in params:
paramName = param.find('span', 'h-cursor-pointer').text.strip()
if paramName not in ignore:
print "%s* %s" % ('\t' * depth, paramName)
# Entry point: start the crawl from the tiu.ru B2B root category page.
parseCategory('http://tiu.ru/b2b')