from pprint import pprint
import gzip
from urllib.parse import urlsplit
from collections import defaultdict
from ioweb import Crawler, Request, DataNotValid, Response
from project.database import db
from project import settings
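
# First two bytes of any gzip stream; used below to detect compressed sitemaps.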
GZIP_MAGIC_NUMBER = b'\x1f\x8b'


class SitemapDownloadCrawler(Crawler):
    def init_hook(self):
        # Extra counters to track in the crawler's speed stats.
        self.stat.speed_keys += ['org-ok', 'org-not-found']
        if settings.PROXYLIST:
            self.load_proxylist(**settings.PROXYLIST)
        self.error_logger.add_handler('file')

    def setup_request_hook(self, transport, req):
        # Apply a 30-second timeout to every outgoing request.
        req.setup(timeout=30)

    def get_sitemap_type(self, res):
        # Classify the downloaded document: a plain-text list of URLs,
        # a <urlset> sitemap, or a <sitemapindex> of nested sitemaps.
        if res.headers and 'text/plain' in res.headers.get('content-type', ''):
            return 'text-urls'
        elif res.css('urlset'):
            return 'urls'
        elif res.css('sitemapindex'):
            return 'index'
        else:
            return None

    def handler_sitemap(self, req, res):
        if res.status == 404:
            self.stat.inc('sitemap-not-found')
            db.sitemap.update_one(
                {'_id': req.meta['id']},
                {'$set': {
                    '_status_check': 'not-found',
                }},
            )
        elif (
            not res.data.startswith(GZIP_MAGIC_NUMBER)
            and 'https://yandex.ru/captcha/voice' in res.text
        ):
            # Uncompressed body containing a Yandex captcha page:
            # reject the response.
            self.stat.inc('error-captcha')
            raise DataNotValid
        else:
            if res.data.startswith(GZIP_MAGIC_NUMBER):
                # Compressed sitemap: decompress it into a fresh Response.
                norm_res = Response()
                try:
                    norm_res.write_bytes_body(gzip.decompress(res.data))
                except EOFError:
                    self.stat.inc('error-gzip-fail')
                    raise DataNotValid
            else:
                norm_res = res
            try:
                stype = self.get_sitemap_type(norm_res)
            except Exception as ex:
                if 'ElementTree not initialized, missing root' in str(ex):
                    # Save the unparseable document for later debugging.
                    with open('var/tree.html', 'wb') as out:
                        out.write(norm_res.data)
                raise
            if not stype:
                self.stat.inc('sitemap-no-index-tag')
                raise DataNotValid
            else:
                self.stat.inc('sitemap-%s' % stype)
                urls = []
                if stype == 'index':
                    # Sitemap index: collect the <loc> of each nested sitemap
                    # and register it in the queue collection if it is new.
                    for elem in norm_res.css('sitemapindex sitemap loc'):
                        self.stat.inc('sitemap-link-found')
                        url = elem.text()
                        urls.append(url)
                        db.sitemap.update_one(
                            {'_id': url},
                            {'$setOnInsert': {
                                '_status_check': 'new',
                            }},
                            upsert=True,
                        )
                elif stype == 'urls':
                    for elem in norm_res.css('urlset loc'):
                        urls.append(elem.text())
                elif stype == 'text-urls':
                    urls = norm_res.text.splitlines()
                else:
                    raise Exception('Invalid sitemap type: %s' % stype)
                prefix_reg = defaultdict(int)
                for url in urls:
                    url_items = urlsplit(url)
                    prefix = '%s/%s' % (
                        url_items.netloc,
                        url_items.path.lstrip('/').split('/')[0],
                    )
                    prefix_reg[prefix] += 1
                #pprint(list(prefix_reg.items()))
                self.stat.inc('url-item-%s' % stype, len(urls))
                db.sitemap.update_one(
                    {'_id': req.meta['id']},
                    {'$set': {
                        '_status_check': 'ok',
                        'sitemap_type': stype,
                        'content': res.data,
                        'num_urls': len(urls),
                        'prefix_reg': list(prefix_reg.items()),
                    }},
                )

    def task_generator(self):
        query = {
            '_status_check': 'new',
        }
        while True:
            # Atomically claim the next unprocessed sitemap document so that
            # concurrent workers do not pick up the same one twice.
            obj = db.sitemap.find_one_and_update(
                query,
                {'$set': {
                    '_status_check': 'inuse',
                }},
            )
            if not obj:
                break
            else:
                yield Request(
                    name='sitemap',
                    url=obj['_id'],
                    meta={
                        'id': obj['_id'],
                    },
                )
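

# --- Illustrative sketch, not part of the crawler itself ---
# The block below only demonstrates the prefix aggregation performed in
# handler_sitemap(): each URL is reduced to "<host>/<first path segment>"
# and the occurrences of every prefix are counted. The sample URLs are made
# up for demonstration; the block reuses the module-level imports above.
if __name__ == '__main__':
    sample_urls = [
        'https://example.com/catalog/item-1',
        'https://example.com/catalog/item-2',
        'https://example.com/news/2020/01/post',
    ]
    demo_reg = defaultdict(int)
    for url in sample_urls:
        url_items = urlsplit(url)
        prefix = '%s/%s' % (
            url_items.netloc,
            url_items.path.lstrip('/').split('/')[0],
        )
        demo_reg[prefix] += 1
    # Expected output: {'example.com/catalog': 2, 'example.com/news': 1}
    pprint(dict(demo_reg))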