import time import json from datetime import datetime from copy import

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import time
import json
from datetime import datetime
from copy import copy
from base64 import b64encode
from pymongo import UpdateOne
from ioweb import Crawler, Request
from ioweb.error import DataNotValid
import OpenSSL.crypto
from project.database import db
class PageContentCrawler(Crawler):
    """Crawler that fetches the content of pages stored in ``db.page``.

    Pages whose ``_status_content`` is ``'new'`` are claimed one at a time
    (switched to ``'inuse'``), requested, and finally written back with the
    status ``'ok'`` or ``'fail'`` together with the captured response data.
    """

    # NOTE(review): presumably the number of queued 'page' operations that
    # triggers a flush through dataop_handler_page -- confirm against ioweb.
    # The original inline comment suggests 100 was the previous value.
    dataop_threshold = {
        'page': {
            'number': 5,
        },
    }
    # Upper bound applied to the decoded response text in build_page_data.
    max_body_size = 2 * 1024 * 1024

    def init_hook(self):
        """Set the retry limit and register the request speed counters."""
        self.retry_limit = 2
        self.stat.speed_keys += ['crawler:request-ok', 'crawler:request-fail']

    def task_generator(self):
        """Yield one 'page' request for every page still marked as 'new'.

        Each iteration claims a single document with
        ``find_one_and_update`` (status 'new' -> 'inuse'); the generator
        stops as soon as no such document is found.
        """
        while True:
            page = db.page.find_one_and_update(
                {'_status_content': 'new'},
                {'$set': {'_status_content': 'inuse'}},
            )
            if not page:
                break
            # NOTE(review): verify=False presumably disables TLS certificate
            # verification for the request -- confirm against ioweb.Request.
            yield Request(
                name='page',
                url=page['url'],
                verify=False,
                # The page id travels with the request so that the result
                # handlers know which document to update.
                meta={'pid': page['_id']},
            )

    def build_page_data(self, res):
        """Build the ``data`` sub-document stored for a response.

        Returns a dict with the keys ``text``, ``headers``, ``status``,
        ``cert`` and ``error``. ``text`` is None when the response has no
        body; ``cert`` and ``error`` are None when the response carries
        no certificate or no error.
        """
        body = res.bytes_body
        if body is None:
            text = None
        else:
            # The limit is applied after decoding, so it counts characters
            # of the decoded text rather than raw bytes.
            text = body.decode('utf-8', errors='replace')[:self.max_body_size]
        return {
            'text': text,
            'headers': res.headers,
            'status': res.status,
            # dump_cert itself returns None for a missing/empty certificate.
            'cert': self.dump_cert(res.cert),
            'error': str(res.error) if res.error else None,
        }

    def _enqueue_page_result(self, req, res, status):
        """Queue the update that stores ``status`` and the response data."""
        self.enq_dataop('page', UpdateOne(
            {'_id': req.meta['pid']},
            {'$set': {
                '_status_content': status,
                '_content_date': datetime.utcnow(),
                'data': self.build_page_data(res),
            }},
        ))

    def handler_page(self, req, res):
        """Count the page as 'page-ok' and queue it with the status 'ok'."""
        self.stat.inc('page-ok')
        self._enqueue_page_result(req, res, 'ok')

    def rejected_page(self, req, res):
        """Count the page as 'page-rejected' and queue it with status 'fail'."""
        self.stat.inc('page-rejected')
        self._enqueue_page_result(req, res, 'fail')

    def dataop_handler_page(self, ops):
        """Write the queued 'page' operations in one unordered bulk request."""
        if not ops:
            # pymongo's bulk_write raises InvalidOperation when it is given
            # an empty list of requests, so an empty batch is a no-op here.
            return
        db.page.bulk_write(ops, ordered=False)

    def dump_cert(self, cert_items):
        """Return a base64 string of the first certificate's text dump.

        Only ``cert_items[0]`` is dumped. Returns None when ``cert_items``
        is empty or None.
        """
        if not cert_items:
            return None
        dumped = OpenSSL.crypto.dump_certificate(
            OpenSSL.crypto.FILETYPE_TEXT, cert_items[0]
        )
        return b64encode(dumped).decode('latin')