import time import json from datetime import datetime from copy import

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import time
import json
from datetime import datetime
from copy import copy
from socket import gethostbyname
from pymongo import UpdateOne
from ioweb import Crawler, Request, CallbackRequest
from ioweb.error import DataNotValid
from project.database import db
# Load the project configuration once, at import time.
# The path is relative to the process working directory. JSON text is UTF-8,
# so the encoding is stated explicitly instead of relying on the platform
# default.
with open('var/config.json', encoding='utf-8') as inp:
    CONFIG = json.load(inp)
class DomainResolveCrawler(Crawler):
    """Resolve domains stored in ``db.domain`` to IP addresses.

    Documents whose ``_status_resolve`` is ``'new'`` are read in ascending
    ``_id`` order, each ``_id`` is resolved with ``socket.gethostbyname``,
    and the outcome is written back through queued ``UpdateOne``
    operations that are flushed with ``bulk_write``.
    """

    # NOTE(review): this first value is immediately overridden by the
    # assignment right below it, so the effective configuration is ``{}``.
    # Both assignments are kept as found; confirm which one is intended.
    dataop_threshold = {
        'domain': {
            'number': 500,
            'size': None,
            #'time': None,
        },
    }
    dataop_threshold = {}

    def task_generator(self):
        """Yield one ``resolve`` request per unresolved domain.

        Pages through ``db.domain`` 1000 documents at a time, sorted by
        ``_id``, and stops as soon as a page comes back empty.

        Yields:
            CallbackRequest: named ``'resolve'``, carrying the domain's
            ``_id`` in ``meta['domain']`` and retried on ``OSError``.
        """
        chunk_size = 1000
        recent_pk = None
        while True:
            query = {'_status_resolve': 'new'}
            if recent_pk:
                # Continue after the last _id that was already yielded.
                query['_id'] = {'$gt': recent_pk}
            domains = list(db.domain.find(
                query, limit=chunk_size, sort=[('_id', 1)]
            ))
            # The original tested an undefined name (``nets``) here, which
            # raised NameError once the first page had been consumed.
            if not domains:
                break
            for domain in domains:
                yield CallbackRequest(
                    name='resolve',
                    network_callback=self.resolve_callback,
                    retry_errors=(OSError,),
                    meta={
                        'domain': domain['_id'],
                    },
                )
                recent_pk = domain['_id']

    def resolve_callback(self, req, res):
        """Resolve ``req.meta['domain']`` and store the result on ``res``.

        The resolved address is saved as ``res.meta['ip']``. A failed
        lookup makes ``gethostbyname`` raise an ``OSError`` subclass, which
        propagates to the caller and leaves ``res.meta['ip']`` unset.
        """
        ip = gethostbyname(req.meta['domain'])
        print(req.meta['domain'], ip)
        res.meta['ip'] = ip

    def handler_resolve(self, req, res):
        """Record a successful resolution for ``req.meta['domain']``.

        Increments the ``domain-ok`` counter and queues an update that
        marks the document as resolved and stores the IP address.
        """
        self.stat.inc('domain-ok')
        op = UpdateOne(
            {'_id': req.meta['domain']},
            {'$set': {
                '_status_resolve': 'ok',
                '_date_resolve': datetime.utcnow(),
                'data': {
                    'ip': res.meta['ip'],
                },
            }},
        )
        self.enq_dataop('domain', op)

    def rejected_resolve(self, req, res):
        """Record a failed resolution for ``req.meta['domain']``.

        Increments the ``domain-fail`` counter (the original bumped
        ``domain-ok`` here, which mixed failures into the success count)
        and queues an update that marks the document as failed together
        with the error text.
        """
        self.stat.inc('domain-fail')
        op = UpdateOne(
            {'_id': req.meta['domain']},
            {'$set': {
                '_status_resolve': 'fail',
                '_date_resolve': datetime.utcnow(),
                '_error_resolve': str(res.error),
                'data': {
                    # ``ip`` is only set by resolve_callback after a
                    # successful lookup, so indexing it directly would
                    # raise KeyError on this path.
                    # NOTE(review): assumes res.meta is dict-like.
                    'ip': res.meta.get('ip'),
                },
            }},
        )
        self.enq_dataop('domain', op)

    def dataop_handler_domain(self, ops):
        """Write the queued ``domain`` operations in one unordered bulk."""
        db.domain.bulk_write(ops, ordered=False)