from pprint import pprint
from urllib.parse import quote_plus, urlsplit, urljoin
import logging
from datetime import datetime
from ioweb import Crawler, Request
from ioweb.error import DataNotValid
from pymongo import UpdateOne
from project.database import db
from project.settings import VK_TOKENS_FILE
from vk.api import (
Api, RemoteApiError, TooManyRequestsError, AccessDeniedError,
PrivateProfileError
)
from vk.config import load_api_tokens
from vk.fields import USER_FIELDS
from project import settings
class ProfilePhotoCrawler(Crawler):
    """Crawl the ``profile`` photo album of users through the VK API.

    User ids are taken from id ranges stored in ``db.task_chunk``; one
    ``photos.get`` request is issued per user and the outcome is written
    back to the matching ``db.user`` document with bulk updates.
    """

    # Settings for the 'profile_photo' data operation queue.
    # NOTE(review): presumably ioweb hands the queued operations to
    # `dataop_handler_profile_photo` once 500 of them pile up -- confirm
    # against the ioweb documentation.
    dataop_threshold = {
        'profile_photo': {
            'number': 500,
        },
    }

    def init_hook(self):
        """Create the API client, configure stats and load the proxy list."""
        self.api = Api(token=load_api_tokens(VK_TOKENS_FILE))
        # NOTE(review): neither of these keys is incremented by this class;
        # possibly copied from a sibling crawler -- confirm they are wanted.
        self.stat.speed_keys = ['user-item', 'user-api-response']
        self.load_proxylist(
            'file', settings.PROXY_LIST_FILE, proxy_type='socks5'
        )

    def setup_request_hook(self, transport, req):
        """Apply the connect/total timeouts to every outgoing request."""
        req.setup(
            connect_timeout=10,
            timeout=30,
        )

    @staticmethod
    def _is_json_response(res):
        """Tell whether the response declares a JSON content type."""
        try:
            content_type = res.headers['content-type']
        except KeyError:
            return False
        # A header container may report a missing header with None instead
        # of KeyError (email.message.Message does); both mean "not JSON".
        return (
            bool(content_type)
            and content_type.startswith('application/json')
        )

    @staticmethod
    def _error_update(status, ex):
        """Build the ``$set`` document recording a failed photo lookup."""
        return {
            '_status_profile_photo': status,
            '_date_profile_photo': datetime.utcnow(),
            'data_profile_photo': None,
            'error_profile_photo': str(ex),
        }

    def handler_photos_get(self, req, res):
        """Process a ``photos.get`` response and queue the user update.

        Access-denied and private-profile API errors are recorded on the
        user document as final statuses; a successful response stores the
        parsed API result.  The update is queued as a 'profile_photo' data
        operation keyed by ``req.meta['uid']``.

        Raises:
            DataNotValid: the response has no status, does not declare a
                JSON content type, or the API reported too many requests.
        """
        if not res.status or not self._is_json_response(res):
            self.stat.inc('error-api-not-json')
            raise DataNotValid
        try:
            api_res = self.api.parse_response(res.json)
        except TooManyRequestsError:
            self.stat.inc('error-api-too-many-req')
            raise DataNotValid
        except AccessDeniedError as ex:
            self.stat.inc('ok-api-access-denied')
            update = self._error_update('error-access-denied', ex)
        except PrivateProfileError as ex:
            self.stat.inc('ok-api-private-profile')
            update = self._error_update('error-private-profile', ex)
        else:
            self.stat.inc('ok-api-photos-get')
            update = {
                '_status_profile_photo': 'ok',
                '_date_profile_photo': datetime.utcnow(),
                'data_profile_photo': api_res,
            }
        self.enq_dataop('profile_photo', UpdateOne(
            {'_id': req.meta['uid']},
            {'$set': update},
        ))

    def dataop_handler_profile_photo(self, ops):
        """Write the queued user updates to ``db.user`` in one bulk call."""
        db.user.bulk_write(ops, ordered=False)

    def iterate_ids_chunks(self, chunk_size):
        """Yield lists of at most `chunk_size` user ids awaiting a crawl.

        Repeatedly claims the 'new' ``profile_photo`` task chunk with the
        lowest ``id_start`` (switching its status to 'active'), then yields
        the ids of users inside the chunk's id range whose
        ``_status_profile_photo`` is ``None`` or 'new'.  Iteration ends
        when no 'new' task chunk is left.
        """
        while True:
            task_chunk = db.task_chunk.find_one_and_update(
                {
                    'name': 'profile_photo',
                    'status': 'new',
                },
                {
                    '$set': {
                        'status_date': datetime.utcnow(),
                        'status': 'active',
                    },
                },
                sort=[('id_start', 1)],
            )
            if not task_chunk:
                break
            self.stat.inc('task-chunk-%s' % task_chunk['name'])
            # Only the ids are used, so do not pull whole user documents
            # into memory.
            cursor = db.user.find(
                {
                    '_id': {
                        '$gte': task_chunk['id_start'],
                        '$lte': task_chunk['id_end'],
                    },
                    '_status_profile_photo': {'$in': [None, 'new']},
                },
                projection={'_id': 1},
            )
            ids = [doc['_id'] for doc in cursor]
            for idx in range(0, len(ids), chunk_size):
                yield ids[idx:idx + chunk_size]

    def build_request_data(self, uid):
        """Build the ``photos.get`` API request for the user's profile album.

        Returns whatever ``self.api.build_request`` produces; the caller in
        this class unpacks it as a ``(url, data)`` pair.
        """
        return self.api.build_request(
            'photos.get',
            owner_id=uid,
            album_id='profile',
        )

    def task_generator(self):
        """Yield a ``photos_get`` request for each user id to crawl.

        Each request carries the user id in ``meta['uid']`` so the handler
        can address the right document.
        """
        for ids_chunk in self.iterate_ids_chunks(10):
            for uid in ids_chunk:
                url, data = self.build_request_data(uid)
                yield Request(
                    name='photos_get',
                    url=url,
                    data=data,
                    meta={'uid': uid},
                )
            # NOTE(review): generation stops after the first chunk of ids,
            # which looks like a debugging leftover -- the remaining users
            # of the claimed task chunk are never requested.  Confirm the
            # intent before removing this `break`.
            break