#!/usr/bin/env python
import os
import sys
# Bootstrap: make the project root importable and the current working
# directory, and point Django at its settings module, regardless of where
# the script is launched from.
# NOTE: abspath(__file__ + '/../..') normalizes the trailing '..' segments,
# so dirname() of that yields the directory three levels above this file.
# Renamed from `dir`, which shadowed the builtin of the same name.
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__ + '/../..'))
sys.path.insert(0, PROJECT_ROOT)
os.chdir(PROJECT_ROOT)
os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
import threading
from datetime import datetime, timedelta
import time
import random
import itertools
from feed_grabber.settings import WORKER_COUNT, TIME_TOUCHED_TIMEOUT, USE_PROXY, MAX_TASK_COUNT
from feed_grabber.models import Task, Result
from libpy.parser.yahoo_news import YahooNews
from libpy.parser.digg import Digg
from libpy.threads import Locker
from libpy.test_lock import test_lock
if test_lock('var/feed_grabber.lock'):
print 'Script already running'
sys.exit(1)
proxy_locker = Locker()


class ProxyManager(object):
    """Round-robin proxy provider shared by all worker threads."""

    proxy_list = [
        # proxy list here
    ]

    def __init__(self):
        # Endless iterator cycling over the configured proxies.
        self._cycle = itertools.cycle(self.proxy_list)

    @proxy_locker
    def get(self):
        """Return the next proxy in rotation (serialized by proxy_locker)."""
        return self._cycle.next()


proxy_manager = ProxyManager()
manager_locker = Locker()
class Manager(object):
count = 0
@manager_locker
def get_task(self):
self.count += 1
if self.count > MAX_TASK_COUNT:
print 'Max task count reached'
return None
try:
task = Task.objects.unchecked()[0]
except IndexError:
print 'no'
task = None
else:
task.time_touched = datetime.now()
task.save()
return task
@manager_locker
def save_task(self, task):
task.time_checked = datetime.now()
task.save()
class Worker(threading.Thread):
def process_source(self, task, source, ParserClass, *args, **kwargs):
if USE_PROXY:
proxy = proxy_manager.get()
else:
proxy = None
print '%s: source: %s, processing: %s' % (self.getName(), source, task.query)
resp = ParserClass(proxy=proxy, *args, **kwargs).get(task.query)
if resp.success:
for entry in resp.result['entries']:
if not Result.objects.filter(task=task, source=source,
guid=entry['guid']).count():
#print 'link: %s' % entry['link']
Result(task=task,
source=source,
title=entry['title'],
link=entry['link'],
summary=entry['summary'],
content=entry['content'],
time_created=entry['time_created'],
guid=entry['guid']).save()
else:
print 'source: %s, error: %s' % (source, resp.error)
#error = Error(feed_grabber=feed_grabber,
#message=resp.error,
#property=property,
#body=resp.body,
#url=resp.url)
#error.save()
def run(self):
while True:
task = manager.get_task()
if task:
self.process_source(task, 'yahoo_news', YahooNews)
self.process_source(task, 'digg', Digg)
manager.save_task(task)
time.sleep(2)
else:
break
manager = Manager()
workers = []
for x in xrange(WORKER_COUNT):
worker = Worker()
worker.start()
workers.append(worker)
for worker in workers:
worker.join()
print 'done'