usr bin env python import os import sys dir os path dirname os path ab

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
import os
import sys
dir = os.path.dirname(os.path.abspath(__file__ + '/../..'))
sys.path.insert(0, dir)
os.chdir(dir)
os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
import threading
from datetime import datetime, timedelta
import time
import random
import itertools
from feed_grabber.settings import WORKER_COUNT, TIME_TOUCHED_TIMEOUT, USE_PROXY, MAX_TASK_COUNT
from feed_grabber.models import Task, Result
from libpy.parser.yahoo_news import YahooNews
from libpy.parser.digg import Digg
from libpy.threads import Locker
from libpy.test_lock import test_lock
if test_lock('var/feed_grabber.lock'):
print 'Script already running'
sys.exit(1)
proxy_locker = Locker()


class ProxyManager(object):
    """Hands out proxies round-robin to workers; get() is serialized by
    proxy_locker so concurrent threads never advance the cycle twice at once.
    """

    proxy_list = [
        #proxy list here
    ]

    def __init__(self):
        # NOTE(review): itertools.cycle over an empty proxy_list makes the
        # first get() raise StopIteration -- populate proxy_list before
        # enabling USE_PROXY.
        self.generator = itertools.cycle(self.proxy_list)

    @proxy_locker
    def get(self):
        """Return the next proxy in the rotation (thread-safe)."""
        # next() builtin instead of the Python-2-only .next() method:
        # equivalent on Python 2.6+ and forward-compatible with Python 3.
        return next(self.generator)


proxy_manager = ProxyManager()
manager_locker = Locker()
class Manager(object):
count = 0
@manager_locker
def get_task(self):
self.count += 1
if self.count > MAX_TASK_COUNT:
print 'Max task count reached'
return None
try:
task = Task.objects.unchecked()[0]
except IndexError:
print 'no'
task = None
else:
task.time_touched = datetime.now()
task.save()
return task
@manager_locker
def save_task(self, task):
task.time_checked = datetime.now()
task.save()
class Worker(threading.Thread):
def process_source(self, task, source, ParserClass, *args, **kwargs):
if USE_PROXY:
proxy = proxy_manager.get()
else:
proxy = None
print '%s: source: %s, processing: %s' % (self.getName(), source, task.query)
resp = ParserClass(proxy=proxy, *args, **kwargs).get(task.query)
if resp.success:
for entry in resp.result['entries']:
if not Result.objects.filter(task=task, source=source,
guid=entry['guid']).count():
#print 'link: %s' % entry['link']
Result(task=task,
source=source,
title=entry['title'],
link=entry['link'],
summary=entry['summary'],
content=entry['content'],
time_created=entry['time_created'],
guid=entry['guid']).save()
else:
print 'source: %s, error: %s' % (source, resp.error)
#error = Error(feed_grabber=feed_grabber,
#message=resp.error,
#property=property,
#body=resp.body,
#url=resp.url)
#error.save()
def run(self):
while True:
task = manager.get_task()
if task:
self.process_source(task, 'yahoo_news', YahooNews)
self.process_source(task, 'digg', Digg)
manager.save_task(task)
time.sleep(2)
else:
break
manager = Manager()
workers = []
for x in xrange(WORKER_COUNT):
worker = Worker()
worker.start()
workers.append(worker)
for worker in workers:
worker.join()
print 'done'