Python
23 Aug 2010
 
 
 
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import gevent
import socket
import urllib2
from Queue import Queue, Empty
import logging
import time
import threading
from urlparse import urlsplit
import urllib
import os
# Presumably the cap on concurrently running worker threads (going by the
# name) -- TODO confirm against the code that consumes it.
THREAD_LIMIT = 100

# Process-wide default timeout, in seconds, for every socket created from now
# on, so that a stalled server cannot block a fetch forever.
socket.setdefaulttimeout(5)
def worker(url, results):
    """Download ``url`` and put ``(name, body)`` on the ``results`` queue.

    ``name`` is ``'<hostname>-<thread name>'``: the hostname parsed from the
    URL joined with the name of the thread running this call. Any exception
    raised along the way is logged with its traceback and swallowed; in that
    case nothing is put on ``results``.

    Args:
        url: Address to fetch.
        results: Queue-like object; only its ``put`` method is used.
    """
    try:
        info = urlsplit(url)
        logging.debug('Fetching %s', url)
        response = urllib.urlopen(url)
        try:
            data = response.read()
        finally:
            # Release the connection even when the read fails part-way.
            response.close()
        tname = threading.current_thread().name
        results.put(('%s-%s' % (info.hostname, tname), data))
    except Exception:
        # Thread boundary: record which URL failed (with traceback) and carry
        # on, so one bad URL cannot abort the whole run.
        logging.exception('Failed to fetch %s', url)
def clean_dumps(dump_dir='dumps'):
    """Make sure ``dump_dir`` exists and holds no files from a previous run.

    Every file (or symlink) directly inside ``dump_dir`` is deleted.
    Sub-directories are skipped because ``os.unlink`` cannot remove them.
    When ``dump_dir`` does not exist yet it is created instead of raising.

    Args:
        dump_dir: Directory to clear. Defaults to ``'dumps'``, resolved
            against the current working directory.
    """
    if not os.path.isdir(dump_dir):
        os.makedirs(dump_dir)
        return
    for fname in os.listdir(dump_dir):
        path = os.path.join(dump_dir, fname)
        if os.path.isdir(path) and not os.path.islink(path):
            continue
        os.unlink(path)
def _save_dump(name, data):
    """Write one fetched body to ``dumps/<name>``."""
    # Binary mode: the payload is the raw response body, so no newline
    # translation must be applied to it.
    with open(os.path.join('dumps', name), 'wb') as dump_file:
        dump_file.write(data)


def _drain_results(results):
    """Write every result currently queued; return how many were written."""
    written = 0
    while True:
        try:
            name, data = results.get_nowait()
        except Empty:
            return written
        logging.debug('Received result')
        _save_dump(name, data)
        written += 1


def main():
    """Fetch every URL listed in ``urls.txt`` and dump each body to ``dumps/``.

    One ``worker`` thread is started per non-blank line of ``urls.txt``, with
    at most ``THREAD_LIMIT`` threads alive at any time. Results are taken off
    a shared queue and written to ``dumps/<name>`` as they arrive. The
    function returns once every URL has been handed to a worker, every worker
    has finished and the queue has been emptied.

    Raises:
        IOError: if ``urls.txt`` cannot be read or a dump file cannot be
            written (for example because ``dumps/`` does not exist).
    """
    with open('urls.txt') as url_file:
        tasks = [line.strip() for line in url_file]
    tasks = [url for url in tasks if url]  # blank lines are not URLs

    results = Queue()
    threads = []
    started = 0
    limit = max(1, THREAD_LIMIT)  # never stall on a non-positive limit

    while True:
        # Forget finished threads, then top the pool back up to the limit.
        threads = [t for t in threads if t.is_alive()]
        while started < len(tasks) and len(threads) < limit:
            t = threading.Thread(target=worker, args=[tasks[started], results])
            t.start()
            threads.append(t)
            started += 1

        # Decide whether the work is over *before* draining: if no worker is
        # alive at this point, everything they produced is already queued, so
        # the drain below cannot miss a late result.
        done = started == len(tasks) and not any(t.is_alive() for t in threads)
        written = _drain_results(results)
        if done:
            break
        if not written:
            # Nothing arrived; avoid spinning while the workers are busy.
            time.sleep(0.1)
if __name__ == '__main__':
    # Script entry point: configure logging, clear old dumps, run the fetch
    # and report how long it took and how many dump files were produced.
    log_format = '%(threadName)s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=log_format)

    # FileHandler opens the file immediately and does not create missing
    # parent directories.
    if not os.path.isdir('logs'):
        os.makedirs('logs')
    handler = logging.FileHandler('logs/session.log', 'w')
    # basicConfig's format only applies to the handler it installs itself;
    # give the file handler the same one so session.log shows thread names.
    handler.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(handler)

    clean_dumps()
    start = time.time()
    main()
    logging.debug('Work done in %.2f seconds', time.time() - start)
    logging.debug('Results: %d', len(os.listdir('dumps')))