fetch(pool, func, conn=500): a parallel URL fetcher built on pycurl's CurlMulti interface.

import re
from io import BytesIO

import pycurl


def fetch(pool, func, conn=500):
    """Fetch every URL in `pool` concurrently, using at most `conn`
    reusable pycurl easy handles. `func` is a list of (handler, url_regex)
    pairs; each finished page is passed to every handler whose regex
    matches its URL."""
    num_processed = 0
    num_pool = len(pool)
    conn = min(num_pool, conn)

    # Build the multi handle and a fixed set of reusable easy handles.
    m = pycurl.CurlMulti()
    m.handles = []
    for i in range(conn):
        c = pycurl.Curl()
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 2)
        c.setopt(pycurl.NOSIGNAL, 0)
        c.setopt(pycurl.CONNECTTIMEOUT, 3)
        c.setopt(pycurl.TIMEOUT, 5)
        c.setopt(pycurl.HTTPHEADER, ['User-Agent: Mozilla/5.0 (rv:1.8.1.12) Gecko/20080129 Firefox/2.0.0.14 Windows; U; Windows NT 6.0; en-US; rv:1.8.1.14'])
        m.handles.append(c)

    worker = m.handles[:]  # idle handles available for new URLs
    while num_pool > num_processed:
        # Hand URLs to idle handles, dropping duplicates from the pool.
        while worker and pool:
            url = pool.pop()
            while url in pool:
                pool.remove(url)
                num_pool -= 1
            c = worker.pop()
            c.res = BytesIO()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.WRITEFUNCTION, c.res.write)
            c.url = url
            m.add_handle(c)

        # Drive the transfers until libcurl no longer asks to be re-run.
        while True:
            status, num_handles = m.perform()
            if status != pycurl.E_CALL_MULTI_PERFORM:
                break

        # Collect finished handles and recycle them into the worker list.
        while True:
            num_q, ok_list, err_list = m.info_read()
            for c in ok_list:
                m.remove_handle(c)
                c.data = c.res.getvalue()
                worker.append(c)
                print(c.url)
                # Dispatch the page to every handler whose mask matches.
                for mask in func:
                    if re.findall(mask[1], c.url):
                        mask[0](c)
            for c, errno, errmsg in err_list:
                m.remove_handle(c)
                c.data = c.res.getvalue()
                worker.append(c)
            # Count failures too, or the outer loop would never terminate.
            num_processed += len(ok_list) + len(err_list)
            if num_q == 0:
                break

        # Wait for socket activity (up to one second) before looping again.
        m.select(1.0)
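
A minimal usage sketch. The save_page handler and the URLs here are hypothetical, invented only to show the shape of the (handler, url_regex) pairs that fetch() expects:

def save_page(c):
    # After completion, c.data holds the response body and c.url the
    # URL it was fetched from.
    print('%s: %d bytes' % (c.url, len(c.data)))

urls = [
    'http://example.com/',
    'http://example.org/index.html',
]

# Route every finished page to save_page; a narrower regex such as
# r'\.html$' would restrict the handler to matching URLs only.
fetch(urls, [(save_page, r'.*')], conn=10)

Note that fetch() pops URLs off the list you pass in, so hand it a copy if you need the original afterwards. Because it reuses a fixed set of easy handles rather than creating one per URL, conn bounds both the number of open sockets and the memory footprint.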