
import urllib
import urllib2
import socket
from Queue import Queue
import time
import re
import threading

def find_key(key, data, unique=True):
    # Return the value(s) stored under `key` in "key:value" lines.
    values = []
    for x in data.splitlines():
        if ':' in x:
            iter_key, value = x.strip().split(':', 1)
            if iter_key == key:
                values.append(value)
    return values[0] if unique else values

def find_value(value, data, unique=False):
    # Return the key(s) whose value part equals `value` in "key:value" lines.
    keys = []
    for x in data.splitlines():
        if ':' in x:
            key, iter_value = x.strip().split(':', 1)
            if iter_value == value:
                keys.append(key)
    return keys[0] if unique else keys
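
# Example of the "key:value" format these helpers parse (illustrative
# data only, not real check-page output):
#
#   >>> data = 'BEGIN\nREMOTE_ADDR:1.2.3.4\nPOST:a:b\nEND'
#   >>> find_key('REMOTE_ADDR', data)
#   '1.2.3.4'
#   >>> find_value('a:b', data)
#   ['POST']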

def check_task(item):
    # Fetch the check page through the proxy and classify the proxy by
    # how much of the client's identity leaks through.
    proxy_handler = urllib2.ProxyHandler({'http': 'http://' + item})
    opener = urllib2.build_opener(proxy_handler)
    post_key = str(time.time()).replace('.', '-')
    post_value = str(time.time()).replace('.', '-')
    post_data = urllib.urlencode({post_key: post_value})
    start_time = time.time()
    try:
        data = opener.open(check_url, post_data).read(3000)
    except Exception:
        result = item, 'BAD', 'Timeout'
    else:
        timeout = '%.2f' % (time.time() - start_time)
        if len(data) < 200 or len(data) > 2000:
            result = item, 'BAD', 'Size'
        elif not data.startswith('BEGIN') or not data.endswith('END'):
            result = item, 'BAD', 'Response'
        elif find_value('%s:%s' % (post_key, post_value), data) != ['POST']:
            # The POST body did not come back intact.
            result = item, 'BAD', 'POST error'
        else:
            keys = find_value(ip, data)
            if keys:
                # Our real IP shows up in headers the proxy forwarded.
                result = item, 'TRANSPARENT', timeout, ','.join(keys)
            else:
                values = (find_key('HTTP_VIA', data, False) +
                          find_key('HTTP_CACHE_CONTROL', data, False))
                if values:
                    # The proxy hides our IP but reveals itself.
                    result = item, 'ANON', timeout
                else:
                    result = item, 'ELITE', timeout
    print ' '.join(result)
    #q.put(result)

def find_proxy(url):
    # Download one proxy-list page and queue every ip:port it contains.
    count = 0
    try:
        data = urllib.urlopen(url).read()
    except Exception:
        pass
    else:
        for proxy in RE_PROXY.findall(data):
            count += 1
            proxy_queue.put(proxy)
    print 'PROXY LIST', url, count

def get_grab_urls():
    # URLs of public proxy-list pages to scrape.
    items = ['http://www.samair.ru/proxy/']
    for x in xrange(1, 5):
        items.append('http://www.samair.ru/proxy/proxy-%02d.htm' % x)
    items.append('http://www.checker.freeproxy.ru/checker/last_checked_proxies.php')
    for x in xrange(0, 5):
        items.append('http://tools.rosinstrument.com/raw_free_db.htm?%d' % x)
    return items

def make_work(callback, tasks, limit):
    # Naive thread pool: keep at most `limit` worker threads running
    # until the task list is drained, then wait for them all.
    while len(tasks):
        add = False
        if not pool:
            add = True
        else:
            # Reap one finished thread per pass to make room.
            for t in pool[:]:
                if not t.isAlive():
                    pool.remove(t)
                    add = True
                    break
        if not add and len(pool) < limit:
            add = True
        if add and len(pool) <= limit:
            task = tasks.pop()
            if not isinstance(task, (list, tuple)):
                task = [task]
            t = threading.Thread(target=callback, args=task)
            t.start()
            pool.append(t)
        time.sleep(0.1)
    for t in pool:
        t.join()

# Host or IP, a colon, then a port (ports go up to 65535, five digits).
RE_PROXY = re.compile(r'([a-zA-Z0-9.]{5,30}:\d{1,5})')
socket.setdefaulttimeout(5)
proxy_queue = Queue()

# Scrape the proxy lists with 5 parallel threads.
#urls = [x.strip() for x in file('grab.list') if 'http' in x]
urls = get_grab_urls()
pool = []
make_work(find_proxy, urls, 5)
items = []
while not proxy_queue.empty():
    items.append(proxy_queue.get())

# Learn our external IP from a direct (unproxied) request, then check
# every harvested proxy with 50 parallel threads.
check_url = 'http://it-omsk.com/check.php'
data = urllib.urlopen(check_url).read()
ip = find_key('REMOTE_ADDR', data)
make_work(check_task, items, 50)
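
The checker depends on a check.php endpoint whose source is not shown here. Judging from check_task and find_key, it must reply with plain text bracketed by BEGIN/END, one KEY:VALUE pair per line (REMOTE_ADDR plus the request's HTTP_* headers, so proxy giveaways like HTTP_VIA are visible), and echo POST fields back as POST:key:value. Below is a minimal sketch of such an endpoint, written as a Python 2 WSGI app purely for illustration; the name check_app and the port are assumptions, not the real check.php.

from wsgiref.simple_server import make_server
from urlparse import parse_qsl

def check_app(environ, start_response):
    # Hypothetical stand-in for check.php: dump REMOTE_ADDR and every
    # HTTP_* variable as "KEY:VALUE" lines bracketed by BEGIN/END.
    lines = ['BEGIN']
    for key, value in sorted(environ.items()):
        if key == 'REMOTE_ADDR' or key.startswith('HTTP_'):
            lines.append('%s:%s' % (key, value))
    # Echo POST fields as "POST:key:value" so the client can verify its
    # POST body passed through the proxy unmodified.
    length = int(environ.get('CONTENT_LENGTH') or 0)
    for key, value in parse_qsl(environ['wsgi.input'].read(length)):
        lines.append('POST:%s:%s' % (key, value))
    lines.append('END')
    start_response('200 OK', [('Content-Type', 'text/plain')])
    return ['\n'.join(lines)]

if __name__ == '__main__':
    make_server('', 8000, check_app).serve_forever()

Requested directly, the response carries a REMOTE_ADDR line from which the main script learns its own IP; requested through a transparent proxy, that same IP reappears in some forwarded header line, which is exactly what find_value(ip, data) detects.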