import re ALL_LINKS_FILE tmp links txt MIXED_LINKS_FILE tmp result txt

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import re
# Input file: read line by line, one link per line.
ALL_LINKS_FILE = "/tmp/links.txt"

# Output file: receives the same links with the special ones spread out.
MIXED_LINKS_FILE = "/tmp/result.txt"

# A link is "special" when its line contains "recaptcha" in any letter case.
RE_SPECIAL_LINK = re.compile(r"recaptcha", flags=re.IGNORECASE)
def _ensure_newline(line):
    """Return *line* terminated by a newline, adding one only if missing."""
    return line if line.endswith('\n') else line + '\n'


def main(all_links_file=None, mixed_links_file=None, special_re=None):
    """Rewrite the link list so special links are spread among ordinary ones.

    The input is read twice. The first pass counts all lines and collects
    the lines matching ``special_re``. The second pass copies every
    non-matching line to the output and, each time the number of lines
    written so far reaches a multiple of ``total // special``, emits one
    collected special link in front of the current line. Special links are
    taken from the end of the collected list; any that remain after the
    second pass are appended to the output in their original order.

    Args:
        all_links_file: Path of the input file, one link per line.
            Defaults to ``ALL_LINKS_FILE``.
        mixed_links_file: Path of the output file (overwritten).
            Defaults to ``MIXED_LINKS_FILE``.
        special_re: Compiled pattern; a line is special when
            ``special_re.search(line)`` matches. Defaults to
            ``RE_SPECIAL_LINK``.

    Returns:
        None. The result is written to ``mixed_links_file`` and two
        progress messages are printed.
    """
    # Defaults are resolved at call time so the module constants stay the
    # single source of truth when main() is called without arguments.
    if all_links_file is None:
        all_links_file = ALL_LINKS_FILE
    if mixed_links_file is None:
        mixed_links_file = MIXED_LINKS_FILE
    if special_re is None:
        special_re = RE_SPECIAL_LINK

    print('Search for all special links')
    special_links = []
    total_count = 0
    with open(all_links_file) as src:
        for line in src:
            total_count += 1
            if special_re.search(line):
                # Normalise the terminator: the last line of the file may
                # lack one and would otherwise be glued to whatever is
                # written after it.
                special_links.append(_ensure_newline(line))
    special_count = len(special_links)
    print('Total links: %d, special links: %d' % (total_count, special_count))

    # Explicit floor division keeps the ratio an integer on Python 2 and 3.
    # With no special links there is nothing to interleave, so the division
    # (which would raise ZeroDivisionError) is skipped altogether.
    ratio = total_count // special_count if special_count else 0

    count = 0  # lines written so far, inserted special links included
    with open(mixed_links_file, 'w') as out:
        with open(all_links_file) as src:
            for line in src:
                if special_re.search(line):
                    continue
                count += 1
                # special_links is only non-empty when ratio >= 1, so the
                # modulo is never evaluated with a zero ratio.
                if special_links and count % ratio == 0:
                    out.write(special_links.pop())
                    count += 1
                out.write(_ensure_newline(line))
        # Every stored line already ends with a newline; write them as-is
        # instead of joining with an extra '\n' (which produced blank lines).
        out.writelines(special_links)
# Run the link mixer only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()