usr bin env python coding windows-1251 import os import re import quop

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/env python
# -*- coding: windows-1251 -*-
import os
import re
import quopri
# Extract four captured fields (ordinal, number, title, date) from the HTML of
# every file in a directory and write one formatted line per record to a
# single output file.
#
# Everything below works on bytes: input files are read in binary mode and are
# never decoded, so the patterns and separators are bytes too.  This keeps the
# emitted bytes identical on Python 2 and avoids the str/bytes TypeError the
# earlier str-based version hit on Python 3.
# NOTE(review): captured text is passed through undecoded, so the output is
# presumably windows-1251 only if the input pages are -- confirm.

# Loop-invariant patterns, compiled once instead of once per input file.
ORDINAL_RE = re.compile(br"""<TD\s{1,1}vAlign="?top"?>(\d+?)[.]</TD>""", re.I)
NUMBER_RE = re.compile(br"""\s*?(\d+?)</NOBR>""", re.I)
TITLE_RE = re.compile(
    br"""<A\s*?href="[^"]+?"><FONT\s*?color="?#?000066"?>(.*?)</FONT></A>""",
    re.I,
)
DATE_RE = re.compile(
    br"""<FONT\s*?color="?#?000066"?><NOBR>[(](.*?)[)]</NOBR></FONT></A>""",
    re.I,
)

# U+2116 NUMERO SIGN as the single byte it occupies in windows-1251, the
# source encoding this file declares; spelled as an escape so the emitted byte
# does not depend on how the file happens to be saved.
NUMERO_SIGN = b"\xb9"


def _normalize(raw_text, is_mht):
    """Return *raw_text* decoded (if MHT) and with whitespace collapsed.

    ``.mht`` content is quoted-printable decoded first.  Line separators are
    then removed outright (so the patterns never have to span lines) and every
    remaining whitespace run becomes a single space; each token is followed by
    one space, including the last one.
    """
    text = quopri.decodestring(raw_text) if is_mht else raw_text
    # Strip line breaks so matching works without multi-line regex flags.
    text = text.replace(os.linesep.encode("ascii"), b"")
    # Collapse redundant whitespace.
    return b"".join(token + b" " for token in text.split())


def _extract_records(text):
    """Return (ordinal, number, title, date) tuples found in *text*.

    Each pattern is matched independently and the four result lists are
    zipped, so the record count is that of the shortest list.
    """
    columns = [
        [match.group(1) for match in pattern.finditer(text)]
        for pattern in (ORDINAL_RE, NUMBER_RE, TITLE_RE, DATE_RE)
    ]
    return list(zip(*columns))


def _format_record(record):
    """Render one record as the bytes ``<ordinal>. №<number>: <title> (<date>)\\n``."""
    ordinal, number, title, date = record
    return b"".join(
        [ordinal, b". ", NUMERO_SIGN, number, b": ", title, b" (", date, b")\n"]
    )


def main(directory=".", output_path="extract.txt"):
    """Scan the files directly inside *directory* and write the extract.

    Parameters:
        directory: folder whose immediate files are read (no recursion).
        output_path: file that receives the result; it is overwritten.

    For every input file the formatted records are written, followed by one
    blank line (also when the file yields no records).  The defaults reproduce
    the original script: read the current directory, write ``extract.txt``.
    """
    # Only the first os.walk() entry is needed: the files of the top level.
    file_names = next(os.walk(directory), (directory, [], []))[2]
    output_abs = os.path.abspath(output_path)

    with open(output_path, "wb") as out:
        for file_name in file_names:
            path = os.path.join(directory, file_name)
            # A leftover output file from a previous run must not be parsed as
            # input while it is being rewritten.
            if os.path.abspath(path) == output_abs:
                continue

            with open(path, "rb") as source:
                raw_text = source.read()

            is_mht = os.path.splitext(file_name)[1] == ".mht"
            text = _normalize(raw_text, is_mht)

            for record in _extract_records(text):
                out.write(_format_record(record))
            out.write(b"\n")


if __name__ == "__main__":
    main()