usr bin python coding utf-8 import codecs re import xml etree cElement

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/python
# coding: utf-8
import codecs, re
import xml.etree.cElementTree as etree

# === part one
# Collect the value of the gr="..." attribute from the lines of mini.xml and
# count how often each pair of consecutive values occurs.
#
# Module-level results, kept under their original names:
#   a -- the collected values in file order, with '=' replaced by ','
#   b -- scratch list of the earlier pairing loop; it always ended up empty
#   c -- one [value, next_value] list per pair of neighbouring values
#   h -- maps u'value next_value' to the number of times that pair occurs
gr_attr = re.compile(r'gr=\"(.*?)\">')

# NOTE: a variant that read the file through
# codecs.open('mini.xml', 'r', 'utf-8') was disabled by the author; the file
# is still opened without an explicit encoding.
# `with` closes the handle even when reading fails part-way through.
with open('mini.xml') as corpus:
    # re.search stops at the first match, so at most one value is taken
    # from each line.
    hits = (gr_attr.search(line) for line in corpus)
    a = [m.group(1).replace(u'=', u',') for m in hits if m is not None]

b = []
c = [[first, second] for first, second in zip(a, a[1:])]

h = {}
for first, second in c:
    k = first + u' ' + second
    # dict.has_key() exists only in Python 2; get() works everywhere.
    h[k] = h.get(k, 0) + 1
# === part two
# Build an <html><body> tree that mirrors the <p>/<se> structure of
# disamb.xml. For the first <w> of every sentence, copy only the <ana> whose
# gr value scores highest against the keys of the bigram table `h` built in
# part one.
with open('disamb.xml') as f:
    root = etree.parse(f)

html = etree.Element('html')
body = etree.SubElement(html, 'body')
for p in root.find('body').findall('p'):
    p1 = etree.SubElement(body, 'p')
    for se in p.findall('se'):
        # Fix: the sentence copy now goes inside its paragraph copy. It used
        # to be attached to <body>, which left every <p> empty.
        se1 = etree.SubElement(p1, 'se')
        se1.text = se.text

        # Look at the first word of the sentence.
        words = se.findall('w')
        if not words:
            # No <w> children: indexing [0] used to raise IndexError here.
            continue
        first_w = words[0]
        anas = first_w.findall('ana')

        # Fix: the output word belongs to the output sentence (se1); it used
        # to be appended to the input element `se`, modifying the parsed
        # document. Its text used to be set to the Element object itself.
        # NOTE(review): the word's text is taken as all text found inside
        # <w> (including text following its <ana> children) -- confirm this
        # matches the layout of disamb.xml.
        w1 = etree.SubElement(se1, 'w')
        w1.text = ''.join(first_w.itertext())

        if not anas:
            # Nothing to choose from: anas[-1] used to raise IndexError.
            continue

        count = []
        for ana in anas:
            # The keys of `h` had '=' replaced by ',', so the looked-up
            # value must be normalised the same way or it can never match.
            gr = ana.get('gr').replace('=', ',')
            # A key scores when it contains the value at a position other
            # than its very start (same rule as the earlier count()/index()
            # pair).
            # NOTE(review): this counts distinct keys, not their frequencies
            # in `h`, and skips keys that start with the value -- confirm
            # that this is the intended rule for a sentence-initial word.
            count.append(sum(1 for key in h if key.find(gr) > 0))

        # max() returns the first maximum, so ties go to the earliest
        # analysis, as before.
        mx_ind, mx = max(enumerate(count), key=lambda pair: pair[1])
        best_ana = anas[mx_ind]
        # Fix: attach to w1; the name `w` was never defined (NameError).
        ana1 = etree.SubElement(w1, 'ana', lex=best_ana.get('lex'), gr=best_ana.get('gr'))
# TODO: the same still has to be done for every subsequent <w>.