from lxml import etree import re codecs nltk f1 codecs open media1 xml

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from lxml import etree
import re, codecs, nltk
def _normalize_tag(tag):
    """Return the canonical form of a grammatical tag string.

    The same rewriting is applied to every tag that is used as a bigram key:
    '=' becomes ',', the hyphenated PRO tags lose their hyphen, and the
    comma-separated parts are sorted.  The operation is idempotent, so it is
    safe to apply it to text that has already been rewritten.

    Returns None unchanged so that elements without a 'gr' attribute can
    still be looked up (they simply never match a known bigram).
    """
    if tag is None:
        return None
    tag = tag.replace(u'=', u',')
    tag = tag.replace(u'S-PRO', u'SPRO')
    tag = tag.replace(u'A-PRO', u'APRO')
    tag = tag.replace(u'ADV-PRO', u'ADVPRO')
    return u','.join(sorted(tag.split(u',')))


def _pick_analyses(candidates, freq):
    """Reduce each candidate list to the analysis best matching its predecessor.

    candidates -- list of lists of raw 'gr' strings, one inner list per word.
    freq       -- mapping from (previous_tag, tag) pairs of normalised tags to
                  counts; missing pairs must yield 0 (as nltk.FreqDist does).

    The list is modified in place and also returned.  The first entry is never
    reduced; every later entry is replaced by a one-element list holding the
    candidate whose bigram with the first tag of the previous entry has the
    highest count (the earliest candidate wins ties).  Entries that are empty,
    or that follow an empty entry, are left untouched instead of raising.
    """
    for pos in range(1, len(candidates)):
        before = candidates[pos - 1]
        options = candidates[pos]
        if not before or not options:
            # Nothing to condition on / nothing to choose from.
            continue
        prev_key = _normalize_tag(before[0])
        # Keys are normalised exactly like the training tags; the raw strings
        # are what ends up in the result.
        scores = [freq[(prev_key, _normalize_tag(opt))] for opt in options]
        candidates[pos] = [options[scores.index(max(scores))]]
    return candidates


# Read the first XML file as text.  The with-block closes the handle even if
# reading fails.  The replacements must happen before the regex below, which
# relies on 'gr=' having been rewritten to 'gr,'.
with codecs.open('media1.xml', 'r', 'utf-8') as f1:
    txt1 = f1.read().replace(u'=', u',').replace(u'S-PRO', u'SPRO').replace(u'A-PRO', u'APRO').replace('ADV-PRO', 'ADVPRO')

doc = etree.parse('disamb-1.xml')

# Every tag string found in the text, in document order.
m = re.findall(u'gr,"([A-Za-z0-9,.-]*)">', txt1, flags = re.U)

# Count bigrams over the SEQUENCE of normalised tags.  (Feeding a single tag
# string to nltk.bigrams would produce character pairs, which can never match
# the whole-tag pairs looked up in _pick_analyses.)
fd = nltk.FreqDist(nltk.bigrams([_normalize_tag(tag) for tag in m]))

# For every <w> element collect the 'gr' attribute of each of its children.
spisok = doc.findall('.//w')
arr = [[child.get('gr') for child in word] for word in spisok]

_pick_analyses(arr, fd)

# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(arr)