"""Pick one analysis per word using bigram counts of ``gr`` tag strings.

Bigram counts are collected from the ``gr`` attribute values found in
``media1.xml``.  For every ``<w>`` element of ``disamb-1.xml`` (after the
first) the child whose ``gr`` value was seen most often right after the
previous word's ``gr`` value is kept.  The resulting list is printed.

NOTE(review): the ``gr`` values are presumably grammatical tag sets of
annotated words -- confirm against the corpus format.
"""
import codecs
import re

import nltk
from lxml import etree

# Spelling changes applied to the corpus text before tags are extracted.
# '=' becomes ',' so that a whole ``gr`` value is one comma-separated list
# (this is also why the pattern below looks for ``gr,"`` and not ``gr="``);
# the hyphenated tag names lose their hyphen.
_TAG_RENAMES = (
    (u'=', u','),
    (u'S-PRO', u'SPRO'),
    (u'A-PRO', u'APRO'),
    (u'ADV-PRO', u'ADVPRO'),
)

# Matches one ``gr`` attribute value in text already passed through
# _unify_spelling().
_GR_VALUE = re.compile(u'gr,"([A-Za-z0-9,.-]*)">', flags=re.U)


def _unify_spelling(text):
    """Return *text* with every replacement from _TAG_RENAMES applied."""
    for old, new in _TAG_RENAMES:
        text = text.replace(old, new)
    return text


def _sort_tags(gr):
    """Return the comma-separated *gr* with its items in sorted order."""
    return u','.join(sorted(gr.split(u',')))


def _bigram_key(gr):
    """Return the form of a raw ``gr`` attribute used as a bigram key.

    The same normalisation is used for the training tags, so that keys
    built from ``disamb-1.xml`` can actually match the counted bigrams.
    ``None`` (attribute missing) is passed through unchanged.
    """
    if gr is None:
        return None
    return _sort_tags(_unify_spelling(gr))


def _load_bigram_freqs(path):
    """Count bigrams over the sequence of ``gr`` values found in *path*.

    Returns an ``nltk.FreqDist`` keyed by ``(previous_tag, tag)`` pairs of
    normalised tag strings.  The distribution is empty when the file
    contains no matching ``gr`` values.
    """
    # ``with`` closes the file even if reading or decoding fails.
    with codecs.open(path, 'r', 'utf-8') as corpus:
        text = _unify_spelling(corpus.read())
    # Bigrams are taken over the LIST of tag strings; passing a single
    # string to nltk.bigrams would yield character pairs instead.
    tags = [_sort_tags(gr) for gr in _GR_VALUE.findall(text)]
    return nltk.FreqDist(nltk.bigrams(tags))


def _read_candidates(path):
    """Return, for each ``<w>`` element in *path*, its children's ``gr`` values.

    The result is a list with one inner list per ``<w>`` element, in
    document order; an entry is ``None`` where a child has no ``gr``
    attribute.
    """
    doc = etree.parse(path)
    # Iterating an element yields its children (replaces the deprecated
    # getchildren()).
    return [[child.get('gr') for child in word]
            for word in doc.findall('.//w')]


def _disambiguate(arr, fd):
    """Reduce every entry of *arr* after the first to a single candidate.

    Works in place, left to right.  For each word the candidate with the
    highest count of ``(previous word's first candidate, candidate)`` in
    *fd* is kept; ties go to the earliest candidate.  The first entry is
    left untouched because it has no left context.  Words without any
    candidate are skipped, and an empty previous entry counts as "no
    context" (all counts are then 0, so the first candidate is kept).
    """
    for n in range(1, len(arr)):
        candidates = arr[n]
        if not candidates:
            continue
        previous = _bigram_key(arr[n - 1][0]) if arr[n - 1] else None
        counts = [fd[(previous, _bigram_key(gr))] for gr in candidates]
        arr[n] = [candidates[counts.index(max(counts))]]


fd = _load_bigram_freqs('media1.xml')
arr = _read_candidates('disamb-1.xml')
_disambiguate(arr, fd)
# Single-argument call form: prints the same text on Python 2 and 3.
print(arr)