#!/usr/bin/python2
# coding: utf-8
"""Write scrambled copies of a LIFT file until a requested size is reached.

The script asks for a size in megabytes, parses ``filename``, and writes
``new_NNN_<filename>`` copies in which letters inside words of the text
nodes have been randomly swapped.  The number of copies is the requested
size divided by the UTF-8 size of the serialized source document.
"""
import codecs
from copy import deepcopy
from random import choice

from lxml import etree

filename = "alutor.lift"
# Requested total output size in bytes (1 MB is taken as 1,000,000 bytes).
TOTAL_SIZE = float(raw_input("Input amount of MB: ")) * 1000000

# Characters that are allowed to change places inside a word.
letters = u'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-'
# The lowercase Cyrillic run previously lacked 'в', 'э', 'ю' and 'я'
# (typos in the literal), so those letters were never scrambled.
letters += u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
letters += u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'
letters += u'jəɣŋ'
# Set form of ``letters`` for constant-time membership tests.
_LETTER_SET = frozenset(letters)


def morph_word(word):
    """Return *word* with up to three random pairs of letters swapped.

    Only positions holding a character from ``letters`` take part, and
    index 0 is always excluded, so the first character stays in place.
    Words with no eligible position are returned unchanged.
    """
    positions = [idx for idx, char in enumerate(word)
                 if idx and char in _LETTER_SET]
    if not positions:
        return word
    for _ in xrange(3):
        i = choice(positions)
        j = choice(positions)
        if i > j:
            i, j = j, i
        if i != j:
            # Swap the characters at i and j (i < j is guaranteed here).
            word = (word[:i] + word[j] + word[i + 1:j]
                    + word[i] + word[j + 1:])
    return word


def morph_sentence(text):
    """Scramble every space-separated word of *text*.

    Words are split on single spaces, passed through ``morph_word`` and
    re-joined; leading and trailing whitespace of the result is stripped.
    """
    return ' '.join(morph_word(word) for word in text.split(' ')).strip()


def _morph_text(text_el):
    """Scramble ``text_el.text`` in place; do nothing if absent or empty."""
    if text_el is not None and text_el.text:
        text_el.text = morph_sentence(text_el.text)


def _morph_first_form(parent):
    """Scramble the <text> of the first <form> child of *parent*, if any."""
    form = parent.find('form')
    if form is not None:
        _morph_text(form.find('text'))


def _morph_lift(lift):
    """Scramble header field descriptions and entry texts of *lift* in place."""
    # <header>/<fields> may be absent; skip them instead of failing on None.
    header = lift.find('header')
    fields = header.find('fields') if header is not None else None
    if fields is not None:
        for field in fields.findall('field'):
            _morph_first_form(field)

    for entry in lift.findall('entry'):
        for child in entry:
            if child.tag == 'sense':
                # Inside a sense, each child may carry a direct <text>
                # and any number of <form>/<text> pairs.
                for part in child:
                    _morph_text(part.find('text'))
                    for form in part.findall('form'):
                        _morph_text(form.find('text'))
            else:
                _morph_first_form(child)


with codecs.open(filename, 'r', 'utf-8') as f:
    tree = etree.parse(f)
old_lift = tree.getroot()

# Output is written as UTF-8, so size the copies in encoded bytes rather
# than in characters; otherwise non-ASCII data overshoots the request.
source_bytes = len(
    etree.tostring(old_lift, encoding='unicode').encode('utf-8'))

for i in xrange(int(TOTAL_SIZE // source_bytes)):
    lift = deepcopy(old_lift)
    _morph_lift(lift)
    with codecs.open('new_%03i_%s' % (i + 1, filename), 'w', 'utf-8') as f:
        f.write(etree.tostring(lift, encoding='unicode'))