usr bin python2 coding utf-8 import codecs from lxml import etree from

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/python2
# coding: utf-8
import codecs
from lxml import etree
from copy import deepcopy
from random import choice
# Size budgets; the script compares them against tag_size() results.
FILE_SIZE = 1 * 1000 * 1000  # 1 MB - size limit of a single output file
TOTAL_SIZE = 14.5 * 1000 * 1000  # 14.5 MB - size limit of all output files together
FILENAME = "alut-txt-all.flextext"

# Characters that morph_word() is allowed to move around inside a word.
letters = u'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-'
# Full Russian alphabet in both cases. The previous literal lacked the
# lowercase letters 'в', 'э', 'ю' and 'я', so they were never scrambled.
letters += u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
letters += u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'
def morph_word(word):
    """Return *word* with some of its letters randomly swapped.

    Only positions holding a character from ``letters`` take part, and
    position 0 is always excluded, so the first character stays in place.
    Up to three swaps of two randomly chosen positions are performed; a
    draw that picks the same position twice swaps nothing.
    """
    positions = [idx for idx, char in enumerate(word)
                 if char in letters and idx > 0]
    if not positions:
        return word
    for _ in range(3):
        first = choice(positions)
        second = choice(positions)
        if first == second:
            continue
        lo, hi = min(first, second), max(first, second)
        # Rebuild the string with the characters at lo and hi exchanged.
        word = word[:lo] + word[hi] + word[lo + 1:hi] + word[lo] + word[hi + 1:]
    return word
def morph_sentence(text):
    """Scramble every space-separated word of *text* with morph_word().

    The scrambled words are joined back with single spaces and the result
    is stripped of leading and trailing whitespace.
    """
    scrambled = [morph_word(chunk) for chunk in text.split(' ')]
    return ' '.join(scrambled).strip()
def tag_size(tag):
    """Return the length of *tag* serialized to a unicode XML string.

    The value is the number of characters in the serialized text, not the
    number of bytes it would take once encoded.
    """
    serialized = etree.tostring(tag, encoding='unicode')
    return len(serialized)
# Parse the source .flextext XML; the ``with`` block closes the file even
# when parsing fails.
with codecs.open(FILENAME, 'r', 'utf-8') as f:
    tree = etree.parse(f)
document = tree.getroot()
# NOTE(review): the former ``new_document = deepcopy(document)`` was never
# read anywhere in the script and only duplicated the whole tree in memory,
# so it has been dropped.

# Output state shared by the copy and multiply loops.
current_document = etree.Element('document', version="2")  # file being filled
cur_total_size = 0  # accumulated size of the files already written
cur_number = 1  # sequence number used in the next output file name
# Copy the source texts unchanged, starting a new output file whenever the
# current one would grow past FILE_SIZE.
for interlinear_text in document.findall('interlinear-text'):
    text_size = tag_size(interlinear_text)
    # Never flush an empty document: a single text larger than FILE_SIZE
    # would otherwise produce an output file with no texts in it.
    if len(current_document) and tag_size(current_document) + text_size > FILE_SIZE:
        with codecs.open('new_%03i_%s'%(cur_number,FILENAME), 'w', 'utf-8') as out:
            out.write(etree.tostring(current_document, encoding='unicode'))
        # Account for what was actually written (the multiply loop does the
        # same); adding the FILE_SIZE limit here over-counted the total.
        cur_total_size += tag_size(current_document)
        cur_number += 1
        current_document = etree.Element('document', version="2")
    # Stop as soon as adding this text would exceed the overall budget.
    if cur_total_size + tag_size(current_document) + text_size > TOTAL_SIZE:
        break
    new_interlinear_text = etree.SubElement(current_document, 'interlinear-text', guid=interlinear_text.get('guid'))
    # Iterate the element directly; getchildren() is deprecated.
    for ch in interlinear_text:
        new_interlinear_text.append(deepcopy(ch))
# Multiply the text: keep appending scrambled copies of the source texts
# until the TOTAL_SIZE budget is used up.
def _scramble_word(word):
    """Scramble the stem morphemes of one <word> element in place."""
    item_txt = word.find('item')
    morphemes = word.find('morphemes')
    if morphemes is None:
        return
    for morph in morphemes.findall('morph'):
        if morph.get('type') != 'stem':
            continue
        for item in morph.findall('item'):
            if not item.text:
                continue
            item.text = morph_sentence(item.text)
            # Mirror the scrambled 'alr-x-lat' stem into the word's own
            # first <item>, when the word has one.
            if item.get('lang') == 'alr-x-lat' and item_txt is not None:
                item_txt.text = item.text


def _scramble_interlinear_text(text_element):
    """Scramble the title, stems and glosses of an <interlinear-text> in place."""
    for title in text_element.findall('item'):
        # An empty title has no text to scramble.
        if title.get('type') == 'title' and title.text:
            title.text = morph_sentence(title.text)
    paragraphs = text_element.find('paragraphs')
    if paragraphs is None:
        return
    for paragraph in paragraphs.findall('paragraph'):
        phrases = paragraph.find('phrases')
        if phrases is None:
            continue
        for phrase in phrases.findall('phrase'):
            words = phrase.find('words')
            # A phrase without <words> still gets its glosses scrambled.
            for word in (words if words is not None else ()):
                _scramble_word(word)
            for item in phrase.findall('item'):
                if item.get('type') == 'gls' and item.text:
                    item.text = morph_sentence(item.text)


source_texts = document.findall('interlinear-text')
# With no source texts the size budget could never be reached and the loop
# below would spin forever, so it is skipped entirely in that case.
flag = bool(source_texts)
while flag:
    for interlinear_text in source_texts:
        text_size = tag_size(interlinear_text)
        # Never flush an empty document: a single text larger than FILE_SIZE
        # would otherwise produce an output file with no texts in it.
        if len(current_document) and tag_size(current_document) + text_size > FILE_SIZE:
            with codecs.open('new_%03i_%s'%(cur_number,FILENAME), 'w', 'utf-8') as out:
                out.write(etree.tostring(current_document, encoding='unicode'))
            cur_total_size += tag_size(current_document)
            cur_number += 1
            current_document = etree.Element('document', version="2")
        # Stop everything once adding this text would exceed the budget.
        if cur_total_size + tag_size(current_document) + text_size > TOTAL_SIZE:
            flag = False
            break
        # Create an empty <interlinear-text/> in the current output document ...
        new_interlinear_text = etree.SubElement(current_document, 'interlinear-text', guid=interlinear_text.get('guid'))
        # ... copy everything from the source text into it ...
        for ch in interlinear_text:
            new_interlinear_text.append(deepcopy(ch))
        # ... and scramble the copy.
        _scramble_interlinear_text(new_interlinear_text)
# Write whatever is left in the current document to one more output file.
if current_document.find('interlinear-text') is not None:
    # ``with`` closes the file even if serialization or the write raises.
    with codecs.open('new_%03i_%s'%(cur_number,FILENAME), 'w', 'utf-8') as out:
        out.write(etree.tostring(current_document, encoding='unicode'))