usr bin python coding utf-8 import codecs from sklearn import svm def

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/python
# coding: utf-8
import codecs
from sklearn import svm
def generate_profiles(list_of_documents, n=None, length=None, corpus_dir='Korpusa'):
    """Build one lower-cased N-gram frequency profile per document.

    For every document the window ``text[i:i + n]`` is slid over the first
    ``length`` characters of the file and each lower-cased gram is counted.

    Args:
        list_of_documents: File names, resolved relative to ``corpus_dir``.
        n: Gram size. Defaults to the module-level ``N``.
        length: How many leading characters of each document to scan.
            Defaults to the module-level ``L``.
        corpus_dir: Directory holding the documents.

    Returns:
        A list of dicts (gram -> occurrence count), one per document, in the
        same order as ``list_of_documents``.

    Raises:
        IOError: If a document cannot be opened.
    """
    if n is None:
        n = N
    if length is None:
        length = L

    list_of_profiles = []
    for name in list_of_documents:
        # Open one file at a time and close it as soon as it has been read,
        # instead of keeping every corpus file open for the whole run.
        # NOTE(review): no encoding is passed to codecs.open, so under
        # Python 2 the text is presumably a byte string and the grams are
        # byte N-grams -- confirm whether decoding the corpus is intended.
        with codecs.open('%s/%s' % (corpus_dir, name), 'r') as f:
            # Single-argument print(...) behaves the same as the Python 2
            # print statement used elsewhere in this file.
            print(f)
            text = f.read()

        # Never slide the window past the end of a short document; doing so
        # would count truncated tail fragments and the empty string as grams.
        limit = min(length, len(text))

        profile = {}
        for i in range(limit - n + 1):
            if (i % 10000) == 0:
                print("i = %s" % i)
            gram = text[i:i + n].lower()
            # Direct dict lookup; avoids scanning a list of keys per gram.
            profile[gram] = profile.get(gram, 0) + 1
        list_of_profiles.append(profile)
    return list_of_profiles
def merge_several_profiles(list_of_profiles):
    """Give every profile the same key set by zero-filling missing grams.

    The union of all keys across the profiles is computed, and every key a
    profile lacks is added to it with a count of 0.

    Args:
        list_of_profiles: List of dicts mapping gram -> count. The dicts are
            modified in place.

    Returns:
        A new list holding the same (now padded) dict objects, in the
        original order.
    """
    keys = set()
    for profile in list_of_profiles:
        keys.update(profile)

    for profile in list_of_profiles:
        for key in keys:
            # setdefault is a single hash lookup; testing membership against
            # profile.keys() scans a list under Python 2.
            profile.setdefault(key, 0)

    return list(list_of_profiles)
def return_vectors(list_of_merged_profiles):
    """Turn each profile into a tuple of its counts, ordered by sorted key.

    Args:
        list_of_merged_profiles: List of dicts mapping gram -> count.

    Returns:
        A list of tuples, one per profile, holding that profile's values in
        ascending key order.
    """
    return [
        tuple(profile[gram] for gram in sorted(profile))
        for profile in list_of_merged_profiles
    ]
def make_vectors(list_of_files, list_of_examples):
    """Build feature vectors for the training files and the example files.

    All documents are profiled together and merged so that every vector is
    built over the same set of grams.

    Args:
        list_of_files: Training document names.
        list_of_examples: Document names to be classified.

    Returns:
        A pair ``(training_vectors, example_vectors)``; each is a list of
        tuples in the same order as the corresponding input list.
    """
    profiles = generate_profiles(list_of_files + list_of_examples)
    merged = merge_several_profiles(profiles)
    vectors = return_vectors(merged)
    # Split at the number of training files rather than slicing with -n:
    # with no examples, vectors[:-0] would be empty and vectors[-0:] would
    # contain every vector.
    split = len(list_of_files)
    return vectors[:split], vectors[split:]
def check(clf, example, author):
    """Answer whether the classifier attributes ``example`` to ``author``.

    Args:
        clf: Object whose ``predict`` method takes a list of samples and
            returns a sequence of predicted labels.
        example: A single feature vector.
        author: The label to compare the prediction against.

    Returns:
        u"да" when the predicted label equals ``author``, otherwise u"нет".
    """
    predicted = clf.predict([example])[0]
    return u"да" if predicted == author else u"нет"
# Size of the character grams counted by generate_profiles.
N = 4
# Upper bound used by generate_profiles for the sliding-window range.
L = 30000

# Training documents and one held-out example per author.
tolstoy_files = [
    'AnnaKarenina.txt',
    'DvaGusara.txt',
    'Detstvo.txt',
    'Otrochestvo.txt',
    'Unost.txt',
]
tolstoy_example = 'VoynaIMir.txt'

pushkin_files = [
    'Dubrovsky.txt',
    'IstoriyaSelaGorukhino.txt',
    'PikovayaDama.txt',
    'IstoriyaPugacheva.txt',
    'EgipetskiyeNochi.txt',
]
pushkin_example = 'KapitanskayaDochka.txt'

lermontoff_files = [
    'PanoramaMoskvy.txt',
    'KnyaginyaLigovskaya.txt',
    'Kavkazec.txt',
    'Shtoss.txt',
    'Vadim.txt',
]
lermontoff_example = 'GeroyNashegoVremeni.txt'
# Vectorise the training documents and the three held-out examples together
# so that all vectors are built over the same set of grams.
vectors, examples = make_vectors(
    tolstoy_files + pushkin_files + lermontoff_files,
    [tolstoy_example, pushkin_example, lermontoff_example],
)

clf = svm.SVC()
# One label per training document, in the same order as the file lists were
# concatenated above. The counts are derived from the lists themselves so the
# labels stay aligned if a document is added or removed.
classes = (
    [u'Толстой'] * len(tolstoy_files)
    + [u'Пушкин'] * len(pushkin_files)
    + [u'Лермонтов'] * len(lermontoff_files)
)
clf.fit(vectors, classes)

# examples follows the order passed to make_vectors: Tolstoy, Pushkin,
# Lermontov. Single-argument print(...) is equivalent to the Python 2 print
# statement.
print(u'Является ли автором "Войны и мира" Пушкин? Ответ - %s.' % check(clf, examples[0], u"Пушкин"))
print(u'Является ли автором "Капитанской дочки" Пушкин? Ответ - %s.' % check(clf, examples[1], u"Пушкин"))
print(u'Является ли автором "Героя нашего времени" Лермонтов? Ответ - %s.' % check(clf, examples[2], u"Лермонтов"))