#!/usr/bin/python
# coding: utf-8
"""Guess the author of a text from its n-gram frequency profile.

Each document is reduced to a dictionary that counts the lower-cased
n-grams found at the start of the text.  The profiles of all documents are
aligned over the union of their n-grams and turned into count vectors.
When run as a script, a scikit-learn ``SVC`` is trained on the labelled
vectors and asked about three held-out texts.

The code deliberately uses only syntax that is valid on both Python 2 and
Python 3 (single-argument ``print(...)`` calls, ``range``).
"""
import codecs

# Length of every n-gram.
N = 4
# Only the first L items (characters or bytes) of a document are profiled.
L = 30000

tolstoy_files = ['AnnaKarenina.txt', 'DvaGusara.txt', 'Detstvo.txt',
                 'Otrochestvo.txt', 'Unost.txt']
tolstoy_example = 'VoynaIMir.txt'
pushkin_files = ['Dubrovsky.txt', 'IstoriyaSelaGorukhino.txt',
                 'PikovayaDama.txt', 'IstoriyaPugacheva.txt',
                 'EgipetskiyeNochi.txt']
pushkin_example = 'KapitanskayaDochka.txt'
lermontoff_files = ['PanoramaMoskvy.txt', 'KnyaginyaLigovskaya.txt',
                    'Kavkazec.txt', 'Shtoss.txt', 'Vadim.txt']
lermontoff_example = 'GeroyNashegoVremeni.txt'


def _count_ngrams(text, n, limit):
    """Count the lower-cased n-grams inside the first `limit` items of `text`."""
    if n < 1:
        raise ValueError("n-gram length must be at least 1, got %r" % (n,))
    # Never run past the end of a short text: slicing there would yield
    # truncated grams and, eventually, a flood of empty strings.
    window = min(limit, len(text))
    profile = {}
    for i in range(window - n + 1):
        if i % 10000 == 0:
            print("i = %s" % i)
        gram = text[i:i + n].lower()
        profile[gram] = profile.get(gram, 0) + 1
    return profile


def generate_profiles(list_of_documents, n=None, limit=None,
                      corpus_dir='Korpusa', encoding=None):
    """Build one n-gram frequency profile per document.

    Args:
        list_of_documents: file names, relative to `corpus_dir`.
        n: n-gram length; defaults to the module-level ``N``.
        limit: how many leading items of each text to profile; defaults to
            the module-level ``L``.
        corpus_dir: directory that holds the documents.
        encoding: passed to ``codecs.open``.  ``None`` keeps the historical
            behaviour of reading the file without an explicit codec.

    Returns:
        A list of dicts (same order as `list_of_documents`) mapping each
        lower-cased n-gram to the number of times it occurs.

    Raises:
        ValueError: if the n-gram length is smaller than 1.
        IOError / OSError: if a document cannot be opened.
    """
    gram_size = N if n is None else n
    max_items = L if limit is None else limit

    # NOTE(review): with encoding=None nothing is decoded explicitly; on
    # Python 2 that means the "n-grams" are byte-grams and lower() leaves
    # non-ASCII letters untouched.  Confirm the corpus encoding and pass it
    # here if character n-grams are intended.
    list_of_profiles = []
    for name in list_of_documents:
        path = '%s/%s' % (corpus_dir, name)
        # Open one file at a time and close it as soon as it has been read.
        with codecs.open(path, 'r', encoding=encoding) as f:
            print(f)
            text = f.read()
        list_of_profiles.append(_count_ngrams(text, gram_size, max_items))
    return list_of_profiles


def merge_several_profiles(list_of_profiles):
    """Give every profile the same key set, filling missing n-grams with 0.

    The profile dicts are updated in place; the returned value is a new list
    holding those same dict objects.
    """
    keys = set()
    for profile in list_of_profiles:
        keys.update(profile)

    for profile in list_of_profiles:
        for key in keys:
            profile.setdefault(key, 0)
    return list(list_of_profiles)


def return_vectors(list_of_merged_profiles):
    """Convert merged profiles into tuples of counts ordered by n-gram.

    Sorting the items by key means position ``i`` refers to the same n-gram
    in every vector, provided all profiles share one key set.
    """
    return [
        tuple(count for _, count in sorted(profile.items()))
        for profile in list_of_merged_profiles
    ]


def make_vectors(list_of_files, list_of_examples, **profile_options):
    """Profile training files and examples together and vectorise them.

    Args:
        list_of_files: training document names.
        list_of_examples: held-out document names.
        **profile_options: forwarded unchanged to `generate_profiles`
            (``n``, ``limit``, ``corpus_dir``, ``encoding``).

    Returns:
        ``(training_vectors, example_vectors)``, both lists of tuples that
        share one feature ordering.
    """
    profiles = generate_profiles(list_of_files + list_of_examples,
                                 **profile_options)
    vectors = return_vectors(merge_several_profiles(profiles))
    # Split on the number of training files rather than with negative
    # indices: vectors[:-0] would be empty when there are no examples.
    split = len(list_of_files)
    return vectors[:split], vectors[split:]


def check(clf, example, author):
    """Return u"да" if `clf` predicts `author` for `example`, else u"нет"."""
    predicted = clf.predict([example])[0]
    return u"да" if predicted == author else u"нет"


def main():
    """Train the classifier on the corpus and print the three verdicts."""
    # Imported here so that the pure helpers above can be imported (and
    # tested) on machines without scikit-learn.
    from sklearn import svm

    vectors, examples = make_vectors(
        tolstoy_files + pushkin_files + lermontoff_files,
        [tolstoy_example, pushkin_example, lermontoff_example])

    classes = ([u'Толстой'] * len(tolstoy_files)
               + [u'Пушкин'] * len(pushkin_files)
               + [u'Лермонтов'] * len(lermontoff_files))
    clf = svm.SVC()
    clf.fit(vectors, classes)

    print(u'Является ли автором "Войны и мира" Пушкин? Ответ - %s.'
          % check(clf, examples[0], u"Пушкин"))
    print(u'Является ли автором "Капитанской дочки" Пушкин? Ответ - %s.'
          % check(clf, examples[1], u"Пушкин"))
    print(u'Является ли автором "Героя нашего времени" Лермонтов? Ответ - %s.'
          % check(clf, examples[2], u"Лермонтов"))


if __name__ == '__main__':
    main()