#!/usr/bin/python
# coding: utf-8
import codecs
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
def make_corpus():
    """Collect labelled training samples from the module-level ``menu`` file.

    Reads ``2 * num_lines / 3`` lines (integer division) from the current
    position of the global ``menu`` handle. Each line is stripped and split
    on tabs; a line contributes a sample only when it has more than three
    fields, its first field is at most six characters long, and its second
    field is non-empty and contains no ``/``.

    Returns:
        A list of ``(typ, words)`` tuples, where ``typ`` is the second field
        of the line and ``words`` is every field from the third onward joined
        with single spaces.
    """
    samples = []
    train_count = 2 * num_lines // 3
    for _ in xrange(train_count):
        fields = menu.readline().strip().split("\t")
        # Lines with three or fewer tab-separated fields are ignored.
        if len(fields) <= 3:
            continue
        num, typ = fields[0], fields[1]
        # Skip lines with a long first field or an empty / slash-bearing label.
        if len(num) > 6 or not typ or "/" in typ:
            continue
        samples.append((typ, u' '.join(fields[2:])))
    return samples
def make_assumption(words):
    """Return the classifier's predicted label for one text sample.

    The text is vectorized with the module-level ``vectorizer``, converted to
    a dense array, and passed to the module-level ``clf``; both globals must
    already be fitted when this is called.

    Args:
        words: A single string to classify.

    Returns:
        The first (and only) element of ``clf.predict`` for this sample.
    """
    sparse_features = vectorizer.transform([words])
    dense_features = sparse_features.toarray()
    predictions = clf.predict(dense_features)
    return predictions[0]
def test():
for i in xrange(2*num_lines/3, num_lines):
rest = menu.readline().strip()
rest_spl = rest.split("\t")
if len(rest_spl) > 3:
num, typ = rest_spl[0], rest_spl[1]
if typ != "" and len(num) <= 6 and len(typ) > 0 and typ.count("/") == 0:
words = u' '.join(rest_spl[2:])
print "Кухня ", typ
print "Предположение программы -", make_assumption(words)
print
# Count the lines up front: make_corpus() consumes the first 2/3 of them and
# test() the remainder. The counting handle is closed as soon as it is done.
with open('menu.txt') as counting_handle:
    num_lines = sum(1 for line in counting_handle)

# `menu` is a single shared handle: make_corpus() and test() both call
# menu.readline(), so test() continues from where make_corpus() stopped.
menu = codecs.open('menu.txt', 'r', 'utf-8')
try:
    vectorizer = CountVectorizer(min_df=1)
    corpus = make_corpus()
    # Labels and texts are the first and second members of each corpus tuple.
    y = [a for a, b in corpus]
    X = vectorizer.fit_transform((b for a, b in corpus)).toarray()
    clf = svm.SVC()
    clf.fit(X, y)
    test()
finally:
    # Release the file even if training or evaluation raises.
    menu.close()