#!/usr/bin/python
# coding: utf-8
import codecs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
def get_data(menu_file):
    """Build the training set from the first two thirds of the menu lines.

    Every line is split on tabs into a leading number field, a kitchen
    label and the remaining fields, which are treated as menu items.

    Args:
        menu_file: list of text lines (for example the result of
            ``readlines()``); trailing newlines are kept as-is.

    Returns:
        A list of ``(kitchen, menu_text)`` tuples where ``menu_text`` is the
        menu items of one line joined with single spaces.
    """
    # Floor division keeps the split point an integer on Python 2 and 3 alike.
    train_size = 2 * len(menu_file) // 3
    data = []
    for raw_line in menu_file[:train_size]:
        fields = raw_line.split("\t")
        # Require number + kitchen + at least two menu items; this also
        # guarantees the menu slice below holds more than one entry.
        if len(fields) <= 3:
            continue
        num, kitchen, menu = fields[0], fields[1], fields[2:]
        # Skip rows with a number field longer than 6 characters, rows with
        # no kitchen label, and rows whose label contains "/".
        if len(num) > 6 or kitchen == "" or "/" in kitchen:
            continue
        data.append((kitchen, u' '.join(menu)))
    return data
def suggest(menu, clf, vectorizer):
    """Return the label that ``clf`` predicts for a single menu string.

    The menu is vectorized as a one-document batch, converted to a dense
    array, and the first (only) prediction is returned.
    """
    features = vectorizer.transform([menu])
    dense_features = features.toarray()
    predictions = clf.predict(dense_features)
    return predictions[0]
def test(menu_file, clf, vectorizer):
    """Print a prediction for every usable line in the last third of the menu.

    Lines are parsed and filtered with the same rules as the training
    split; for each line that passes, the predicted kitchen is printed next
    to the label stored in the file.

    Args:
        menu_file: list of tab-separated text lines.
        clf: fitted classifier, passed through to ``suggest``.
        vectorizer: fitted vectorizer, passed through to ``suggest``.
    """
    # Floor division keeps the split point an integer on Python 2 and 3 alike.
    holdout_start = 2 * len(menu_file) // 3
    for raw_line in menu_file[holdout_start:]:
        fields = raw_line.split("\t")
        # Require number + kitchen + at least two menu items.
        if len(fields) <= 3:
            continue
        num, kitchen, menu = fields[0], fields[1], fields[2:]
        # Skip rows with a number field longer than 6 characters, rows with
        # no kitchen label, and rows whose label contains "/".
        if len(num) > 6 or kitchen == "" or "/" in kitchen:
            continue
        predicted = suggest(u' '.join(menu), clf, vectorizer)
        # Message reads: "Kitchen #<num> - <predicted> (actually - <label>)".
        # A single parenthesized argument prints the same on Python 2 and 3.
        print(u"Кухня №%s - %s (на самом деле - %s)" % (num, predicted, kitchen))
def main():
    """Train an SVM on ``menu.txt`` and print predictions for held-out lines.

    Reads the UTF-8 file ``menu.txt`` from the current directory, fits a
    bag-of-words ``CountVectorizer`` and an ``SVC`` on the training split
    returned by ``get_data``, then calls ``test`` to print predictions for
    the remaining lines.
    """
    # The context manager closes the file handle as soon as the lines are read.
    with codecs.open('menu.txt', 'r', 'utf-8') as menu_handle:
        menu_file = menu_handle.readlines()
    vectorizer = CountVectorizer(min_df=1)
    corpus = get_data(menu_file)
    labels = [kitchen for kitchen, _ in corpus]
    # Features are densified here; suggest() densifies the same way before
    # calling predict().
    features = vectorizer.fit_transform(text for _, text in corpus).toarray()
    clf = SVC()
    clf.fit(features, labels)
    test(menu_file, clf, vectorizer)
# Run the train-and-evaluate pipeline. This call is unguarded, so it executes
# on import as well as when the file is run as a script.
main()