#!/usr/bin/python # coding: utf-8 import codecs from sklearn.feature_extraction.text import CountVectorizer from sklearn.svm import SVC def get_data(menu_file): data = [] for i in xrange(2*len(menu_file)/3): line = menu_file[i].split("\t") if len(line) <= 3: continue num, kitchen, menu = line[0], line[1], line[2:] if len(num) > 6 or len(menu) <= 1 or kitchen == "" or len(kitchen.split("/")) > 1: continue data.append((kitchen, u' '.join(menu))) return data def suggest(menu, clf, vectorizer): return clf.predict(vectorizer.transform([menu]).toarray())[0] def test(menu_file, clf, vectorizer): for i in xrange(2*len(menu_file)/3, len(menu_file)): line = menu_file[i].split("\t") if len(line) <= 3: continue num, kitchen, menu = line[0], line[1], line[2:] if len(num) > 6 or len(menu) <= 1 or kitchen == "" or len(kitchen.split("/")) > 1: continue print u"Кухня №%s - %s (на самом деле - %s)" % (num, suggest(u' '.join(menu), clf, vectorizer), kitchen) def main(): menu_file = codecs.open('menu.txt', 'r', 'utf-8').readlines() vectorizer = CountVectorizer(min_df=1) corpus = get_data(menu_file) y = [a for a,b in corpus] X = vectorizer.fit_transform((b for a,b in corpus)).toarray() clf = SVC() clf.fit(X, y) test(menu_file, clf, vectorizer) main()