#!/usr/bin/python
# coding: utf-8
"""Train an SVM text classifier on menu.txt and print its predictions.

Each line of ``menu.txt`` is split on tabs.  The second field is used as
the class label and every field from the third onward is joined into the
text that gets vectorized.  The first two thirds of the file's lines are
used for training; the remaining lines are classified and the expected
label is printed next to the predicted one.
"""
from __future__ import print_function

import codecs

from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer

MENU_PATH = 'menu.txt'


def _parse_record(line):
    """Split one raw line into ``(label, text)``, or return None to skip it.

    A line is kept only when it has more than three tab-separated fields,
    its first field is at most six characters long, and its second field
    (the label) is non-empty and contains no "/".
    """
    fields = line.strip().split("\t")
    if len(fields) <= 3:
        return None
    num, typ = fields[0], fields[1]
    if not typ or len(num) > 6 or "/" in typ:
        return None
    return typ, u' '.join(fields[2:])


def make_corpus():
    """Read the first two thirds of the lines from ``menu``.

    Returns:
        A list of ``(label, text)`` tuples, one per accepted line.

    Reads from the module-level ``menu`` handle and leaves its position
    just past the training portion, which is where ``test`` continues.
    """
    corpus = []
    # Floor division keeps the line count an int on both Python 2 and 3.
    for _ in range(2 * num_lines // 3):
        record = _parse_record(menu.readline())
        if record is not None:
            corpus.append(record)
    return corpus


def make_assumption(words):
    """Return the label the trained classifier predicts for ``words``.

    Uses the module-level ``vectorizer`` and ``clf``, so both must already
    be fitted when this is called.
    """
    fvect = vectorizer.transform([words]).toarray()
    return clf.predict(fvect)[0]


def test():
    """Classify the remaining lines of ``menu`` and print the results.

    For every accepted line this prints the label found in the file, the
    label predicted by ``make_assumption``, and then an empty line.
    """
    for _ in range(2 * num_lines // 3, num_lines):
        record = _parse_record(menu.readline())
        if record is None:
            continue
        typ, words = record
        print("Кухня ", typ)
        print("Предположение программы -", make_assumption(words))
        print()


# Count lines with the same UTF-8 codecs reader that later serves readline(),
# so the count and the actual reads agree on where one line ends.
with codecs.open(MENU_PATH, 'r', 'utf-8') as counted:
    num_lines = sum(1 for _ in counted)

# make_corpus() and test() share this handle and its read position.
menu = codecs.open(MENU_PATH, 'r', 'utf-8')
try:
    vectorizer = CountVectorizer(min_df=1)
    corpus = make_corpus()
    y = [label for label, _ in corpus]
    X = vectorizer.fit_transform(text for _, text in corpus).toarray()
    clf = svm.SVC()
    clf.fit(X, y)
    test()
finally:
    menu.close()