usr bin python coding utf-8 import codecs from sklearn import svm from

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/python
# coding: utf-8
import codecs
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
def make_corpus():
    """Build the training corpus from the first two thirds of the menu file.

    Calls ``readline()`` on the module-level ``menu`` stream exactly
    ``2 * num_lines // 3`` times, so the stream is left positioned just
    after the training portion.

    Each line is stripped and split on tabs.  A line contributes an entry
    only when it has more than three fields, its first field is at most
    six characters long, and its second field (the label) is non-empty
    and contains no "/".

    Returns:
        A list of ``(label, words)`` tuples, where ``words`` is every
        field from the third one onwards joined with single spaces.
    """
    corpus = []
    # Floor division keeps the bound an integer even under true division
    # (Python 3, or ``from __future__ import division``); for Python 2
    # ints it yields the same value as ``/``.
    train_count = 2 * num_lines // 3
    for _ in range(train_count):
        fields = menu.readline().strip().split("\t")
        if len(fields) <= 3:
            continue
        num, typ = fields[0], fields[1]
        # Skip rows with no label, an over-long first field, or a label
        # containing "/".
        if not typ or len(num) > 6 or "/" in typ:
            continue
        corpus.append((typ, u' '.join(fields[2:])))
    return corpus
def make_assumption(words):
    """Return the classifier's predicted label for one string of words.

    ``words`` is wrapped in a one-element list, transformed by the
    module-level ``vectorizer``, converted to a dense array, and passed to
    the module-level ``clf``; the first (only) prediction is returned.
    """
    features = vectorizer.transform([words])
    dense_features = features.toarray()
    predictions = clf.predict(dense_features)
    return predictions[0]
def test():
    """Print the recorded label and the predicted label for the remaining lines.

    Calls ``readline()`` on the module-level ``menu`` stream once for each
    index from ``2*num_lines/3`` up to ``num_lines - 1``, continuing from
    wherever the stream is currently positioned.  For every line that
    passes the filter below, prints the label stored in the line, the
    label returned by ``make_assumption`` for the line's words, and a
    blank line.  Returns nothing.

    NOTE(review): the body uses Python 2 ``print`` statements and
    ``xrange``; it is not valid Python 3 as written.
    """
    for i in xrange(2*num_lines/3, num_lines):
        rest = menu.readline().strip()
        # Tab-separated fields: [0] is bound to num, [1] to typ (the label),
        # and everything from [2] onwards is joined into the words string.
        rest_spl = rest.split("\t")
        if len(rest_spl) > 3:
            num, typ = rest_spl[0], rest_spl[1]
            # Same filter make_corpus applies: non-empty label without "/",
            # first field at most six characters long.
            if typ != "" and len(num) <= 6 and len(typ) > 0 and typ.count("/") == 0:
                words = u' '.join(rest_spl[2:])
                # Russian: "Cuisine" followed by the label read from the file.
                print "Кухня ", typ
                # Russian: "The program's guess -" followed by the prediction.
                print "Предположение программы -", make_assumption(words)
                # Bare print emits an empty line between entries.
                print
# Count the lines up front: make_corpus() and test() use num_lines to split
# the file into a leading two-thirds portion and the remainder.
with open('menu.txt') as line_counter:
    num_lines = sum(1 for line in line_counter)

# Both make_corpus() and test() read from this shared stream in sequence, so
# it must stay open until test() has finished; the with-block closes it
# afterwards, including when training or prediction raises.
with codecs.open('menu.txt', 'r', 'utf-8') as menu:
    vectorizer = CountVectorizer(min_df=1)
    corpus = make_corpus()
    # Labels and documents are kept in the same order as the corpus entries.
    y = [a for a, b in corpus]
    X = vectorizer.fit_transform((b for a, b in corpus)).toarray()
    clf = svm.SVC()
    clf.fit(X, y)
    test()