usr bin python coding utf-8 import codecs from sklearn feature_extract

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/python
# coding: utf-8
import codecs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
def get_data(menu_file):
    """Build the training corpus from the first two thirds of the menu lines.

    Every line is split on tab characters into ``num``, ``kitchen`` and the
    remaining menu fields. A line is skipped when:

    * it has three or fewer tab-separated fields,
    * ``num`` is longer than six characters,
    * ``kitchen`` is empty, or
    * ``kitchen`` contains a ``/``.

    No stripping is performed, so any line terminator present on a line
    stays attached to the last menu field.

    Args:
        menu_file: list of text lines.

    Returns:
        A list of ``(kitchen, menu_text)`` tuples, where ``menu_text`` is the
        menu fields joined with single spaces.
    """
    # Floor division keeps the split point an integer on both Python 2 and
    # Python 3 (true division would yield a float there).
    train_end = 2 * len(menu_file) // 3

    data = []
    for raw_line in menu_file[:train_end]:
        fields = raw_line.split("\t")
        if len(fields) <= 3:
            continue
        num, kitchen, menu = fields[0], fields[1], fields[2:]
        # At least four fields are guaranteed here, so ``menu`` always holds
        # two or more entries and needs no separate length check.
        if len(num) > 6 or kitchen == "" or "/" in kitchen:
            continue
        data.append((kitchen, u' '.join(menu)))
    return data
def suggest(menu, clf, vectorizer):
    """Predict the label for a single menu text.

    Args:
        menu: the menu text to classify.
        clf: object exposing ``predict``; called with the dense feature rows.
        vectorizer: object exposing ``transform``; called with a one-element
            list containing ``menu``.

    Returns:
        The first (and only) element of ``clf.predict(...)``.
    """
    features = vectorizer.transform([menu])
    # Converted to a dense array before prediction, as in the original call.
    dense_rows = features.toarray()
    predictions = clf.predict(dense_rows)
    return predictions[0]
def test(menu_file, clf, vectorizer):
    """Print predicted versus actual kitchen for the last third of the lines.

    Lines from index ``2 * len(menu_file) // 3`` onward are split on tabs
    into ``num``, ``kitchen`` and the remaining menu fields. A line is
    skipped when it has three or fewer fields, when ``num`` is longer than
    six characters, or when ``kitchen`` is empty or contains ``/``.
    For every remaining line one report line is printed with ``num``, the
    label returned by ``suggest`` and the actual ``kitchen``.

    Args:
        menu_file: list of text lines.
        clf: fitted classifier, forwarded to ``suggest``.
        vectorizer: fitted vectorizer, forwarded to ``suggest``.
    """
    # Floor division keeps the split point an integer on both Python 2 and
    # Python 3 (true division would yield a float there).
    test_start = 2 * len(menu_file) // 3

    for raw_line in menu_file[test_start:]:
        fields = raw_line.split("\t")
        if len(fields) <= 3:
            continue
        num, kitchen, menu = fields[0], fields[1], fields[2:]
        # At least four fields are guaranteed here, so ``menu`` always holds
        # two or more entries and needs no separate length check.
        if len(num) > 6 or kitchen == "" or "/" in kitchen:
            continue
        predicted = suggest(u' '.join(menu), clf, vectorizer)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(u"Кухня №%s - %s (на самом деле - %s)" % (num, predicted, kitchen))
def main():
    """Train an SVC on ``menu.txt`` and print predictions for held-out lines.

    Reads ``menu.txt`` as UTF-8, builds the training corpus with
    ``get_data``, fits a ``CountVectorizer`` and an ``SVC`` on it, then
    passes all lines together with the fitted models to ``test``.
    """
    # The context manager closes the file handle as soon as the lines are
    # read instead of leaving it open until garbage collection.
    with codecs.open('menu.txt', 'r', 'utf-8') as source:
        menu_file = source.readlines()

    vectorizer = CountVectorizer(min_df=1)
    corpus = get_data(menu_file)
    y = [kitchen for kitchen, _ in corpus]
    # Dense features: ``suggest`` also hands dense arrays to the classifier.
    X = vectorizer.fit_transform(text for _, text in corpus).toarray()

    clf = SVC()
    clf.fit(X, y)
    test(menu_file, clf, vectorizer)
# Script entry point: the pipeline runs as soon as this module is executed.
main()