# Kaggle etalon (reference solution): k-nearest-neighbours digit classifier.
import math
import numpy as np
from multiprocessing import Pool
from sklearn.neighbors import KDTree
from sklearn.base import BaseEstimator, ClassifierMixin
class DigitClassifier(BaseEstimator, ClassifierMixin):
    """k-nearest-neighbours classifier backed by a KD-tree.

    A sample is assigned the label that occurs most often among its ``k``
    nearest training samples. On a tie, the smallest label value wins
    (``np.unique`` returns sorted values and ``argmax`` picks the first
    maximum).
    """

    def __init__(self, k):
        """Store ``k``, the number of neighbours that vote on each sample."""
        self.k = k

    def fit(self, data, label):
        """Index the training samples in a KD-tree and remember their labels.

        Parameters
        ----------
        data : array-like, one row per training sample.
        label : array-like, one label per row of ``data``.

        Returns
        -------
        self, so calls can be chained (scikit-learn estimator convention).
        """
        self.kd = KDTree(data)
        # np.asarray lets plain lists be fancy-indexed in predict().
        self.labels = np.asarray(label)
        return self

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like, one row per sample to classify.

        Returns
        -------
        numpy.ndarray of ints with one predicted label per row of ``X``.
        """
        X = np.asarray(X)
        M = X.shape[0]
        # np.int was removed from NumPy; the builtin int is the same dtype.
        res = np.zeros(M, dtype=int)
        if M == 0:
            return res
        # One batched query: KDTree.query expects a 2-D array of samples and
        # returns the neighbour indices as an (M, k) array.
        _, label_indices = self.kd.query(X, k=self.k)
        closest_labels = self.labels[label_indices]
        for i, row_labels in enumerate(closest_labels):
            uniqs, counts = np.unique(row_labels, return_counts=True)
            res[i] = uniqs[counts.argmax()]
        return res
# One-time conversion of the CSV files into raw uint8 dumps.
# Uncomment and run once to (re)create the raw_* files that are read below.
#temp_train_base = np.loadtxt('train.csv', delimiter=',', skiprows=1, dtype=np.uint8)
#temp_train_data = temp_train_base[:, 1:]
#temp_train_labels = temp_train_base[:, 0]
#temp_test_base = np.loadtxt('test.csv', delimiter=',', skiprows=1, dtype=np.uint8)
#temp_train_data.tofile('raw_train_data')
#temp_train_labels.tofile('raw_train_labels')
#temp_test_base.tofile('raw_test_base')

# Load the raw dumps; sample matrices are reshaped to 784 values per row.
train_data = np.fromfile('raw_train_data', dtype=np.uint8)
train_data = train_data.reshape(-1, 784)
train_labels = np.fromfile('raw_train_labels', dtype=np.uint8)
test_data = np.fromfile('raw_test_base', dtype=np.uint8)
test_data = test_data.reshape(-1, 784)
def build_and_train():
    """Create a DigitClassifier with k=5 and fit it on the module-level data.

    Reads the globals ``train_data`` and ``train_labels`` and returns the
    fitted classifier.
    """
    global train_data, train_labels
    model = DigitClassifier(5)
    model.fit(train_data, train_labels)
    return model
def classify(clf, data):
    """Return whatever ``clf.predict`` produces for ``data``."""
    predictions = clf.predict(data)
    return predictions
def classify_wrapper(args):
    """Adapter for ``Pool.map``: expand ``args`` into ``classify``'s arguments.

    ``args`` is a ``(clf, data)`` sequence; the result of ``classify`` is
    returned unchanged.
    """
    predictions = classify(*args)
    return predictions
proc_number = 4

# The guard keeps worker processes from re-running the driver when they
# re-import this module (which the "spawn" start method does).
if __name__ == "__main__":
    # The context manager shuts the worker processes down once results are in.
    with Pool(proc_number) as p:
        # Make and fit proc_number classifiers
        pending = [p.apply_async(build_and_train) for _ in range(proc_number)]
        clfs = [res.get() for res in pending]
        # And predict test data: one chunk per classifier. array_split follows
        # proc_number and tolerates row counts that do not divide evenly.
        test_data_parted = np.array_split(test_data, proc_number)
        res_list = p.map(classify_wrapper, zip(clfs, test_data_parted))
    all_res = np.hstack(res_list)
    with open("kaggle_result.csv", "w") as f:
        # NOTE(review): the header is kept exactly as it was; confirm the
        # column-name capitalisation the competition expects.
        f.write('ImageId,label\n')
        # Image ids are 1-based.
        f.writelines(
            '{},{}\n'.format(i, res) for i, res in enumerate(all_res, start=1)
        )