from pandas import read_table import math import numpy as np from coll

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from pandas import read_table
import math
import numpy as np
from collections import defaultdict
def H(*values):
    """Return the Shannon entropy, in bits, of a distribution given as counts.

    The arguments are non-negative weights (typically class counts).  They
    are normalised by their sum to obtain probabilities, and entries that
    are not strictly positive contribute nothing to the result.

    When the weights sum to zero (no observations, or no arguments at all)
    the entropy is defined here as 0.0.  Without this guard the
    normalisation would compute 0/0, which makes NumPy emit a
    RuntimeWarning and fill the array with NaN.
    """
    counts = np.asarray(values) * 1.0  # force float arithmetic
    total = sum(counts)
    if total == 0:
        return 0.0
    probabilities = counts / total
    return sum(-p * math.log(p, 2) for p in probabilities if p > 0.0)
def calc_pn(p, n, P, N):
    """Return the information gain of a binary split, wrapped in a 1-tuple.

    ``P`` and ``N`` are the two class totals for the whole data set; ``p``
    and ``n`` are the corresponding totals inside one branch of the split.
    The other branch therefore holds ``P - p`` and ``N - n``.  The gain is
    the entropy of the whole set minus the size-weighted entropies of the
    two branches.

    NOTE(review): the result is returned as a one-element tuple, which
    looks like an accidental trailing comma in the original expression.
    It is kept as-is because callers receive (and compare) that tuple.
    """
    total = P + N
    base_entropy = H(P, N)
    # ``* 1.0`` forces true division even when the counts are integers.
    inside_term = (p + n) * 1.0 / total * H(p, n)
    outside_term = (total - p - n) * 1.0 / total * H(P - p, N - n)
    return (base_entropy - inside_term - outside_term,)
def read_csv():
    """Load ``./train.csv`` as a whitespace-delimited table.

    The first row is used as the header and the first column as the row
    index.

    The separator is ``\\s+`` (one or more whitespace characters).  The
    previous pattern ``\\s*`` also matches the empty string, so on
    interpreters where ``re.split`` splits on empty matches every single
    character would become its own field.
    """
    return read_table('./train.csv', header=0, sep=r"\s+", index_col=0)
# Load the training table; the last column is expected to be 'Class'.
df = read_csv()

# Class totals for the whole data set.  They are looked up by label
# rather than unpacked from ``dict(...).values()``, whose iteration order
# does not guarantee that the 'Yes' count comes first.
class_counts = df.groupby('Class')['Class'].count()
P = class_counts.get('Yes', 0)
N = class_counts.get('No', 0)

# For every attribute column (everything except the trailing 'Class'
# column), evaluate the split "attribute == value" versus the rest for
# each distinct value, and keep the best gain found.
gain = {}
for column in df.columns.values[:-1]:
    # (value, class) -> row count; combinations that never occur count as 0.
    features = defaultdict(int)
    features.update(dict(df.groupby([column, 'Class'])['Class'].count()))
    # Collect the distinct values first: indexing the defaultdict below
    # inserts missing keys, so it must not be iterated at the same time.
    distinct_values = {key[0] for key in features}
    gain[column] = max(
        calc_pn(features[(value, 'Yes')], features[(value, 'No')], P, N)
        for value in distinct_values
    )

# list(...) keeps the printed form identical on Python 2 and Python 3.
print(list(gain.items()))