from pandas import read_table
import math
import numpy as np
from collections import defaultdict
def H(*values):
    """Return the Shannon entropy, in bits, of the given weights.

    The arguments are treated as unnormalised counts: they are divided by
    their sum to form a distribution, and entries that are not strictly
    positive contribute nothing to the result.

    When the weights sum to zero (for example ``H(0, 0)`` or ``H()``) the
    entropy is taken to be 0.0 instead of performing a 0/0 division.
    """
    weights = np.asarray(values) * 1.0
    total = sum(weights)
    if total == 0:
        # Nothing to normalise; dividing here would yield NaN entries and a
        # NumPy RuntimeWarning even though the final answer is still zero.
        return 0.0
    probabilities = weights / total
    return sum(-p * math.log(p, 2) for p in probabilities if p > 0.0)
def calc_pn(p, n, P, N):
    """Entropy reduction obtained by splitting (P, N) into (p, n) and the rest.

    ``P`` and ``N`` are the counts of the two classes before the split;
    ``p`` and ``n`` are the counts of those classes inside one partition.
    The remaining partition therefore holds ``P - p`` and ``N - n``.

    Returns a one-element tuple ``(gain,)``.
    """
    total = P + N
    inside_weight = (p + n) * 1.0 / total
    outside_weight = (total - p - n) * 1.0 / total
    before = H(P, N)
    inside = inside_weight * H(p, n)
    outside = outside_weight * H(P - p, N - n)
    # NOTE(review): the result is wrapped in a 1-tuple (the original ended the
    # return expression with a trailing comma). This looks accidental, but the
    # shape is preserved here because callers may depend on it - confirm.
    return (before - inside - outside,)
def read_csv():
    """Load ``./train.csv`` as a whitespace-delimited table.

    The first line of the file is used as the header and the first column
    becomes the index of the returned DataFrame.
    """
    # Use ``\s+`` rather than ``\s*``: a separator pattern that can match the
    # empty string makes ``re.split`` (Python 3.7+) break between every
    # character, whereas older interpreters silently ignored empty matches.
    return read_table('./train.csv', header=0, sep=r"\s+", index_col=0)
# Script body: for every attribute column, compute the best value returned by
# calc_pn over that attribute's distinct values, then print the results.
# The code relies on the last column being named 'Class' and on the class
# labels being the strings 'Yes' and 'No'.
df = read_csv()

# Look the class totals up by label. Unpacking ``dict(...).values()`` depends
# on dict ordering and can hand the 'No' count to P, while calc_pn below is
# always called with the 'Yes' count first.
class_totals = df['Class'].value_counts()
P = class_totals.get('Yes', 0)
N = class_totals.get('No', 0)

gain = {}
for column in df.columns.values[:-1]:
    # Maps (attribute value, class label) -> number of rows.
    pair_counts = dict(df.groupby([column, 'Class'])['Class'].count())
    attribute_values = {key[0] for key in pair_counts}
    # A missing (value, label) pair means no such rows exist, i.e. a count of 0.
    gain[column] = max(
        calc_pn(
            pair_counts.get((value, 'Yes'), 0),
            pair_counts.get((value, 'No'), 0),
            P,
            N,
        )
        for value in attribute_values
    )

# ``list(...)`` keeps the printed form a plain list of pairs on both
# Python 2 and Python 3.
print(list(gain.items()))