data 31 Auto Rus Soft 25 Cel Rus Hard NM 18 None Ukr Soft 45 Health Ka

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Training records, one list per respondent.
# Column 1 is numeric and is bucketed below; column 3 is the country code and
# column 4 is the Soft/Hard label used as the target when computing gain.
# NOTE(review): columns 0 and 2 look like marital status ('M'/'NM') and an
# interest category -- confirm against the data source.
data = [
    ['M', 31, 'Auto', 'Rus', 'Soft'],
    ['M', 25, 'Cel', 'Rus', 'Hard'],
    ['NM', 18, 'None', 'Ukr', 'Soft'],
    ['M', 45, 'Health', 'Kaz', 'Soft'],
    ['NM', 23, 'Auto', 'Ukr', 'Soft'],
    ['NM', 45, 'None', 'Rus', 'Hard'],
    ['M', 31, 'Auto', 'Kaz', 'Hard'],
    ['NM', 27, 'Cel', 'Rus', 'Soft'],
    ['M', 39, 'Health', 'Ukr', 'Soft'],
    ['NM', 32, 'Cel', 'Kaz', 'Soft'],
    ['NM', 22, 'None', 'Ukr', 'Hard'],
    ['NM', 28, 'None', 'Rus', 'Hard'],
    ['M', 27, 'Auto', 'Rus', 'Soft'],
    ['M', 41, 'Health', 'Kaz', 'Hard'],
    ['M', 33, 'Cel', 'Kaz', 'Soft'],
    ['NM', 25, 'None', 'Ukr', 'Soft'],
    ['NM', 40, 'Auto', 'Rus', 'Hard'],
    ['M', 29, 'Health', 'Ukr', 'Soft'],
]
# Soft: 11/18

# Column holding the numeric value to discretize (presumably age -- confirm).
AGE_INDEX = 1
# Values strictly below this limit are labelled 'y'; everything else is 'o'.
YOUNG_AGE_LIMIT = 30

# Replace the numeric column in place with a two-valued categorical label so
# that it can be treated like the other (categorical) attributes.
for record in data:
    if record[AGE_INDEX] < YOUNG_AGE_LIMIT:
        record[AGE_INDEX] = 'y'
    else:
        record[AGE_INDEX] = 'o'
def entropy(data, attr):
    """Return the Shannon entropy (in bits) of column ``attr`` over ``data``.

    Args:
        data: sequence of records; each record is indexable by ``attr``.
        attr: index of the column whose value distribution is measured.

    Returns:
        The entropy as a float: the sum over distinct values of
        ``-p * log2(p)``, where ``p`` is the value's relative frequency.
        Returns 0.0 for empty ``data``.
    """
    from collections import Counter
    from math import log

    # Float total keeps the division below a true division on Python 2 too.
    total = float(len(data))
    val_freq = Counter(record[attr] for record in data)

    data_entropy = 0.0
    for freq in val_freq.values():
        prob = freq / total
        data_entropy += -prob * log(prob, 2)
    return data_entropy
def gain(data, attr, target_attr):
    """Return the information gain from splitting ``data`` on ``attr``.

    The gain is the entropy of ``target_attr`` over all of ``data`` minus the
    weighted average entropy of ``target_attr`` within each subset of records
    sharing the same value of ``attr``.

    Args:
        data: sequence of records; each record is indexable by column index.
        attr: index of the column to split on.
        target_attr: index of the column whose entropy is being reduced.

    Returns:
        The information gain as a float.
    """
    # Count how many records carry each distinct value of the split column.
    val_freq = {}
    for record in data:
        val_freq[record[attr]] = val_freq.get(record[attr], 0) + 1

    # Float total keeps the weight below a true division on Python 2 too;
    # computed once instead of re-summing the counts for every value.
    total = float(len(data))

    subset_entropy = 0.0
    for val, freq in val_freq.items():
        data_subset = [record for record in data if record[attr] == val]
        subset_entropy += (freq / total) * entropy(data_subset, target_attr)

    return entropy(data, target_attr) - subset_entropy
# Information gain of the country column (index 3) with respect to the
# Soft/Hard label (index 4). Parenthesised single-argument form prints the
# same text on Python 2 and is valid on Python 3.
print(gain(data, 3, 4))
# Country gain 0.0945518506747