Levenshtein_distance

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# -*- coding: utf-8 -*-
def distanceLev(a, b, \
add_cost=10, \
del_cost=10, \
change_cost=5, \
change_modifier=1, \
normalise = 0):
"Calculates the Levenshtein distance between a and b."
change_modifier = 2
n, m = len(a), len(b)
if n > m:
# Make sure n <= m, to use O(min(n,m)) space
a, b = b, a
n, m = m, n
current_row = range(n+1) # Keep current and previous row, not entire matrix
for i in xrange(1, m+1):
previous_row = current_row
current_row = [i]+[0]*m
for j in xrange(1,n+1):
add = previous_row[j] + add_cost
delete = current_row[j-1] + del_cost
change = previous_row[j-1]
if a[j-1] != b[i-1]:
change += comparer(a[j-1],b[i-1])*change_modifier + change_cost
current_row[j] = min(add, delete, change)
#for pseudo normalised version
summ = ( abs(normalise-1) or len(a)*len(b) )
return 1.*current_row[n]/summ
def comparer(a,b,diff=1,mod=1):
"""Returns difference of 'a' and 'b'
if (diff=0) or 'a' and 'b' are not numeric just checks if they are equal
'mod=1' causes function to return abs() of result"""
try:
if type(a) == type(b) == type('str'):
res = ord(a) - ord(b)
else:
res = (a-b)
except:
res = (a==b)
if mod:
try: res = abs(res)
except:
print a,b,'are not numberic'
if not diff:
res = (a==b)
return (res)
s = '123456'
print s
print 'Changes\t\t\tErrorness'
print 'not changed\t\t', distanceLev(s,'123456')
print 'Not all symbols(-1)\t', distanceLev(s,'12345')
print 'Not all symbols(-2)\t', distanceLev(s,'1234')
print 'Not all symbols(-3)\t', distanceLev(s,'123')
print 'Not all symbols(-4)\t', distanceLev(s,'12')
print 'Not all symbols(-5)\t', distanceLev(s,'1')
print 'Removed all symbols\t', distanceLev(s,'')
print 'changed +1\t\t', distanceLev(s,'123556')
print 'changed +2\t\t', distanceLev(s,'123656')
print 'changed +1,+1\t\t', distanceLev(s,'124556')
print 'Some extra symbols\t', distanceLev(s,'12344556')
print 'More extra symbols\t', distanceLev(s,'123445556')