Levenshtein distance

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# -*- coding: utf-8 -*-
def distanceLev(a, b, \
add_cost=10, \
del_cost=10, \
change_cost=5, \
change_modifier=3, \
k_error = 1): #error koeff
"Calculates the Levenshtein distance between a and b."
n, m = len(a), len(b)
if n > m:
# Make sure n <= m, to use O(min(n,m)) space
a, b = b, a
n, m = m, n
maximum = m * max(add_cost,del_cost,change_cost) * k_error
try:
koeff = 1.*max(m,n) / (min(m,n)) / (m+n)
except:
koeff = 1
full = 0
if m==0 and n==0:
res = 0;
elif m == 0:
res = n * add_cost
elif n == 0:
res = m * del_cost
elif a in b:
res = (m-n) * del_cost
elif b in a:
res = (n-m) * add_cost
else:
current_row = range(n+1) # Keep current and previous row, not entire matrix
koeff = 1.*max(m,n) / (min(m,n)) / (m+n)
for i in xrange(1, m+1):
if full: break
previous_row = current_row
current_row = [i]+[0]*m
for j in xrange(1,n+1):
add = previous_row[j] + add_cost
delete = current_row[j-1] + del_cost
change = previous_row[j-1]
if a[j-1] != b[i-1]:
change += comparer(a[j-1],b[i-1])*change_modifier + change_cost
current_row[j] = min(add, delete, change)
if (koeff*current_row[i] >= maximum) and (maximum != None):
full = 1
current_row[n] = current_row[i]
break
res = current_row[n]
return koeff * res * 100 / maximum / k_error
def comparer(a,b,diff=1,mod=1):
"""Returns difference of 'a' and 'b'
if (diff=0) or 'a' and 'b' are not numeric just checks if they are equal
'mod=1' causes function to return abs() of result"""
try:
if type(a) == type(b) == type('str'):
res = ord(a) - ord(b)
else:
res = (a-b)
except:
res = (a==b)
if mod:
try: res = abs(res)
except:
print a,b,'are not numberic'
if not diff:
res = (a==b)
return (res)
s = '123456'
print s
print 'Changes\t\t\t Errorness'
print 'not changed\t\t %.2f' % distanceLev(s,'123456')
for i in xrange(len(s)):
print 'Not all symbols(-' +str(i+1) + ')\t %.2f' % distanceLev(s,s[0:-i-1])
print 'changed +1\t\t %.2f' % distanceLev(s,'123556')
print 'changed +2\t\t %.2f' % distanceLev(s,'123656')
print 'changed +3\t\t %.2f' % distanceLev(s,'123756')
print 'changed +1,+1\t\t %.2f' % distanceLev(s,'124556')
print 'Some extra symbols\t %.2f' % distanceLev(s,'12344556')
print 'More extra symbols\t %.2f' % distanceLev(s,'123445556')
print 'doubles\t\t\t %.2f' % distanceLev(s,'112233445566')