# -*- coding: utf-8 -*-
def distanceLev(a, b, \
add_cost=1, \
del_cost=0.5, \
change_cost=0.5, \
change_modif=1.0, \
partial_koeff=0.7,\
extra_koeff=0.5, \
k_error = 1): #error triger koeff
"""Calculates the Levenshtein distance between a and b.
Stops working if current error is bigger then add_cost*len(a)*k_error
costs must be lesser then 1"""
if add_cost>1 or del_cost>1 or change_cost>1:
raise ValueError, \
'add_cost, del_cost, change_cost+change_miduf must be lesser then 0'
n, m = len(a), len(b)
maximum = n * add_cost * k_error
koeff = 100./(n*add_cost)
full = 0
if n == 0:
res = m * del_cost
elif m == 0:
res = n * add_cost
elif a in b:
res = (m-n) * del_cost * extra_koeff
elif b in a:
res = (n-m) * add_cost * partial_koeff
if n > m:
# Make sure n <= m, to use O(min(n,m)) space
a, b = b, a
n, m = m, n
add_cost, del_cost = del_cost, add_cost
if m==0 and n==0:
res = 0;
elif m == 0 or n == 0 or (a in b) or (b in a):
pass
else:
current_row = range(n+1) # Keep current and previous row, not entire matrix
for i in xrange(1, m+1):
if full: break
previous_row = current_row
current_row = [i]+[0]*m
for j in xrange(1,n+1):
add = previous_row[j] + add_cost
delete = current_row[j-1] + del_cost
change = previous_row[j-1]
if a[j-1] != b[i-1]:
x,y = a[j-1],b[i-1]
change += comparer(x,y,normer)*change_modif + change_cost
current_row[j] = min(add, delete, change)
if (current_row[i] >= maximum):
full = 1
current_row[n] = current_row[i]
break
res = current_row[n]
return koeff * res
def comparer(a,b,diff=1,mod=1,relative=1,normer=1):
"""Returns difference of 'a' and 'b'
if (diff=0) or 'a' and 'b' are not numeric just checks if they are equal
'mod=1' causes function to return abs() of result"""
normer += 1
try:
if type(a) == type(b) == type('str'):
a,b = ord(a),ord(b)
res = (a-b)
k = a
except:
res = (a==b)
k = 1
if mod:
try: res = abs(res)
except: print a,b,'are not numberic'
if not diff:
res = (a==b)
if relative:
res = 1.*res/k
return (res)
s = '123456'
print s
print 'Changes\t\t\t Errorness'
print 'not changed\t\t %.2f' % distanceLev(s,'123456')
for i in xrange(len(s)):
print 'Not all symbols(-' +str(i+1) + ')\t %.2f' % distanceLev(s,s[0:-i-1])
print 'changed +2\t\t %.2f' % distanceLev(s,'123656')
print 'changed +4\t\t %.2f' % distanceLev(s,'123856')
print 'changed +5\t\t %.2f' % distanceLev(s,'123956')
print 'changed +1,+1\t\t %.2f' % distanceLev(s,'124556')
print '+2 symbols inside\t %.2f' % distanceLev(s,'12344556')
print '+3 symbols inside\t %.2f' % distanceLev(s,'123445556')
print '+2 symbols outside\t %.2f' % distanceLev(s,'123456dj')
print '+3 symbols outside\t %.2f' % distanceLev(s,'123456djb')
print 'doubles\t\t\t %.2f' % distanceLev(s,'112233445566')
print 'triples\t\t\t %.2f' % distanceLev(s,'111222333444555666')