# -*- coding: utf-8 -*- def distanceLev(a, b, \ add_cost=1, \ del_cost=0.5, \ change_cost=0.5, \ change_modif=1.0, \ partial_koeff=0.7,\ extra_koeff=0.5, \ k_error = 1): #error triger koeff """Calculates the Levenshtein distance between a and b. Stops working if current error is bigger then add_cost*len(a)*k_error costs must be lesser then 1""" if add_cost>1 or del_cost>1 or change_cost>1: raise ValueError, \ 'add_cost, del_cost, change_cost+change_miduf must be lesser then 0' n, m = len(a), len(b) maximum = n * add_cost * k_error koeff = 100./(n*add_cost) full = 0 if n == 0: res = m * del_cost elif m == 0: res = n * add_cost elif a in b: res = (m-n) * del_cost * extra_koeff elif b in a: res = (n-m) * add_cost * partial_koeff if n > m: # Make sure n <= m, to use O(min(n,m)) space a, b = b, a n, m = m, n add_cost, del_cost = del_cost, add_cost if m==0 and n==0: res = 0; elif m == 0 or n == 0 or (a in b) or (b in a): pass else: current_row = range(n+1) # Keep current and previous row, not entire matrix for i in xrange(1, m+1): if full: break previous_row = current_row current_row = [i]+[0]*m for j in xrange(1,n+1): add = previous_row[j] + add_cost delete = current_row[j-1] + del_cost change = previous_row[j-1] if a[j-1] != b[i-1]: x,y = a[j-1],b[i-1] change += comparer(x,y,normer)*change_modif + change_cost current_row[j] = min(add, delete, change) if (current_row[i] >= maximum): full = 1 current_row[n] = current_row[i] break res = current_row[n] return koeff * res def comparer(a,b,diff=1,mod=1,relative=1,normer=1): """Returns difference of 'a' and 'b' if (diff=0) or 'a' and 'b' are not numeric just checks if they are equal 'mod=1' causes function to return abs() of result""" normer += 1 try: if type(a) == type(b) == type('str'): a,b = ord(a),ord(b) res = (a-b) k = a except: res = (a==b) k = 1 if mod: try: res = abs(res) except: print a,b,'are not numberic' if not diff: res = (a==b) if relative: res = 1.*res/k return (res) s = '123456' print s print 'Changes\t\t\t Errorness' print 'not changed\t\t %.2f' % distanceLev(s,'123456') for i in xrange(len(s)): print 'Not all symbols(-' +str(i+1) + ')\t %.2f' % distanceLev(s,s[0:-i-1]) print 'changed +2\t\t %.2f' % distanceLev(s,'123656') print 'changed +4\t\t %.2f' % distanceLev(s,'123856') print 'changed +5\t\t %.2f' % distanceLev(s,'123956') print 'changed +1,+1\t\t %.2f' % distanceLev(s,'124556') print '+2 symbols inside\t %.2f' % distanceLev(s,'12344556') print '+3 symbols inside\t %.2f' % distanceLev(s,'123445556') print '+2 symbols outside\t %.2f' % distanceLev(s,'123456dj') print '+3 symbols outside\t %.2f' % distanceLev(s,'123456djb') print 'doubles\t\t\t %.2f' % distanceLev(s,'112233445566') print 'triples\t\t\t %.2f' % distanceLev(s,'111222333444555666')