-- =============================================
-- Computes and returns the Damerau-Levenshtein edit distance between two strings,
-- i.e. the number of insertion, deletion, substitution, and transposition edits
-- required to transform one string to the other. This value will be >= 0, where
-- 0 indicates identical strings. Comparisons use the case-sensitivity configured
-- in SQL Server (case-insensitive by default). This algorithm is basically the
-- Levenshtein algorithm with a modification that considers transposition of two
-- adjacent characters as a single edit.
-- http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_19.html
-- See http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
-- Note that this uses Sten Hjelmqvist's "Fast, memory efficient" algorithm, described
-- at http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm.
-- This version differs by including some optimizations, and extending it to the Damerau-
-- Levenshtein algorithm.
-- Note that this is the simpler and faster optimal string alignment (aka restricted edit) distance
-- that differs slightly from the full Damerau-Levenshtein algorithm by imposing the restriction
-- that no substring is edited more than once. So for example, "CA" to "ABC" has an edit distance
-- of 2 by a complete application of Damerau-Levenshtein, but a distance of 3 by this method that
-- uses the optimal string alignment algorithm. See wikipedia article for more detail on this
-- distinction.
--
-- @s   - String being compared for distance.
-- @t   - String being compared against other string.
-- @max - Maximum distance allowed, or NULL if no maximum is desired. Returns NULL if distance will exceed @max.
-- returns int edit distance, >= 0 representing the number of edits required to transform one string
--         to the other. Returns NULL if @s or @t is NULL, or if the distance exceeds @max.
-- =============================================

CREATE FUNCTION [dbo].[DamLev](
    @s nvarchar(4000)
  , @t nvarchar(4000)
  , @max int
)
RETURNS int
WITH SCHEMABINDING
AS
BEGIN
    DECLARE @distance int = 0       -- return variable
          , @v0 nvarchar(4000)      -- running scratchpad for storing computed distances
          , @v2 nvarchar(4000)      -- running scratchpad for storing previous column's computed distances
          , @start int = 1          -- index (1 based) of first non-matching character between the two strings
          , @i int, @j int          -- loop counters: i for s string and j for t string
          , @diag int               -- distance in cell diagonally above and left if we were using an m by n matrix
          , @left int               -- distance in cell to the left if we were using an m by n matrix
          , @nextTransCost int      -- transposition base cost for next iteration
          , @thisTransCost int      -- transposition base cost (2 distant along diagonal) for current iteration
          , @sChar nchar            -- character at index i from s string
          , @tChar nchar            -- character at index j from t string
          , @thisJ int              -- temporary storage of @j to allow SELECT combining
          , @jOffset int            -- offset used to calculate starting value for j loop
          , @jEnd int               -- ending value for j loop (stopping point for processing a column)
          -- get input string lengths including any trailing spaces (which SQL Server would otherwise ignore).
          -- The divisor is the byte width of a single character; N'.' guarantees a one-character probe
          -- even when the input string is empty.
          , @sLen int = DATALENGTH(@s) / DATALENGTH(LEFT(LEFT(@s, 1) + N'.', 1))    -- length of smaller string
          , @tLen int = DATALENGTH(@t) / DATALENGTH(LEFT(LEFT(@t, 1) + N'.', 1))    -- length of larger string
          , @lenDiff int            -- difference in length between the two strings

    -- A NULL string has no defined distance. Without this guard the NULL lengths make every
    -- loop below a no-op and the function would report a misleading value (e.g. 0).
    IF (@s IS NULL OR @t IS NULL) RETURN NULL

    -- if strings of different lengths, ensure shorter string is in s. This can result in a little
    -- faster speed by spending more time spinning just the inner loop during the main processing.
    IF (@sLen > @tLen) BEGIN
        SELECT @v0 = @s, @i = @sLen     -- temporarily use v0 for swap
        SELECT @s = @t, @sLen = @tLen
        SELECT @t = @v0, @tLen = @i
    END

    -- with no caller-supplied maximum, the longer length is an upper bound on the distance
    SELECT @max = ISNULL(@max, @tLen)
         , @lenDiff = @tLen - @sLen
    -- at least @lenDiff edits are needed, so the limit is already exceeded
    IF @lenDiff > @max RETURN NULL

    -- suffix common to both strings can be ignored
    WHILE (@sLen > 0 AND SUBSTRING(@s, @sLen, 1) = SUBSTRING(@t, @tLen, 1))
        SELECT @sLen = @sLen - 1, @tLen = @tLen - 1

    IF (@sLen = 0) RETURN @tLen

    -- prefix common to both strings can be ignored
    WHILE (@start < @sLen AND SUBSTRING(@s, @start, 1) = SUBSTRING(@t, @start, 1))
        SELECT @start = @start + 1
    IF (@start > 1) BEGIN
        SELECT @sLen = @sLen - (@start - 1)
             , @tLen = @tLen - (@start - 1)

        -- if all of shorter string matches prefix and/or suffix of longer string, then
        -- edit distance is just the delete of additional characters present in longer string
        IF (@sLen <= 0) RETURN @tLen

        SELECT @s = SUBSTRING(@s, @start, @sLen)
             , @t = SUBSTRING(@t, @start, @tLen)
    END

    -- initialize v0 array of distances. Each distance is stored as one character (NCHAR(n) encodes n,
    -- UNICODE() decodes it), so position j of the string is cell j of the array. The seed MUST be the
    -- empty string: any leading character would shift every cell by one position.
    SELECT @v0 = N'', @j = 1
    WHILE (@j <= @tLen) BEGIN
        SELECT @v0 = @v0 + NCHAR(CASE WHEN @j > @max THEN @max ELSE @j END)
        SELECT @j = @j + 1
    END

    SELECT @v2 = @v0    -- copy...doesn't matter what's in v2, just need to initialize its size
         , @jOffset = @max - @lenDiff
         , @i = 1
    WHILE (@i <= @sLen) BEGIN
        SELECT @distance = @i
             , @diag = @i - 1
             , @sChar = SUBSTRING(@s, @i, 1)
             -- no need to look beyond window of upper left diagonal (@i) + @max cells
             -- and the lower right diagonal (@i - @lenDiff) - @max cells
             , @j = CASE WHEN @i <= @jOffset THEN 1 ELSE @i - @jOffset END
             , @jEnd = CASE WHEN @i + @max >= @tLen THEN @tLen ELSE @i + @max END
             , @thisTransCost = 0
        WHILE (@j <= @jEnd) BEGIN
            -- at this point, @distance holds the previous value (the cell above if we were using an m by n matrix)
            SELECT @nextTransCost = UNICODE(SUBSTRING(@v2, @j, 1))
                 , @v2 = STUFF(@v2, @j, 1, NCHAR(@diag))
                 , @tChar = SUBSTRING(@t, @j, 1)
                 , @left = UNICODE(SUBSTRING(@v0, @j, 1))
                 , @thisJ = @j
            -- cheapest of the three neighbouring cells
            SELECT @distance = CASE WHEN @diag < @left AND @diag < @distance THEN @diag     -- substitution
                                    WHEN @left < @distance THEN @left                       -- insertion
                                    ELSE @distance                                          -- deletion
                               END
            SELECT @distance = CASE WHEN (@sChar = @tChar) THEN @diag   -- no change (characters match)
                                    WHEN @i <> 1 AND @j <> 1
                                         AND @tChar = SUBSTRING(@s, @i - 1, 1)
                                         AND @thisTransCost < @distance
                                         AND @sChar = SUBSTRING(@t, @j - 1, 1)
                                        THEN 1 + @thisTransCost         -- transposition
                                    ELSE 1 + @distance END
            -- store the cell, shift the running values, and advance j. If the cell at j = i + lenDiff
            -- already exceeds @max, j is pushed to @jEnd + 2 as a signal to abandon the computation.
            SELECT @v0 = STUFF(@v0, @thisJ, 1, NCHAR(@distance))
                 , @diag = @left
                 , @thisTransCost = @nextTransCost
                 , @j = CASE WHEN (@distance > @max) AND (@thisJ = @i + @lenDiff) THEN @jEnd + 2 ELSE @thisJ + 1 END
        END
        -- j beyond @jEnd + 1 is the abandon signal set above: force the outer loop to end
        SELECT @i = CASE WHEN @j > @jEnd + 1 THEN @sLen + 1 ELSE @i + 1 END
    END
    RETURN CASE WHEN @distance <= @max THEN @distance ELSE NULL END
END
0 commit comments