Skip to content

Commit 1e881f2

Browse files
committed
Add udf_DamLev and udf_FuzzySearchOf
1 parent 7380b07 commit 1e881f2

File tree

2 files changed

+200
-0
lines changed

2 files changed

+200
-0
lines changed
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
-- =============================================
2+
-- Computes and returns the Damerau-Levenshtein edit distance between two strings,
3+
-- i.e. the number of insertion, deletion, substitution, and transposition edits
4+
-- required to transform one string to the other. This value will be >= 0, where
5+
-- 0 indicates identical strings. Comparisons use the case-sensitivity configured
6+
-- in SQL Server (case-insensitive by default). This algorithm is basically the
7+
-- Levenshtein algorithm with a modification that considers transposition of two
8+
-- adjacent characters as a single edit.
9+
-- http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_19.html
10+
-- See http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
11+
-- Note that this uses Sten Hjelmqvist's "Fast, memory efficient" algorithm, described
12+
-- at http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm.
13+
-- This version differs by including some optimizations, and extending it to the Damerau-
14+
-- Levenshtein algorithm.
15+
-- Note that this is the simpler and faster optimal string alignment (aka restricted edit) distance
16+
-- that differs slightly from the full Damerau-Levenshtein algorithm by imposing the restriction
17+
-- that no substring is edited more than once. So for example, "CA" to "ABC" has an edit distance
18+
-- of 2 by a complete application of Damerau-Levenshtein, but a distance of 3 by this method that
19+
-- uses the optimal string alignment algorithm. See wikipedia article for more detail on this
20+
-- distinction.
21+
--
22+
-- @s - String being compared for distance.
23+
-- @t - String being compared against other string.
24+
-- @max - Maximum distance allowed, or NULL if no maximum is desired. Returns NULL if distance will exceed @max.
25+
-- returns int edit distance, >= 0 representing the number of edits required to transform one string to the other.
26+
-- =============================================
27+
28+
CREATE FUNCTION [dbo].[DamLev](
    @s nvarchar(4000)
  , @t nvarchar(4000)
  , @max int
)
RETURNS int
WITH SCHEMABINDING
AS
-- Optimal-string-alignment (restricted Damerau-Levenshtein) edit distance between @s and @t.
--
--   @s, @t - the two strings to compare; the order does not matter because the shorter one
--            is always moved into @s before processing.
--   @max   - largest distance of interest, or NULL for no limit (the length of the longer
--            string is then used as the limit).
--   returns  the edit distance (>= 0), or NULL when the distance exceeds @max
--            or when either string is NULL.
--
-- Implementation notes:
--   * Only one row of the conceptual distance matrix is kept. It is stored in @v0 as a string
--     whose characters' code points are the distances (written with NCHAR, read with UNICODE).
--     @v2 holds the values needed for the transposition cost in the same encoding.
--   * Character comparisons use "=" and therefore follow the collation in effect.
BEGIN
    DECLARE @distance int = 0 -- return variable
          , @v0 nvarchar(4000)-- running scratchpad for storing computed distances
          , @v2 nvarchar(4000)-- running scratchpad for storing previous column's computed distances
          , @start int = 1    -- index (1 based) of first non-matching character between the two string
          , @i int, @j int    -- loop counters: i for s string and j for t string
          , @diag int         -- distance in cell diagonally above and left if we were using an m by n matrix
          , @left int         -- distance in cell to the left if we were using an m by n matrix
          , @nextTransCost int-- transposition base cost for next iteration
          , @thisTransCost int-- transposition base cost (2 distant along diagonal) for current iteration
          , @sChar nchar      -- character at index i from s string
          , @tChar nchar      -- character at index j from t string
          , @thisJ int        -- temporary storage of @j to allow SELECT combining
          , @jOffset int      -- offset used to calculate starting value for j loop
          , @jEnd int         -- ending value for j loop (stopping point for processing a column)
          -- get input string lengths including any trailing spaces (which SQL Server would otherwise ignore).
          -- The divisor is the byte size of one character; appending '.' guarantees there is a
          -- character to measure even when the string is empty, so the division never hits zero.
          , @sLen int = DATALENGTH(@s) / DATALENGTH(LEFT(LEFT(@s, 1) + '.', 1)) -- length of smaller string
          , @tLen int = DATALENGTH(@t) / DATALENGTH(LEFT(LEFT(@t, 1) + '.', 1)) -- length of larger string
          , @lenDiff int      -- difference in length between the two strings

    -- A NULL string has no defined distance. Without this guard the NULL lengths make every
    -- loop and early exit below evaluate to UNKNOWN, and the function would fall through to
    -- the final RETURN with @distance still 0, i.e. report the strings as identical.
    IF @s IS NULL OR @t IS NULL RETURN NULL

    -- if strings of different lengths, ensure shorter string is in s. This can result in a little
    -- faster speed by spending more time spinning just the inner loop during the main processing.
    IF (@sLen > @tLen) BEGIN
        SELECT @v0 = @s, @i = @sLen -- temporarily use v0 for swap
        SELECT @s = @t, @sLen = @tLen
        SELECT @t = @v0, @tLen = @i
    END
    SELECT @max = ISNULL(@max, @tLen)
         , @lenDiff = @tLen - @sLen
    -- the length difference is a lower bound on the distance, so we can give up immediately
    IF @lenDiff > @max RETURN NULL

    -- suffix common to both strings can be ignored
    WHILE (@sLen > 0 AND SUBSTRING(@s, @sLen, 1) = SUBSTRING(@t, @tLen, 1))
        SELECT @sLen = @sLen - 1, @tLen = @tLen - 1

    -- shorter string fully consumed: what is left of the longer string is the distance
    IF (@sLen = 0) RETURN @tLen

    -- prefix common to both strings can be ignored
    WHILE (@start < @sLen AND SUBSTRING(@s, @start, 1) = SUBSTRING(@t, @start, 1))
        SELECT @start = @start + 1
    IF (@start > 1) BEGIN
        SELECT @sLen = @sLen - (@start - 1)
             , @tLen = @tLen - (@start - 1)

        -- if all of shorter string matches prefix and/or suffix of longer string, then
        -- edit distance is just the delete of additional characters present in longer string
        IF (@sLen <= 0) RETURN @tLen

        SELECT @s = SUBSTRING(@s, @start, @sLen)
             , @t = SUBSTRING(@t, @start, @tLen)
    END

    -- initialize v0 array of distances (values are capped at @max)
    SELECT @v0 = '', @j = 1
    WHILE (@j <= @tLen) BEGIN
        SELECT @v0 = @v0 + NCHAR(CASE WHEN @j > @max THEN @max ELSE @j END)
        SELECT @j = @j + 1
    END

    SELECT @v2 = @v0 -- copy...doesn't matter what's in v2, just need to initialize its size
         , @jOffset = @max - @lenDiff
         , @i = 1
    WHILE (@i <= @sLen) BEGIN
        SELECT @distance = @i
             , @diag = @i - 1
             , @sChar = SUBSTRING(@s, @i, 1)
             -- no need to look beyond window of upper left diagonal (@i) + @max cells
             -- and the lower right diagonal (@i - @lenDiff) - @max cells
             , @j = CASE WHEN @i <= @jOffset THEN 1 ELSE @i - @jOffset END
             , @jEnd = CASE WHEN @i + @max >= @tLen THEN @tLen ELSE @i + @max END
             , @thisTransCost = 0
        WHILE (@j <= @jEnd) BEGIN
            -- at this point, @distance holds the previous value (the cell above if we were using an m by n matrix)
            SELECT @nextTransCost = UNICODE(SUBSTRING(@v2, @j, 1))
                 , @v2 = STUFF(@v2, @j, 1, NCHAR(@diag))
                 , @tChar = SUBSTRING(@t, @j, 1)
                 , @left = UNICODE(SUBSTRING(@v0, @j, 1))
                 , @thisJ = @j
            -- cheapest of the three neighbouring cells
            SELECT @distance = CASE WHEN @diag < @left AND @diag < @distance THEN @diag -- substitution
                                    WHEN @left < @distance THEN @left                   -- insertion
                                    ELSE @distance                                      -- deletion
                               END
            SELECT @distance = CASE WHEN (@sChar = @tChar) THEN @diag -- no change (characters match)
                                    WHEN @i <> 1 AND @j <> 1
                                         AND @tChar = SUBSTRING(@s, @i - 1, 1)
                                         AND @thisTransCost < @distance
                                         AND @sChar = SUBSTRING(@t, @j - 1, 1)
                                        THEN 1 + @thisTransCost -- transposition
                                    ELSE 1 + @distance END
            SELECT @v0 = STUFF(@v0, @thisJ, 1, NCHAR(@distance))
                 , @diag = @left
                 , @thisTransCost = @nextTransCost
                 -- early exit: when the cell on the diagonal that ends in the final cell
                 -- (@thisJ = @i + @lenDiff) already exceeds @max, push @j past @jEnd + 1
                 -- so that the outer loop can recognise the abort
                 , @j = CASE WHEN (@distance > @max) AND (@thisJ = @i + @lenDiff) THEN @jEnd + 2 ELSE @thisJ + 1 END
        END
        -- @j > @jEnd + 1 only happens on the early exit above; in that case stop the outer loop too
        SELECT @i = CASE WHEN @j > @jEnd + 1 THEN @sLen + 1 ELSE @i + 1 END
    END
    RETURN CASE WHEN @distance <= @max THEN @distance ELSE NULL END
END
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
-- Drop any existing version so the CREATE below can run repeatedly.
IF OBJECT_ID (N'dbo.FuzzySearchOf') IS NOT NULL
   DROP FUNCTION dbo.FuzzySearchOf
GO
CREATE FUNCTION dbo.FuzzySearchOf(@Searchterm VARCHAR(40))
/**
summary:   >
 Returns candidate words for the search term, even if the term is misspelt.
 The lookup is staged; each stage runs only when the previous one found nothing:
   1. exact match on words.word (returning its canonical version when one is recorded)
   2. dbo.OneEditDifferenceTo(@Searchterm)
   3. words sharing the search term's metaphone, at most five, closest edit distance first
   4. at most five canonical words with the same first character and an edit distance
      of two or less
Author: Phil Factor
Revision: 1.0
date: 16/02/2017
example:
     Select * from dbo.FuzzySearchOf('sossyjez')
     Select * from dbo.FuzzySearchOf('acheeve')
     Select * from dbo.FuzzySearchOf('deevyate')
returns:   >
 a table containing words
Dependency:
   Words: A table of common words
   DamLev http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_19.html
   dbo.OneEditDifferenceTo(@word)
   dbo.metaphone(@searchterm)
**/
RETURNS @candidates TABLE(Candidate VARCHAR(40))
AS
BEGIN
  -- LIKE pattern matching words that start with the search term's first character.
  -- The character is bracket-escaped so that a leading '%', '_' or '[' is matched
  -- literally instead of acting as a wildcard ('[' is handled first because the
  -- other two replacements introduce brackets of their own).
  DECLARE @FirstLetterPattern VARCHAR(4)
    = REPLACE(REPLACE(REPLACE(LEFT(@Searchterm, 1), '[', '[[]'), '%', '[%]'), '_', '[_]') + '%';

  /* The first stage is to see if a word is an alias or a known misspelling.*/
  INSERT INTO @candidates (Candidate)
    SELECT COALESCE(words.CanonicalVersion, words.word)
      FROM dbo.words
      WHERE words.word = @Searchterm;
  -- If not a known word or an alias, then has it an edit-distance of one to any
  -- canonical words in the 'Words' table?
  IF @@ROWCOUNT = 0
    BEGIN
      INSERT INTO @candidates (Candidate)
        SELECT OneEditDifferenceTo.Candidate
          FROM dbo.OneEditDifferenceTo(@Searchterm);
      IF @@ROWCOUNT = 0
        BEGIN
          -- If not, does it share a metaphone with any words in the table? Keep at most
          -- five, in ascending edit difference. DamLev is capped at three edits; anything
          -- further away (NULL) sorts last as 4.
          INSERT INTO @candidates (Candidate)
            SELECT TOP 5 COALESCE(words.CanonicalVersion, words.word) AS candidate
              FROM dbo.words
              WHERE words.Metaphone = dbo.Metaphone(@Searchterm)
              ORDER BY COALESCE(dbo.DamLev(words.word, @Searchterm, 3), 4);
          IF @@ROWCOUNT = 0
            BEGIN
              /* Get a limited number of words with an edit distance of two, using Steve Hatchett's
              version of the Damerau-Levenshtein Algorithm, specifying that it abandons its work
              on a particular word once it realises that it is more than two edit distances away*/
              -- NOTE(review): TOP without ORDER BY returns an arbitrary five of the qualifying
              -- words; left as is because ordering would force DamLev to run on every candidate.
              INSERT INTO @candidates (Candidate)
                SELECT TOP 5 words.word
                  FROM dbo.words
                  WHERE words.CanonicalVersion IS NULL
                    AND words.word LIKE @FirstLetterPattern
                    AND COALESCE(dbo.DamLev(words.word, @Searchterm, 2), 3) < 3;
            END;
        END;
    END;
  RETURN;
END;
GO

0 commit comments

Comments
 (0)