latestalexey
diff --git a/‎User_Defined_Function/udf_DamLev.sql‎
Lines changed: 133 additions & 0 deletions b/‎User_Defined_Function/udf_DamLev.sql‎
Lines changed: 133 additions & 0 deletions
diff --git a/‎User_Defined_Function/udf_FuzzySearchOf.sql‎
Lines changed: 67 additions & 0 deletions b/‎User_Defined_Function/udf_FuzzySearchOf.sql‎
Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,133 @@
+-- =============================================
+-- Computes and returns the Damerau-Levenshtein edit distance between two strings, 
+-- i.e. the number of insertion, deletion, substitution, and transposition edits
+-- required to transform one string to the other. This value will be >= 0, where
+-- 0 indicates identical strings. Comparisons use the case-sensitivity configured
+-- in SQL Server (case-insensitive by default). This algorithm is basically the
+-- Levenshtein algorithm with a modification that considers transposition of two
+-- adjacent characters as a single edit.
+-- http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_19.html
+-- See http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
+-- Note that this uses Sten Hjelmqvist's "Fast, memory efficient" algorithm, described
+-- at http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm.
+-- This version differs by including some optimizations, and extending it to the Damerau-
+-- Levenshtein algorithm.
+-- Note that this is the simpler and faster optimal string alignment (aka restricted edit) distance
+-- that differs slightly from the full Damerau-Levenshtein algorithm by imposing the restriction
+-- that no substring is edited more than once. So for example, "CA" to "ABC" has an edit distance
+-- of 2 by a complete application of Damerau-Levenshtein, but a distance of 3 by this method that
+-- uses the optimal string alignment algorithm. See wikipedia article for more detail on this
+-- distinction.
+-- 
+-- @s - String being compared for distance.
+-- @t - String being compared against other string.
+-- @max - Maximum distance allowed, or NULL if no maximum is desired. Returns NULL if distance will exceed @max.
+-- returns int edit distance, >= 0 representing the number of edits required to transform one string to the other.
+-- =============================================
+ 
+CREATE FUNCTION [dbo].[DamLev](
+    @s nvarchar(4000)
+  , @t nvarchar(4000)
+  , @max int
+)
+RETURNS int
+WITH SCHEMABINDING
+AS
+BEGIN
+    -- Returns the optimal-string-alignment (restricted Damerau-Levenshtein) edit distance
+    -- between @s and @t.
+    --   @s, @t : strings to compare; the result is NULL if either one is NULL.
+    --   @max   : largest distance of interest, or NULL for no limit.
+    --   returns: the distance (>= 0), or NULL when it exceeds @max.
+    -- The working rows of the distance matrix are kept in nvarchar scratch strings
+    -- (@v0, @v2): each cell's distance is stored as the character whose code point is
+    -- that distance (written with NCHAR(), read back with UNICODE()).
+    DECLARE @distance int = 0 -- return variable
+          , @v0 nvarchar(4000) -- running scratchpad for storing computed distances
+          , @v2 nvarchar(4000) -- running scratchpad for storing previous column's computed distances
+          , @start int = 1 -- index (1 based) of first non-matching character between the two strings
+          , @i int, @j int -- loop counters: i for s string and j for t string
+          , @diag int -- distance in cell diagonally above and left if we were using an m by n matrix
+          , @left int -- distance in cell to the left if we were using an m by n matrix
+          , @nextTransCost int -- transposition base cost for next iteration
+          , @thisTransCost int -- transposition base cost (2 distant along diagonal) for current iteration
+          , @sChar nchar -- character at index i from s string
+          , @tChar nchar -- character at index j from t string
+          , @thisJ int -- temporary storage of @j to allow SELECT combining
+          , @jOffset int -- offset used to calculate starting value for j loop
+          , @jEnd int -- ending value for j loop (stopping point for processing a column)
+          -- get input string lengths including any trailing spaces (which SQL Server would otherwise ignore):
+          -- byte length of the string divided by the byte length of a single character of the same type
+          , @sLen int = datalength(@s) / datalength(left(left(@s, 1) + '.', 1)) -- length of @s (the shorter string once swapped below)
+          , @tLen int = datalength(@t) / datalength(left(left(@t, 1) + '.', 1)) -- length of @t (the longer string once swapped below)
+          , @lenDiff int -- difference in length between the two strings
+
+    -- A NULL string has no defined edit distance. Without this guard the NULL lengths make
+    -- every comparison below evaluate to UNKNOWN and the function falls through to a
+    -- meaningless result (for example 0, i.e. "identical", for a NULL @s).
+    IF (@s IS NULL OR @t IS NULL) RETURN NULL
+
+    -- if strings of different lengths, ensure shorter string is in s. This can result in a little
+    -- faster speed by spending more time spinning just the inner loop during the main processing.
+    IF (@sLen > @tLen) BEGIN
+        SELECT @v0 = @s, @i = @sLen -- temporarily use v0 for swap
+        SELECT @s = @t, @sLen = @tLen
+        SELECT @t = @v0, @tLen = @i
+    END
+    SELECT @max = ISNULL(@max, @tLen) -- no limit requested: the longer length is an upper bound on the distance
+         , @lenDiff = @tLen - @sLen
+    -- at least @lenDiff edits are always needed, so the limit is already exceeded
+    IF @lenDiff > @max RETURN NULL
+
+    -- suffix common to both strings can be ignored
+    WHILE (@sLen > 0 AND SUBSTRING(@s, @sLen, 1) = SUBSTRING(@t, @tLen, 1))
+        SELECT @sLen = @sLen - 1, @tLen = @tLen - 1
+
+    -- all of the shorter string was a suffix of the longer one: what is left of t is the distance
+    IF (@sLen = 0) RETURN @tLen
+
+    -- prefix common to both strings can be ignored
+    WHILE (@start < @sLen AND SUBSTRING(@s, @start, 1) = SUBSTRING(@t, @start, 1))
+        SELECT @start = @start + 1
+    IF (@start > 1) BEGIN
+        SELECT @sLen = @sLen - (@start - 1)
+             , @tLen = @tLen - (@start - 1)
+
+        -- if all of shorter string matches prefix and/or suffix of longer string, then
+        -- edit distance is just the delete of additional characters present in longer string
+        IF (@sLen <= 0) RETURN @tLen
+
+        SELECT @s = SUBSTRING(@s, @start, @sLen)
+             , @t = SUBSTRING(@t, @start, @tLen)
+    END
+
+    -- initialize v0 array of distances (values are capped at @max)
+    SELECT @v0 = '', @j = 1
+    WHILE (@j <= @tLen) BEGIN
+        SELECT @v0 = @v0 + NCHAR(CASE WHEN @j > @max THEN @max ELSE @j END)
+        SELECT @j = @j + 1
+    END
+
+    SELECT @v2 = @v0 -- copy...doesn't matter what's in v2, just need to initialize its size
+         , @jOffset = @max - @lenDiff
+         , @i = 1
+    WHILE (@i <= @sLen) BEGIN
+        SELECT @distance = @i
+             , @diag = @i - 1
+             , @sChar = SUBSTRING(@s, @i, 1)
+             -- no need to look beyond window of upper left diagonal (@i) + @max cells
+             -- and the lower right diagonal (@i - @lenDiff) - @max cells
+             , @j = CASE WHEN @i <= @jOffset THEN 1 ELSE @i - @jOffset END
+             , @jEnd = CASE WHEN @i + @max >= @tLen THEN @tLen ELSE @i + @max END
+             , @thisTransCost = 0
+        WHILE (@j <= @jEnd) BEGIN
+            -- at this point, @distance holds the previous value (the cell above if we were using an m by n matrix)
+            SELECT @nextTransCost = UNICODE(SUBSTRING(@v2, @j, 1))
+                 , @v2 = STUFF(@v2, @j, 1, NCHAR(@diag))
+                 , @tChar = SUBSTRING(@t, @j, 1)
+                 , @left = UNICODE(SUBSTRING(@v0, @j, 1))
+                 , @thisJ = @j
+            -- cheapest of the three neighbouring cells
+            SELECT @distance = CASE WHEN @diag < @left AND @diag < @distance THEN @diag -- substitution
+                                    WHEN @left < @distance THEN @left -- insertion
+                                    ELSE @distance -- deletion
+                               END
+            SELECT @distance = CASE WHEN (@sChar = @tChar) THEN @diag -- no change (characters match)
+                                    WHEN @i <> 1 AND @j <> 1
+                                         AND @tChar = SUBSTRING(@s, @i - 1, 1)
+                                         AND @thisTransCost < @distance
+                                         AND @sChar = SUBSTRING(@t, @j - 1, 1)
+                                    THEN 1 + @thisTransCost -- transposition
+                                    ELSE 1 + @distance END
+            SELECT @v0 = STUFF(@v0, @thisJ, 1, NCHAR(@distance))
+                 , @diag = @left
+                 , @thisTransCost = @nextTransCost
+                 -- early exit: once the cell on the diagonal that ends at the final cell exceeds @max,
+                 -- push @j past @jEnd + 1 as a signal for the outer loop to stop as well
+                 , @j = CASE WHEN (@distance > @max) AND (@thisJ = @i + @lenDiff) THEN @jEnd + 2 ELSE @thisJ + 1 END
+        END
+        -- @j > @jEnd + 1 only happens on the early-exit signal above; jump past the last row in that case
+        SELECT @i = CASE WHEN @j > @jEnd + 1 THEN @sLen + 1 ELSE @i + 1 END
+    END
+    RETURN CASE WHEN @distance <= @max THEN @distance ELSE NULL END
+END
@@ -0,0 +1,67 @@
+IF OBJECT_ID (N'dbo.FuzzySearchOf') IS NOT NULL
+    DROP FUNCTION dbo.FuzzySearchOf
+GO
+CREATE FUNCTION dbo.FuzzySearchOf(@Searchterm VARCHAR(40))
+/**
+summary: >
+  Returns candidate words for a search term, even if the term is misspelt.
+  The stages below are tried in order; the search stops at the first stage
+  that produces any rows:
+    1. rows of dbo.words whose word equals the term (CanonicalVersion is
+       returned in preference to the word itself when it is not NULL)
+    2. whatever dbo.OneEditDifferenceTo(@Searchterm) returns (defined
+       elsewhere; presumably words one edit away from the term)
+    3. rows of dbo.words whose Metaphone equals dbo.Metaphone(@Searchterm);
+       when there are more than five, only the five with the smallest
+       DamLev distance to the term are kept
+    4. up to five words with no CanonicalVersion that start with the same
+       character as the term and are within a DamLev distance of two
+Author: Phil Factor
+Revision: 1.0
+date: 16/02/2017
+example:
+  Select * from dbo.FuzzySearchOf('sossyjez')
+  Select * from dbo.FuzzySearchOf('acheeve')
+  Select * from dbo.FuzzySearchOf('deevyate')
+returns: >
+  a table containing words
+Dependency:
+  Words: A table of common words
+  DamLev http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_19.html
+  dbo.OneEditDifferenceTo(@word)
+  dbo.metaphone(@searchterm)
+**/
+RETURNS @candidates TABLE(Candidate VARCHAR(40))
+AS
+-- body of the function
+BEGIN
+    DECLARE @Rowcount INT;
+    -- LIKE pattern "starts with the first character of the search term", used by the last
+    -- stage. The character comes from caller input, so a LIKE metacharacter (% _ [) is
+    -- wrapped in brackets to make it match literally instead of acting as a wildcard.
+    DECLARE @FirstChar VARCHAR(1) = LEFT(@Searchterm, 1);
+    DECLARE @FirstCharPattern VARCHAR(4) =
+        CASE WHEN @FirstChar IN ('%', '_', '[')
+             THEN '[' + @FirstChar + ']'
+             ELSE @FirstChar
+        END + '%';
+
+    /* The first stage is to see if a word is an alias or a known misspelling.*/
+    INSERT INTO @candidates (Candidate)
+    SELECT COALESCE(words.CanonicalVersion, words.word)
+    FROM dbo.words
+    WHERE words.word = @Searchterm;
+    -- If not a known word or an alias, then has it an edit-distance of one to any canonical words
+    -- in the 'Words' table
+    -- (@@RowCount must be read immediately after the INSERT it refers to)
+    IF @@RowCount = 0
+    BEGIN
+        INSERT INTO @candidates (Candidate)
+        SELECT OneEditDifferenceTo.Candidate
+        FROM dbo.OneEditDifferenceTo(@Searchterm);
+        IF @@RowCount = 0
+        BEGIN --If not then does it share a metaphone with any words in your table?
+            INSERT INTO @candidates (Candidate)
+            SELECT COALESCE(words.CanonicalVersion, words.word) AS candidate
+            FROM dbo.words
+            WHERE words.Metaphone = dbo.Metaphone(@Searchterm);
+            -- keep the count: it is tested twice below and @@RowCount is reset by each statement
+            SELECT @Rowcount = @@RowCount;
+            IF @Rowcount > 5 --If yes, and there are too many, then get what there are and
+            BEGIN --take the top few in ascending edit difference.
+                DELETE FROM @candidates;
+                INSERT INTO @candidates (Candidate)
+                SELECT TOP 5 COALESCE(words.CanonicalVersion, words.word) AS candidate
+                FROM dbo.words
+                WHERE words.Metaphone = dbo.Metaphone(@Searchterm)
+                -- DamLev returns NULL beyond three edits; sort those after every real distance
+                ORDER BY COALESCE(dbo.DamLev(words.word, @Searchterm, 3), 4); --just do three levels
+            END;
+            IF @Rowcount = 0
+            BEGIN
+                /* Get a limited number of words with an edit distance of two, using Steve Hatchett's
+                   version of the Damerau-Levenshtein Algorithm, specifying that it abandons its work
+                   on a particular word once it realises that it is more than two edit distances away*/
+                INSERT INTO @candidates (Candidate)
+                SELECT TOP 5 words.word
+                FROM dbo.words
+                WHERE words.CanonicalVersion IS NULL
+                  AND words.word LIKE @FirstCharPattern
+                  AND COALESCE(dbo.DamLev(words.word, @Searchterm, 2), 3) < 3;
+            END;
+        END;
+    END;
+    RETURN;
+END;
+GO