Last active
November 16, 2022 17:39
-
-
Save AScriver/fcee1aa1aad2d8c2ae681529032af79c to your computer and use it in GitHub Desktop.
Finds degree of similarity between two strings, based on Dice's Coefficient, using SQL/T-SQL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Finds degree of similarity between two strings, based on Dice's Coefficient | |
DROP FUNCTION IF EXISTS dbo.CompareTwoStrings | |
GO | |
CREATE FUNCTION dbo.CompareTwoStrings ( | |
@string1 nvarchar(4000) | |
, @string2 nvarchar(4000) | |
) RETURNS NUMERIC(10, 10) | |
AS BEGIN | |
DECLARE @i INT | |
DECLARE @bigram CHAR(2) | |
DECLARE @count INT | |
DECLARE @intersectionSize INT = 0 | |
DECLARE @firstBigrams TABLE ( | |
letter CHAR(2) COLLATE Latin1_General_CS_AS NOT NULL UNIQUE , | |
cnt INT | |
) | |
SET @i = 0 | |
WHILE @i <= 32 BEGIN | |
SET @string1 = REPLACE(@string1, CHAR(@i), '') | |
SET @string2 = REPLACE(@string2, CHAR(@i), '') | |
SET @i = @i + 1 | |
END | |
IF (@string1 = @string2) RETURN 1 | |
IF (LEN(@string1) < 3 OR LEN(@string2) < 3) RETURN 0 | |
SET @i = 1 | |
WHILE @i < LEN(@string1) BEGIN | |
SET @bigram = SUBSTRING(@string1, @i, @i + 2) | |
IF EXISTS(SELECT * FROM @firstBigrams WHERE letter = @bigram COLLATE Latin1_General_CS_AS) BEGIN | |
UPDATE @firstBigrams SET cnt = cnt + 1 WHERE letter = @bigram COLLATE Latin1_General_CS_AS | |
END ELSE BEGIN | |
INSERT INTO @firstBigrams (letter, cnt) VALUES (@bigram, 1) | |
END | |
SET @i = @i + 1 | |
END | |
SET @i = 1 | |
WHILE @i < LEN(@string2) BEGIN | |
SET @count = NULL | |
SET @bigram = SUBSTRING(@string2, @i, @i + 2) | |
SELECT @count = cnt FROM @firstBigrams WHERE letter = @bigram COLLATE Latin1_General_CS_AS | |
SET @count = ISNULL(@count, 0) | |
IF(@count > 0) BEGIN | |
UPDATE @firstBigrams SET cnt = @count - 1 WHERE letter = @bigram COLLATE Latin1_General_CS_AS | |
SET @intersectionSize = @intersectionSize + 1 | |
END | |
SET @i = @i + 1 | |
END | |
RETURN ((2.0 * @intersectionSize)/(LEN(@string1) + LEN(@string2) - 2)) | |
END | |
GO |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment