Last active
October 14, 2021 15:41
-
-
Save ssajous/3539848 to your computer and use it in GitHub Desktop.
Implementations of Dice's Coefficient used to get a similarity index between two strings.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Computes Dice's coefficient between two strings from their sets of
/// distinct two-character substrings (bigrams):
/// 2 * |shared bigrams| / (|bigrams of stOne| + |bigrams of stTwo|).
/// </summary>
/// <param name="stOne">First string to compare.</param>
/// <param name="stTwo">Second string to compare.</param>
/// <returns>
/// A value between 0.0 (no shared bigrams) and 1.0 (identical bigram sets).
/// Returns 0.0 when neither string produces a bigram.
/// </returns>
public static double DiceCoefficient(string stOne, string stTwo)
{
    HashSet<string> nx = BuildBigramSet(stOne);
    HashSet<string> ny = BuildBigramSet(stTwo);

    int totalCount = nx.Count + ny.Count;
    if (totalCount == 0)
    {
        // With no bigrams on either side the formula is 0/0, which evaluates
        // to NaN for doubles; report "no similarity" instead.
        return 0.0;
    }

    HashSet<string> intersection = new HashSet<string>(nx);
    intersection.IntersectWith(ny);

    // 2.0 forces floating-point arithmetic before the division.
    return (2.0 * intersection.Count) / totalCount;
}
/// <summary>
/// Collects every distinct pair of adjacent characters (bigram) in a string.
/// </summary>
/// <param name="input">The string to split into overlapping bigrams.</param>
/// <returns>
/// The set of distinct two-character substrings; empty when the string has
/// fewer than two characters.
/// </returns>
public static HashSet<string> BuildBigramSet(string input)
{
    var pairs = new HashSet<string>();

    // The last position at which a full two-character window still fits.
    int lastStart = input.Length - 2;
    int start = 0;
    while (start <= lastStart)
    {
        pairs.Add(input.Substring(start, 2));
        start++;
    }

    return pairs;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def build_bigram_set(input_str):
    """Return the set of distinct two-character substrings of ``input_str``.

    Only full two-character windows are collected, so the window start stops
    one position before the last character. A string shorter than two
    characters yields an empty set.
    """
    # range(len - 1) rather than range(len): slicing at the final index would
    # produce a lone trailing character, which is not a bigram.
    return set(input_str[i:i + 2] for i in range(len(input_str) - 1))
def dice_coefficient(lhs, rhs):
    """Return Dice's coefficient between two strings as a float.

    The coefficient is ``2 * |shared| / (|lhs set| + |rhs set|)``, where each
    set is whatever ``build_bigram_set`` returns for that string. The result
    lies between 0.0 (nothing shared) and 1.0 (identical sets).

    Returns 0.0 when both sets are empty, instead of dividing by zero.
    """
    lhs_bigrams = build_bigram_set(lhs)
    rhs_bigrams = build_bigram_set(rhs)

    total_count = len(lhs_bigrams) + len(rhs_bigrams)
    if total_count == 0:
        # Nothing to compare on either side; avoid ZeroDivisionError.
        return 0.0

    shared_count = len(lhs_bigrams & rhs_bigrams)
    # 2.0 forces true division even where "/" on two ints would floor.
    return (2.0 * shared_count) / total_count
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Computes Dice's coefficient between two strings from their sets of distinct,
-- lower-cased two-character substrings (bigrams):
--     2 * |shared bigrams| / (|bigrams of @strOne| + |bigrams of @strTwo|)
--
-- Parameters:
--     @strOne, @strTwo  the strings to compare.
-- Returns:
--     A FLOAT between 0 (no shared bigrams) and 1 (identical bigram sets).
--     Returns 0 when neither input produces a bigram (NULL, empty, or a
--     single character), instead of failing with a divide-by-zero error.
CREATE FUNCTION [ETL].[DiceCoefficient]
(
    @strOne nvarchar(2048),
    @strTwo nvarchar(2048)
)
RETURNS FLOAT
AS
BEGIN
    DECLARE @set1 TABLE (bigram nchar(2) NOT NULL);
    DECLARE @set2 TABLE (bigram nchar(2) NOT NULL);

    -- LEN is loop-invariant, so evaluate it once per input. LEN excludes
    -- trailing blanks, and is NULL for a NULL input, in which case the
    -- loop condition is never true and no bigrams are collected.
    DECLARE @lenOne int = LEN(@strOne);
    DECLARE @lenTwo int = LEN(@strTwo);
    DECLARE @i int = 1;

    WHILE (@i < @lenOne)
    BEGIN
        INSERT @set1 (bigram)
        VALUES (LOWER(SUBSTRING(@strOne, @i, 2)));
        SET @i = @i + 1;
    END

    SET @i = 1;
    WHILE (@i < @lenTwo)
    BEGIN
        INSERT @set2 (bigram)
        VALUES (LOWER(SUBSTRING(@strTwo, @i, 2)));
        SET @i = @i + 1;
    END

    -- The table variables may hold repeated bigrams, so count DISTINCT values:
    -- the denominator must measure set sizes, matching the de-duplicated
    -- intersection in the numerator.
    DECLARE @countOne int = (SELECT COUNT(DISTINCT bigram) FROM @set1);
    DECLARE @countTwo int = (SELECT COUNT(DISTINCT bigram) FROM @set2);

    -- INTERSECT already returns distinct rows.
    DECLARE @sharedCount int = (
        SELECT COUNT(*)
        FROM (
            SELECT bigram FROM @set1
            INTERSECT
            SELECT bigram FROM @set2
        ) AS shared_bigrams
    );

    -- No bigrams on either side: report "no similarity" rather than dividing by zero.
    IF (@countOne + @countTwo = 0)
        RETURN 0;

    RETURN (2.0 * @sharedCount) / CAST(@countOne + @countTwo AS float);
END
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment