Skip to content

Instantly share code, notes, and snippets.

@ssajous
Last active October 14, 2021 15:41
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ssajous/3539848 to your computer and use it in GitHub Desktop.
Save ssajous/3539848 to your computer and use it in GitHub Desktop.
Implementations of Dice's Coefficient used to get a similarity index between two strings.
/// <summary>
/// Computes Dice's coefficient between two strings from their sets of
/// distinct character bigrams: 2 * |X ∩ Y| / (|X| + |Y|).
/// </summary>
/// <param name="stOne">First string to compare.</param>
/// <param name="stTwo">Second string to compare.</param>
/// <returns>
/// A value in [0, 1]; 1 means both strings produce the same bigram set.
/// When neither string yields a bigram (both shorter than two characters)
/// the ratio is undefined, so 1 is returned if the strings are equal and
/// 0 otherwise, instead of NaN.
/// </returns>
public static double DiceCoefficient(string stOne, string stTwo)
{
    HashSet<string> nx = BuildBigramSet(stOne);
    HashSet<string> ny = BuildBigramSet(stTwo);

    int total = nx.Count + ny.Count;
    if (total == 0)
    {
        // 0.0 / 0 would evaluate to NaN; fall back to plain (ordinal) equality.
        return stOne == stTwo ? 1.0 : 0.0;
    }

    // Copy nx so the intersection does not mutate the original set.
    HashSet<string> intersection = new HashSet<string>(nx);
    intersection.IntersectWith(ny);

    return (2.0 * intersection.Count) / total;
}
/// <summary>
/// Collects every distinct pair of adjacent characters found in
/// <paramref name="input"/>.
/// </summary>
/// <param name="input">Text to split into overlapping two-character pieces.</param>
/// <returns>
/// The set of distinct bigrams; empty when the input has fewer than two characters.
/// </returns>
public static HashSet<string> BuildBigramSet(string input)
{
    var pairs = new HashSet<string>();

    // Last index at which a full two-character slice still fits.
    int lastStart = input.Length - 2;
    int start = 0;
    while (start <= lastStart)
    {
        pairs.Add(input.Substring(start, 2));
        start++;
    }

    return pairs;
}
def build_bigram_set(input_str):
    """Return the set of distinct adjacent character pairs in ``input_str``.

    Only full two-character slices are produced, so a string shorter than
    two characters yields an empty set.
    """
    # Stop one short of the end: the slice starting at the last index would
    # be a single character, not a bigram.
    return {input_str[i:i + 2] for i in range(len(input_str) - 1)}


def dice_coefficient(lhs, rhs):
    """Return Dice's coefficient between two strings, based on bigram sets.

    The result is ``2 * |L & R| / (|L| + |R|)`` where ``L`` and ``R`` are the
    distinct bigrams of ``lhs`` and ``rhs``; it lies in ``[0.0, 1.0]``.

    When neither string yields a bigram (both shorter than two characters)
    the ratio is undefined; ``1.0`` is returned if the strings are equal and
    ``0.0`` otherwise, instead of raising ``ZeroDivisionError``.
    """
    lhs_bigrams = build_bigram_set(lhs)
    rhs_bigrams = build_bigram_set(rhs)

    total_count = len(lhs_bigrams) + len(rhs_bigrams)
    if total_count == 0:
        return 1.0 if lhs == rhs else 0.0

    intersection_count = len(lhs_bigrams & rhs_bigrams)
    # 2.0 forces float division even where ``/`` on ints would truncate.
    return (2.0 * intersection_count) / total_count
-- Dice's coefficient between two strings, computed over their sets of
-- distinct, lower-cased character bigrams:
--     2 * |set1 ∩ set2| / (|set1| + |set2|)
--
-- Parameters:
--   @strOne, @strTwo  strings to compare (up to 2048 characters each)
-- Returns:
--   FLOAT in [0, 1]; 1 means both strings produce the same bigram set.
--   NULL when either input is NULL.
--   When neither string yields a bigram (both shorter than two characters)
--   the ratio is undefined: 1 is returned if the lower-cased strings compare
--   equal, 0 otherwise, rather than raising a divide-by-zero error.
-- Note: LEN() ignores trailing spaces, so trailing blanks contribute no bigrams.
CREATE FUNCTION [ETL].[DiceCoefficient]
(
    @strOne nvarchar(2048),
    @strTwo nvarchar(2048)
)
RETURNS FLOAT
AS
BEGIN
    if @strOne is null or @strTwo is null
        return null;

    -- Every bigram occurrence is stored; duplicates are collapsed at count time.
    declare @set1 table (bigram nchar(2) not null);
    declare @set2 table (bigram nchar(2) not null);

    declare @i int = 1;
    declare @len int = LEN(@strOne);
    while (@i < @len)
    begin
        insert @set1 (bigram)
        values (lower(SUBSTRING(@strOne, @i, 2)));
        set @i = @i + 1;
    end

    set @i = 1;
    set @len = LEN(@strTwo);
    while (@i < @len)
    begin
        insert @set2 (bigram)
        values (lower(SUBSTRING(@strTwo, @i, 2)));
        set @i = @i + 1;
    end

    -- Count DISTINCT bigrams so the denominator uses set sizes, matching the
    -- de-duplicated intersection in the numerator.
    declare @count1 float = (select COUNT(distinct bigram) from @set1);
    declare @count2 float = (select COUNT(distinct bigram) from @set2);

    -- INTERSECT already removes duplicates.
    declare @common float = (
        select COUNT(*)
        from (
            select bigram from @set1
            intersect
            select bigram from @set2
        ) as shared
    );

    if (@count1 + @count2 = 0)
        return case when lower(@strOne) = lower(@strTwo) then 1.0 else 0.0 end;

    RETURN (2.0 * @common) / (@count1 + @count2);
END
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment