Skip to content

Instantly share code, notes, and snippets.

@NReilingh
Last active January 2, 2023 21:13
Show Gist options
  • Save NReilingh/f5ca23713278ef050f8f41b2f6d08413 to your computer and use it in GitHub Desktop.
Save NReilingh/f5ca23713278ef050f8f41b2f6d08413 to your computer and use it in GitHub Desktop.
Sorenson-dice coefficient in a set-based style. Note -- this is an excerpt and probably does not compile, but may be adapted to your use case. This example is implemented in T-SQL (MS SQL Server)
SET ANSI_NULLS, QUOTED_IDENTIFIER ON;
GO
/***********************************************************************************************************************
Author: Nick Reilingh, Fisher Center at Bard College
Description: Preload for sorensen_dice coefficient word pairs on names.
Notes:
- This procedure is called from LP_FZM_PRELOAD, which supplies it with a dataset_no parameter
- It is supplied with dataset parameter from this procedure.
- This procedure populates a temp table with letter pairs for a dataset to be matched against later.
***********************************************************************************************************************/
CREATE OR ALTER PROCEDURE [fzm].[sorensen_dice_names_PRELOAD]
@dataset_no int = NULL
AS
WITH numbers AS (
SELECT
number
FROM master.dbo.spt_values
WHERE type = 'P' AND number < 100
), loading_names AS (
SELECT
n.customer_no,
name = ISNULL(NULLIF(n.fname, '') + ' ', '') + ISNULL(n.lname, ''),
name_no = ROW_NUMBER() OVER (PARTITION BY n.customer_no ORDER BY (SELECT 1))
FROM dbo.LS_FZM_NAME n
), wordboundaries AS (
SELECT
n.customer_no,
n.name_no,
n.name,
a.word
FROM loading_names n
CROSS APPLY (
SELECT
word = RTRIM(SUBSTRING(n.name, number + 1, PATINDEX('%[ ][^ ]%', RIGHT(n.name + ' $', DATALENGTH(n.name) + 1 - number))))
FROM numbers
WHERE number <= DATALENGTH(n.name)
AND PATINDEX('[ ][^ ]%', RIGHT(' ' + n.name, DATALENGTH(n.name) + 1 - number)) = 1
) a
), wordpairs AS (
SELECT
w.customer_no,
w.name_no,
w.name,
a.letterpair
FROM wordboundaries w
CROSS APPLY (
SELECT
letterpair = SUBSTRING(w.word, number + 1, 2)
FROM numbers
WHERE number < DATALENGTH(w.word) - 1
) a
), letterpairs_enumerated AS (
SELECT
p.customer_no,
p.name_no,
p.name,
p.letterpair,
letteridx = ROW_NUMBER() OVER (PARTITION BY customer_no, name_no, letterpair ORDER BY (SELECT 1)),
paircount = COUNT(1) OVER (PARTITION BY customer_no, name_no)
FROM wordpairs p
)
INSERT tempdb.dbo.[fzm_sorensen_dice_names_tess_letterpairs]
SELECT * FROM letterpairs_enumerated;
GO
/***********************************************************************************************************************
Author: Nick Reilingh, Fisher Center at Bard College
Description: Calculates sorensen_dice coefficient. Requires running Preload procedure in the same session.
***********************************************************************************************************************/
CREATE OR ALTER FUNCTION [fzm].[sorensen_dice_names](
@dataset_no int,
@primary_id varchar(16)
)
RETURNS TABLE AS RETURN
WITH numbers AS (
SELECT
number
FROM master.dbo.spt_values
WHERE type = 'P' AND number < 100
), primary_id_names AS (
SELECT
name = ISNULL(NULLIF(n.fname, '') + ' ', '') + ISNULL(lname, ''),
name_no = ROW_NUMBER() OVER (ORDER BY (SELECT 1))
FROM dbo.LS_FZM_NAME_INPUT n
WHERE n.dataset_no = @dataset_no
AND n.primary_id = @primary_id
), wordboundaries AS (
SELECT
n.name_no,
a.word
FROM primary_id_names n
CROSS APPLY (
SELECT
word = RTRIM(SUBSTRING(n.name, number + 1, PATINDEX('%[ ][^ ]%', RIGHT(n.name + ' $', DATALENGTH(n.name) + 1 - number))))
FROM numbers
WHERE number <= DATALENGTH(n.name)
AND PATINDEX('[ ][^ ]%', RIGHT(' ' + n.name, DATALENGTH(n.name) + 1 - number)) = 1
) a
), wordpairs AS (
SELECT
w.name_no,
a.letterpair
FROM wordboundaries w
CROSS APPLY (
SELECT
letterpair = SUBSTRING(w.word, number + 1, 2)
FROM numbers
WHERE number < DATALENGTH(w.word) - 1
) a
), letterpairs_enumerated AS (
SELECT
p.name_no,
p.letterpair,
letteridx = ROW_NUMBER() OVER (PARTITION BY name_no, letterpair ORDER BY (SELECT 1)),
paircount = COUNT(1) OVER (PARTITION BY name_no)
FROM wordpairs p
), scored AS (
SELECT DISTINCT
t.customer_no,
score = 200.0 * (COUNT(1) OVER (PARTITION BY t.customer_no, t.name_no, i.name_no)) / (t.paircount + i.paircount),
t.name
FROM tempdb.dbo.[fzm_sorensen_dice_names_tess_letterpairs] t
JOIN letterpairs_enumerated i
ON t.letterpair = i.letterpair
AND t.letteridx = i.letteridx
), ordered AS (
SELECT
customer_no,
score = CAST(ROUND(score, 0) AS int),
criteria = name,
[order] = ROW_NUMBER() OVER (PARTITION BY customer_no ORDER BY score DESC)
FROM scored
WHERE score >= 45
)
SELECT
customer_no,
score,
criteria
FROM ordered WHERE [order] = 1;
GO
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment