Bergvca/Name matching in SQL Server.sql

## README.MD

      
    Raw
  

              README.MD
            
          
    Name matching in SQL Server example

The following gist is an example on how to simple clustering in SQL server. In this example I use name matching with a combination of Jaro Winkler and Levensthein as similarity measures. This works well on datasets of around 100.000 items. For larger sets I would recommend something like TF-IDF and n-grams.

  
## Name matching in SQL Server.sql
-- First create matches using a UDF, here I am using a combination of Jaro Winkler and (a normalized version of) Levensthein
--
-- Input: cleaned_table: a table with "cleaned" names
-- Output: tmp_groups: a table with uid - group_id tuples. Each group_id contains all uid's that belong to names that match.


DROP TABLE #matches
SELECT  a.clean_Name,
        a.uid,
        b.clean_Name clean_name_2,
        b.uind uid_2,
        dbo.calcJaroWinkler(a.Clean_Name, b.Clean_Name) JWscore,
        dbo.CalcLevenshtein(a.Clean_Name, b.Clean_Name) LevenstheinScore,
        dbo.calcJaroWinkler(a.First_Word, b.First_Word) JWscore_First_Word,
        dbo.CalcLevenshtein(a.First_Word, b.First_Word) LevenstheinScore_First_Word,
        a.Nr_of_Words  Nr_of_Words_1,
        b.Nr_of_Words Nr_of_Words_2
INTO #matches
FROM #cleaned_table A
JOIN #cleaned_table B
ON (
      a.Clean_Name = b.Clean_Name
	    OR
	 (
		  dbo.calcLevenshtein(a.Clean_Name, b.Clean_Name) > 90
		  OR [dbo].calcJaroWinkler(a.Clean_Name, b.Clean_Name) > 0.9
		)
	)
	AND a.uid != b.uid;

-- For each UID, create a group to which it belongs. This "Group_ID" is the minimum UID of all the matches this name has.
DROP TABLE TMP_GROUPS;
SELECT CASE WHEN uid < min(uid_2)
			THEN uid
			ELSE min(uid_2)
		END AS Group_ID,
		uid
INTO TMP_GROUPS
FROM #matches
GROUP BY uid;

-- The previous step will, in some cases, generate "orphaned items". E.g. in the case when A is matched with B, and B is matched with C,
-- but C is not matched with A. In this case C will have to be matched with the group A.
DECLARE @Number_of_Lone_Groups INT = (SELECT COUNT(1)
					FROM #matches
					LEFT JOIN TMP_GROUPS Left
					ON #matches.uid = Left.uid
					LEFT JOIN TMP_GROUPS Right
					ON #matches.[uid_2] = Right.UID
					WHERE Left.Group_ID != Right.Group_ID)

--While these orphans exist, find their parents, and update them with the correct Group ID:
WHILE @Number_of_Lone_Groups != 0 BEGIN
	UPDATE Left
		SET Left.Group_ID =
			 CASE WHEN Left.Group_ID > Right.Group_Id
				THEN Right.Group_Id
				ELSE Left.Group_ID
			END
		FROM #matches
		LEFT JOIN TMP_GROUPS Left
		ON #matches.[Organization_id] = Left.organization_id
		LEFT JOIN TMP_GROUPS Right
		ON #matches.[organization_id_2] = Right.organization_id
		WHERE Left.Group_ID != Right.Group_ID


	UPDATE Right
		SET	Right.Group_ID =
			CASE WHEN Left.Group_ID < Right.Group_Id
				 THEN Left.Group_Id
				 ELSE Right.Group_ID
			END
		FROM #matches
		LEFT JOIN TMP_GROUPS Left
		ON #matches.uid = Left.uid
		LEFT JOIN TMP_GROUPS Right
		ON #matches.uid_2 = Right.uid
		WHERE Left.Group_ID != Right.Group_ID

	SET @Number_of_Lone_Groups = (SELECT COUNT(1)
									FROM #matches
									LEFT JOIN TMP_GROUPS Left
									ON #matches.[Organization_id] = Left.organization_id
									LEFT JOIN TMP_GROUPS Right
									ON #matches.[organization_id_2] = Right.organization_id
									WHERE Left.Group_ID != Right.Group_ID)
END
	-- First create matches using a UDF, here I am using a combination of Jaro Winkler and (a normalized version of) Levensthein
	--
	-- Input: cleaned_table: a table with "cleaned" names
	-- Output: tmp_groups: a table with uid - group_id tuples. Each group_id contains all uid's that belong to names that match.


	DROP TABLE #matches
	SELECT a.clean_Name,
	a.uid,
	b.clean_Name clean_name_2,
	b.uind uid_2,
	dbo.calcJaroWinkler(a.Clean_Name, b.Clean_Name) JWscore,
	dbo.CalcLevenshtein(a.Clean_Name, b.Clean_Name) LevenstheinScore,
	dbo.calcJaroWinkler(a.First_Word, b.First_Word) JWscore_First_Word,
	dbo.CalcLevenshtein(a.First_Word, b.First_Word) LevenstheinScore_First_Word,
	a.Nr_of_Words Nr_of_Words_1,
	b.Nr_of_Words Nr_of_Words_2
	INTO #matches
	FROM #cleaned_table A
	JOIN #cleaned_table B
	ON (
	a.Clean_Name = b.Clean_Name
	OR
	(
	dbo.calcLevenshtein(a.Clean_Name, b.Clean_Name) > 90
	OR [dbo].calcJaroWinkler(a.Clean_Name, b.Clean_Name) > 0.9
	)
	)
	AND a.uid != b.uid;

	-- For each UID, create a group to which it belongs. This "Group_ID" is the minimum UID of all the matches this name has.
	DROP TABLE TMP_GROUPS;
	SELECT CASE WHEN uid < min(uid_2)
	THEN uid
	ELSE min(uid_2)
	END AS Group_ID,
	uid
	INTO TMP_GROUPS
	FROM #matches
	GROUP BY uid;

	-- The previous step will, in some cases, generate "orphaned items". E.g. in the case when A is matched with B, and B is matched with C,
	-- but C is not matched with A. In this case C will have to be matched with the group A.
	DECLARE @Number_of_Lone_Groups INT = (SELECT COUNT(1)
	FROM #matches
	LEFT JOIN TMP_GROUPS Left
	ON #matches.uid = Left.uid
	LEFT JOIN TMP_GROUPS Right
	ON #matches.[uid_2] = Right.UID
	WHERE Left.Group_ID != Right.Group_ID)

	--While these orphans exist, find their parents, and update them with the correct Group ID:
	WHILE @Number_of_Lone_Groups != 0 BEGIN
	UPDATE Left
	SET Left.Group_ID =
	CASE WHEN Left.Group_ID > Right.Group_Id
	THEN Right.Group_Id
	ELSE Left.Group_ID
	END
	FROM #matches
	LEFT JOIN TMP_GROUPS Left
	ON #matches.[Organization_id] = Left.organization_id
	LEFT JOIN TMP_GROUPS Right
	ON #matches.[organization_id_2] = Right.organization_id
	WHERE Left.Group_ID != Right.Group_ID


	UPDATE Right
	SET Right.Group_ID =
	CASE WHEN Left.Group_ID < Right.Group_Id
	THEN Left.Group_Id
	ELSE Right.Group_ID
	END
	FROM #matches
	LEFT JOIN TMP_GROUPS Left
	ON #matches.uid = Left.uid
	LEFT JOIN TMP_GROUPS Right
	ON #matches.uid_2 = Right.uid
	WHERE Left.Group_ID != Right.Group_ID

	SET @Number_of_Lone_Groups = (SELECT COUNT(1)
	FROM #matches
	LEFT JOIN TMP_GROUPS Left
	ON #matches.[Organization_id] = Left.organization_id
	LEFT JOIN TMP_GROUPS Right
	ON #matches.[organization_id_2] = Right.organization_id
	WHERE Left.Group_ID != Right.Group_ID)
	END