Skip to content

Instantly share code, notes, and snippets.

@markusait
Created June 28, 2023 23:17
Show Gist options
  • Save markusait/91b87c954e49e2ba97d44d4c889d0386 to your computer and use it in GitHub Desktop.
Save markusait/91b87c954e49e2ba97d44d4c889d0386 to your computer and use it in GitHub Desktop.
Gist to compare different fuzzymatch name matching algorithms
import bag from "talisman/metrics/bag.js";
import damerauLevenshtein from "talisman/metrics/damerau-levenshtein.js";
import jaroWinkler from "talisman/metrics/jaro-winkler.js";
import lcs from "talisman/metrics/lcs.js";
import mlipns from "talisman/metrics/mlipns.js";
import identity from "talisman/metrics/identity.js";
import mongeElkan from "talisman/metrics/monge-elkan.js";
import mra from "talisman/metrics/mra.js";
import overlap from "talisman/metrics/overlap.js";
import stringSimilarity from "string-similarity";
import dice from "talisman/metrics/dice.js";
/**
 * Labelled name pairs used as the benchmark fixture.
 *
 * Each entry holds two name strings (`pt` and `pl`) plus `match`, the expected
 * answer that every scorer's prediction is compared against.
 * NOTE(review): the meaning of the `pt` / `pl` abbreviations is not stated
 * anywhere in this file — confirm with the author before relying on it.
 */
export const data = [
{
pt: "ABIGAIL JACKSON",
pl: "ABIGAIL SMITH JACKSON",
match: true,
},
{ pt: "ALEX BROWN", pl: "ALEXANDER P BROWN", match: true },
{ pt: "LAURA DAVIS", pl: "LAURA A DAVIS", match: true },
{ pt: "MELISSA TAYLOR", pl: "MELISSA B TAYLOR", match: true },
{ pt: "JESSICA ROBINSON", pl: "DANIEL O ROBINSON", match: false },
{ pt: "OLIVER JOHNSON", pl: "MR OLIVER F JOHNSON", match: true },
{ pt: "SOPHIA WHITE", pl: "SOPHIA WHITE", match: true },
{ pt: "GABRIELA GAB GARCIA", pl: "GABRIELA GARCIA", match: true },
{
pt: "JAMES HARRIS",
pl: "JAMES S HARRIS, MARILYN HARRIS",
match: true,
},
{ pt: "JOSEPH CLARK", pl: "JOSEPH F. CLARK IV", match: true },
{ pt: "ANTHONY RODRIGUEZ", pl: "ANTHONY P K RODRIGUEZ", match: true },
{ pt: "KENNETH LEWIS", pl: "KENNETH RAY LEWIS", match: true },
{ pt: "JULIAN MARTINEZ", pl: "JULIAN V MARTINEZ", match: true },
{
pt: "EMANUEL HERNANDEZ",
pl: "EMANUEL HERNANDEZ LOPEZ, EMANUEL HERNANDEZLOPEZ, SARAH GOMEZ",
match: true,
},
{ pt: "IAN YOUNG", pl: "IAN A YOUNG", match: true },
{ pt: "LINDEL KING", pl: "LINDEL A KING", match: true },
{ pt: "JOHN SCOTT", pl: "JOHN M SCOTT", match: true },
{
pt: "HUNTER EVANS",
pl: "HUNTER M EVANS, JAMES L EVANS",
match: true,
},
{ pt: "KYLE COLLINS", pl: "KYLE A COLLINS", match: true },
{
pt: "ADAM BELL",
pl: "ADAM G BELL, ERIN J BELL",
match: true,
},
{ pt: "PING-WEI LIU", pl: "YU YIN CHEN", match: false },
{ pt: "TROY WALKER", pl: "TROY B WALKER", match: true },
{ pt: "MARIO SAINT LUCAS", pl: "MARIO ST LUCAS", match: true },
{ pt: "ANNIE HILL", pl: "ANNIE SOK HILL", match: true },
{ pt: "MATTHEW TURNER", pl: "MATTHEW J TURNER", match: true },
{ pt: "PHUONG TRAN", pl: "PHUONG H TRAN", match: true },
{
pt: "ELIAS ANDERSON",
pl: "ELIAS JOHNSON ANDERSON",
match: true,
},
{ pt: "CHRISTY JACKSON", pl: "CHRISTINA L JACKSON", match: true },
{ pt: "LUCIA MILLER", pl: "LUCIA B MILLER", match: true },
{ pt: "MICHEAL MONROE", pl: "MICHEAL B MONROE", match: true },
{ pt: "JACKIE PEARSON", pl: "DANIEL O PEARSON", match: false },
{ pt: "DAVID PRICE", pl: "MR DAVID F PRICE", match: true },
{ pt: "STEVEN DAVIS", pl: "STEVEN DAVIS", match: true },
{ pt: "GRACIE GRIFFIN", pl: "GRACIE GRIFFIN", match: true },
{ pt: "JAMES SANDERS", pl: "JAMES S SANDERS, MARIANNA SANDERS", match: true },
{ pt: "JOSEPH HARRIS", pl: "JOSEPH F. HARRIS IV", match: true },
{ pt: "ANTONIA STEWART", pl: "ANTONIA P K STEWART", match: true },
{ pt: "KENNETH DAWSON", pl: "KENNETH RAY DAWSON", match: true },
{ pt: "JULIAN MORGAN", pl: "JULIAN V MORGAN", match: true },
{
pt: "EMMA EVANS",
pl: "EMMA EVANS LAMB, EMMA EVANSLAMB, SARA GARCIA",
match: true,
},
{ pt: "IAN DAWSON", pl: "IAN A DAWSON", match: true },
{ pt: "LINDA GOODMAN", pl: "LINDA A GOODMAN", match: true },
{ pt: "JOHN PARKER", pl: "JOHN M PARKER", match: true },
{ pt: "HUNTER COOPER", pl: "HUNTER M COOPER, JAMES L COOPER", match: true },
{ pt: "KYLE WILSON", pl: "KYLE A WILSON", match: true },
{ pt: "ADAM RICHARDS", pl: "ADAM G RICHARDS, ERIN J RICHARDS", match: true },
{ pt: "PING-WEI HUANG", pl: "YU YIN LEE", match: false },
{ pt: "TROY HUDSON", pl: "TROY B HUDSON", match: true },
{ pt: "MARIO ST PETER", pl: "MARIO ST PETER", match: true },
{ pt: "ANNIE HAMILTON", pl: "ANNIE SOK HAMILTON", match: true },
{ pt: "MATTHEW GRIFFIN", pl: "MATTHEW J GRIFFIN", match: true },
{ pt: "PHOEBE NEWMAN", pl: "PHOEBE H NEWMAN", match: true },
{ pt: "GERALD CROSBY", pl: "GERALD J CROSBY", match: true },
{ pt: "ALAN CARTER", pl: "ALAN L CARTER", match: true },
{ pt: "SHARON WRIGHT", pl: "SHARON A WRIGHT", match: true },
{ pt: "KATIE RAMIREZ", pl: "KATIE M RAMIREZ", match: true },
{ pt: "WILLIAM DAVIS", pl: "WILLIAM J DAVIS", match: true },
{ pt: "STEPHEN ADAMS", pl: "STEPHEN P ADAMS", match: true },
{ pt: "LINDA THOMPSON", pl: "LINDA A THOMPSON", match: true },
{ pt: "ANGELA ROBINSON", pl: "ANGELA L ROBINSON", match: true },
{ pt: "JACKIE GIBSON", pl: "JACKIE M GIBSON", match: true },
{ pt: "MICHAEL BROWN", pl: "MICHAEL L BROWN", match: true },
{ pt: "TOMMY MORGAN", pl: "TOMMY J MORGAN", match: true },
{ pt: "STEVE TAYLOR", pl: "STEVE A TAYLOR", match: true },
{ pt: "DAVID WILSON", pl: "DAVID L WILSON", match: true },
{ pt: "ROBERT JACKSON", pl: "ROBERT A JACKSON", match: true },
{ pt: "SAMUEL RODRIGUEZ", pl: "SAMUEL L RODRIGUEZ", match: true },
{ pt: "JOHN MARTINEZ", pl: "JOHN L MARTINEZ", match: true },
{ pt: "JAMES WHITE", pl: "JAMES A WHITE", match: true },
{ pt: "DANIEL GREEN", pl: "DANIEL L GREEN", match: true },
{ pt: "JENNIFER HARRIS", pl: "JENNIFER A HARRIS", match: true },
{ pt: "BRIAN LEWIS", pl: "BRIAN L LEWIS", match: true },
{ pt: "JERRY KING", pl: "JERRY L KING", match: true },
{ pt: "SARAH SCOTT", pl: "SARAH L SCOTT", match: true },
{ pt: "PATRICIA JONES", pl: "PATRICIA L JONES", match: true },
{ pt: "EDWARD NELSON", pl: "EDWARD A NELSON", match: true },
{ pt: "KEVIN HALL", pl: "KEVIN L HALL", match: true },
{ pt: "JESSICA YOUNG", pl: "JESSICA L YOUNG", match: true },
{ pt: "MICHELLE WALKER", pl: "MICHELLE A WALKER", match: true },
{ pt: "RONALD ALLEN", pl: "RONALD L ALLEN", match: true },
{ pt: "KAREN WRIGHT", pl: "KAREN L WRIGHT", match: true },
{ pt: "JEFFREY ROBERTS", pl: "JEFFREY L ROBERTS", match: true },
];
/**
 * Scores one name pair with `bag` and grades the prediction against `match`.
 *
 * A score strictly below 4 is treated as a predicted match.
 *
 * @param {string} pt - First name string passed to `bag`.
 * @param {string} pl - Second name string passed to `bag`.
 * @param {boolean} match - Expected answer for this pair.
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   `hits` is 1 when the prediction equals `match`; `wrongTrues` flags an
 *   expected match that was rejected; `wrongFalses` flags an expected
 *   non-match that was accepted.
 */
const bagResGet = (pt, pl, match) => {
  const cutoff = 4;
  const predicted = bag(pt, pl) < cutoff;

  const hits = predicted === match ? 1 : 0;
  const wrongTrues = match === true && predicted === false;
  const wrongFalses = match === false && predicted === true;

  return { hits, wrongTrues, wrongFalses };
};
/**
 * Scores one name pair with `damerauLevenshtein` and grades the prediction
 * against `match`.
 *
 * A score strictly below 6 is treated as a predicted match.
 *
 * @param {string} pt - First name string.
 * @param {string} pl - Second name string.
 * @param {boolean} match - Expected answer for this pair.
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   Per-pair tally: `hits` is 1 on agreement, otherwise one of the two
 *   `wrong*` flags may be true.
 */
const damerauLevenshteinResGet = (pt, pl, match) => {
  const maxScore = 6;
  const isMatch = damerauLevenshtein(pt, pl) < maxScore;

  const outcome = { hits: 0, wrongTrues: false, wrongFalses: false };
  if (isMatch === match) {
    outcome.hits = 1;
  }
  outcome.wrongTrues = match === true && isMatch === false;
  outcome.wrongFalses = match === false && isMatch === true;
  return outcome;
};
/**
 * Scores one name pair with `jaroWinkler` and grades the prediction against
 * `match`.
 *
 * A score strictly above 0.8 is treated as a predicted match.
 *
 * @param {string} pt - First name string.
 * @param {string} pl - Second name string.
 * @param {boolean} match - Expected answer for this pair.
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   Per-pair tally in the same shape as the other `*ResGet` scorers.
 */
const jaroWinkletResGet = (pt, pl, match) => {
  const minScore = 0.8;
  const similarity = jaroWinkler(pt, pl);
  const accepted = similarity > minScore;

  return {
    hits: accepted === match ? 1 : 0,
    wrongTrues: match === true && accepted === false,
    wrongFalses: match === false && accepted === true,
  };
};
/**
 * Scores one name pair with `lcs.distance` and grades the prediction against
 * `match`.
 *
 * A score strictly above 0.1 is treated as a predicted match.
 * NOTE(review): a *larger* distance counts as a match here; if
 * `lcs.distance` grows with dissimilarity this comparison may be inverted —
 * confirm against the library documentation before trusting the LCS numbers.
 *
 * @param {string} pt - First name string.
 * @param {string} pl - Second name string.
 * @param {boolean} match - Expected answer for this pair.
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   Per-pair tally in the same shape as the other `*ResGet` scorers.
 */
const lcsResGet = (pt, pl, match) => {
  const cutoff = 0.1;
  const predicted = lcs.distance(pt, pl) > cutoff;
  const agrees = predicted === match;

  return {
    hits: agrees ? 1 : 0,
    wrongTrues: match === true && predicted === false,
    wrongFalses: match === false && predicted === true,
  };
};
/**
 * Scores one name pair with `mlipns` and grades the prediction against
 * `match`.
 *
 * Only a score exactly equal to 1 is treated as a predicted match.
 *
 * @param {string} pt - First name string.
 * @param {string} pl - Second name string.
 * @param {boolean} match - Expected answer for this pair.
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   Per-pair tally in the same shape as the other `*ResGet` scorers.
 */
const mlipnsResGet = (pt, pl, match) => {
  const predicted = mlipns(pt, pl) === 1;

  const hits = predicted === match ? 1 : 0;
  const wrongTrues = match === true && predicted === false;
  const wrongFalses = match === false && predicted === true;
  return { hits, wrongTrues, wrongFalses };
};
/**
 * Scores one name pair with `mongeElkan.symmetric`, using
 * `identity.similarity` as the inner similarity function, and grades the
 * prediction against `match`.
 *
 * A score strictly above 0.8 is treated as a predicted match. The two name
 * strings are passed through as-is (not split into tokens).
 *
 * @param {string} pt - First name string.
 * @param {string} pl - Second name string.
 * @param {boolean} match - Expected answer for this pair.
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   Per-pair tally in the same shape as the other `*ResGet` scorers.
 */
const mongeElkanResGet = (pt, pl, match) => {
  const minScore = 0.8;
  const similarity = mongeElkan.symmetric(identity.similarity, pt, pl);
  const accepted = similarity > minScore;

  const tally = {
    hits: 0,
    wrongTrues: match === true && accepted === false,
    wrongFalses: match === false && accepted === true,
  };
  if (accepted === match) {
    tally.hits = 1;
  }
  return tally;
};
/**
 * Compares one name pair with `mra` and grades its `matching` field against
 * `match`.
 *
 * The `matching` value is used directly as the prediction, with no threshold.
 *
 * @param {string} pt - First name string.
 * @param {string} pl - Second name string.
 * @param {boolean} match - Expected answer for this pair.
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   Per-pair tally in the same shape as the other `*ResGet` scorers.
 */
const mraResGet = (pt, pl, match) => {
  const { matching } = mra(pt, pl);

  // Strict comparisons are kept so a non-boolean `matching` never counts as
  // either kind of error.
  const hits = matching === match ? 1 : 0;
  const wrongTrues = match === true && matching === false;
  const wrongFalses = match === false && matching === true;
  return { hits, wrongTrues, wrongFalses };
};
/**
 * Scores one name pair with `overlap` and grades the prediction against
 * `match`.
 *
 * Only a score exactly equal to 1 is treated as a predicted match.
 *
 * @param {string} pt - First name string.
 * @param {string} pl - Second name string.
 * @param {boolean} match - Expected answer for this pair.
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   Per-pair tally in the same shape as the other `*ResGet` scorers.
 */
const overlapResGet = (pt, pl, match) => {
  const coefficient = overlap(pt, pl);
  const isFullOverlap = coefficient === 1;

  return {
    hits: isFullOverlap === match ? 1 : 0,
    wrongTrues: match === true && isFullOverlap === false,
    wrongFalses: match === false && isFullOverlap === true,
  };
};
/**
 * Scores one name pair with `stringSimilarity.compareTwoStrings` and grades
 * the prediction against `match`.
 *
 * A score strictly above 0.8 is treated as a predicted match.
 *
 * @param {string} pt - First name string.
 * @param {string} pl - Second name string.
 * @param {boolean} match - Expected answer for this pair.
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   Per-pair tally in the same shape as the other `*ResGet` scorers.
 */
const stringSimilarityResGet = (pt, pl, match) => {
  const minScore = 0.8;
  const accepted = stringSimilarity.compareTwoStrings(pt, pl) > minScore;

  const hits = accepted === match ? 1 : 0;
  const wrongTrues = match === true && accepted === false;
  const wrongFalses = match === false && accepted === true;
  return { hits, wrongTrues, wrongFalses };
};
/**
 * Scores one name pair with `dice` and grades the prediction against `match`.
 *
 * A score strictly above 0.8 is treated as a predicted match.
 *
 * @param {string} pt - First name string.
 * @param {string} pl - Second name string.
 * @param {boolean} match - Expected answer for this pair.
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   Per-pair tally in the same shape as the other `*ResGet` scorers.
 */
const diceResGet = (pt, pl, match) => {
  const cutoff = 0.8;
  const coefficient = dice(pt, pl);
  const predicted = coefficient > cutoff;

  const result = { hits: 0, wrongTrues: false, wrongFalses: false };
  if (predicted === match) {
    result.hits = 1;
  }
  result.wrongTrues = match === true && predicted === false;
  result.wrongFalses = match === false && predicted === true;
  return result;
};
/**
 * Adds objects together key by key into a fresh object, e.g.
 * `{ a: 1, b: 2 }` + `{ a: 2, b: 3 }` gives `{ a: 3, b: 5 }`.
 *
 * The `name` key is special-cased: it is copied from the last object that
 * has it rather than added. Missing or falsy running totals start from 0,
 * so boolean values are counted as 0/1 by the `+` operator.
 *
 * @param {...Object} objs - Objects to combine, left to right.
 * @returns {Object} A new object holding the per-key totals.
 */
const sumObjectsByKey = (...objs) => {
  const totals = {};
  for (const current of objs) {
    for (const key in current) {
      if (key === "name") {
        totals[key] = current[key];
        continue;
      }
      if (!current.hasOwnProperty(key)) {
        continue;
      }
      totals[key] = (totals[key] || 0) + current[key];
    }
  }
  return totals;
};
/**
 * Runs every scorer over `data` and summarizes how often each one agrees
 * with the expected `match` flag.
 *
 * Differences from the earlier output format:
 *  - the rate is printed as a real percentage (`71.95%`, not `0.7195%`);
 *  - `falsePositives` reports `wrongFalses` (expected non-match that was
 *    accepted) and `falseNegatives` reports `wrongTrues` (expected match that
 *    was rejected) — these two labels used to be swapped;
 *  - an empty `data` array yields a rate of `0.00%` instead of `NaN`.
 *
 * @returns {string[]} One summary line per algorithm, in the order of the
 *   table below.
 */
const compare = () => {
  // Display label (trailing space kept as in the original labels) paired
  // with the scorer that grades a single name pair.
  const algorithms = [
    { name: "Bag ", score: bagResGet },
    { name: "Damerau Levenshtein ", score: damerauLevenshteinResGet },
    { name: "Jaro-Winkler ", score: jaroWinkletResGet },
    { name: "LCS ", score: lcsResGet },
    { name: "Mlipns ", score: mlipnsResGet },
    { name: "MongeElkan ", score: mongeElkanResGet },
    { name: "MRA ", score: mraResGet },
    { name: "Overlap ", score: overlapResGet },
    { name: "String-Similarity ", score: stringSimilarityResGet },
    { name: "Dice ", score: diceResGet },
  ];

  return algorithms.map(({ name, score }) => {
    let totals = { name, hits: 0, wrongTrues: 0, wrongFalses: 0 };
    for (const { pt, pl, match } of data) {
      // sumObjectsByKey adds the boolean wrong* flags as 0/1.
      totals = sumObjectsByKey(totals, score(pt, pl, match));
    }

    // Avoid 0 / 0 when there is nothing to score.
    const hitRate = data.length === 0 ? 0 : (totals.hits / data.length) * 100;

    return `${totals.name} fail match rate ${hitRate.toFixed(
      2
    )}%, falsePositives: ${totals.wrongFalses} falseNegatives: ${
      totals.wrongTrues
    } `;
  });
};
// Run the comparison when the module is loaded and print the array of
// per-algorithm summary lines.
console.log(compare());
// Sample output recorded from an earlier run. How to read these recorded lines:
// the rate is a fraction between 0 and 1 (despite the trailing "%"); the number
// after "falsePositives" counts expected matches that were rejected, and the
// number after "falseNegatives" counts expected non-matches that were accepted.
// "Bag fail match rate 0.7195%, falsePositives: 23 falseNegatives: 0 ",
// "Damerau Levenshtein fail match rate 0.8415%, falsePositives: 13 falseNegatives: 0 ",
// "Jaro-Winkler fail match rate 0.9878%, falsePositives: 1 falseNegatives: 0 ",
// "LCS fail match rate 0.9146%, falsePositives: 3 falseNegatives: 4 ",
// "Mlipns fail match rate 0.0854%, falsePositives: 75 falseNegatives: 0 ",
// "MongeElkan fail match rate 0.9756%, falsePositives: 0 falseNegatives: 2 ",
// "MRA fail match rate 0.9756%, falsePositives: 0 falseNegatives: 2 ",
// "Overlap fail match rate 0.9878%, falsePositives: 1 falseNegatives: 0 ",
// "String-Similarity fail match rate 0.7683%, falsePositives: 19 falseNegatives: 0 ",
// "Dice fail match rate 0.8415%, falsePositives: 13 falseNegatives: 0 ";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment