Created
June 28, 2023 23:17
-
-
Save markusait/91b87c954e49e2ba97d44d4c889d0386 to your computer and use it in GitHub Desktop.
Gist comparing different fuzzy-match algorithms for matching names
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bag from "talisman/metrics/bag.js"; | |
import damerauLevenshtein from "talisman/metrics/damerau-levenshtein.js"; | |
import jaroWinkler from "talisman/metrics/jaro-winkler.js"; | |
import lcs from "talisman/metrics/lcs.js"; | |
import mlipns from "talisman/metrics/mlipns.js"; | |
import identity from "talisman/metrics/identity.js"; | |
import mongeElkan from "talisman/metrics/monge-elkan.js"; | |
import mra from "talisman/metrics/mra.js"; | |
import overlap from "talisman/metrics/overlap.js"; | |
import stringSimilarity from "string-similarity"; | |
import dice from "talisman/metrics/dice.js"; | |
/**
 * Labelled name pairs used to evaluate the matchers below.
 *
 * Each record holds the two name strings to compare (`pt` and `pl`) and
 * `match`, the expected verdict for that pair. Some `pl` values contain
 * several comma-separated names.
 */
export const data = [
  { pt: "ABIGAIL JACKSON", pl: "ABIGAIL SMITH JACKSON", match: true },
  { pt: "ALEX BROWN", pl: "ALEXANDER P BROWN", match: true },
  { pt: "LAURA DAVIS", pl: "LAURA A DAVIS", match: true },
  { pt: "MELISSA TAYLOR", pl: "MELISSA B TAYLOR", match: true },
  { pt: "JESSICA ROBINSON", pl: "DANIEL O ROBINSON", match: false },
  { pt: "OLIVER JOHNSON", pl: "MR OLIVER F JOHNSON", match: true },
  { pt: "SOPHIA WHITE", pl: "SOPHIA WHITE", match: true },
  { pt: "GABRIELA GAB GARCIA", pl: "GABRIELA GARCIA", match: true },
  { pt: "JAMES HARRIS", pl: "JAMES S HARRIS, MARILYN HARRIS", match: true },
  { pt: "JOSEPH CLARK", pl: "JOSEPH F. CLARK IV", match: true },
  { pt: "ANTHONY RODRIGUEZ", pl: "ANTHONY P K RODRIGUEZ", match: true },
  { pt: "KENNETH LEWIS", pl: "KENNETH RAY LEWIS", match: true },
  { pt: "JULIAN MARTINEZ", pl: "JULIAN V MARTINEZ", match: true },
  {
    pt: "EMANUEL HERNANDEZ",
    pl: "EMANUEL HERNANDEZ LOPEZ, EMANUEL HERNANDEZLOPEZ, SARAH GOMEZ",
    match: true,
  },
  { pt: "IAN YOUNG", pl: "IAN A YOUNG", match: true },
  { pt: "LINDEL KING", pl: "LINDEL A KING", match: true },
  { pt: "JOHN SCOTT", pl: "JOHN M SCOTT", match: true },
  { pt: "HUNTER EVANS", pl: "HUNTER M EVANS, JAMES L EVANS", match: true },
  { pt: "KYLE COLLINS", pl: "KYLE A COLLINS", match: true },
  { pt: "ADAM BELL", pl: "ADAM G BELL, ERIN J BELL", match: true },
  { pt: "PING-WEI LIU", pl: "YU YIN CHEN", match: false },
  { pt: "TROY WALKER", pl: "TROY B WALKER", match: true },
  { pt: "MARIO SAINT LUCAS", pl: "MARIO ST LUCAS", match: true },
  { pt: "ANNIE HILL", pl: "ANNIE SOK HILL", match: true },
  { pt: "MATTHEW TURNER", pl: "MATTHEW J TURNER", match: true },
  { pt: "PHUONG TRAN", pl: "PHUONG H TRAN", match: true },
  { pt: "ELIAS ANDERSON", pl: "ELIAS JOHNSON ANDERSON", match: true },
  { pt: "CHRISTY JACKSON", pl: "CHRISTINA L JACKSON", match: true },
  { pt: "LUCIA MILLER", pl: "LUCIA B MILLER", match: true },
  { pt: "MICHEAL MONROE", pl: "MICHEAL B MONROE", match: true },
  { pt: "JACKIE PEARSON", pl: "DANIEL O PEARSON", match: false },
  { pt: "DAVID PRICE", pl: "MR DAVID F PRICE", match: true },
  { pt: "STEVEN DAVIS", pl: "STEVEN DAVIS", match: true },
  { pt: "GRACIE GRIFFIN", pl: "GRACIE GRIFFIN", match: true },
  { pt: "JAMES SANDERS", pl: "JAMES S SANDERS, MARIANNA SANDERS", match: true },
  { pt: "JOSEPH HARRIS", pl: "JOSEPH F. HARRIS IV", match: true },
  { pt: "ANTONIA STEWART", pl: "ANTONIA P K STEWART", match: true },
  { pt: "KENNETH DAWSON", pl: "KENNETH RAY DAWSON", match: true },
  { pt: "JULIAN MORGAN", pl: "JULIAN V MORGAN", match: true },
  {
    pt: "EMMA EVANS",
    pl: "EMMA EVANS LAMB, EMMA EVANSLAMB, SARA GARCIA",
    match: true,
  },
  { pt: "IAN DAWSON", pl: "IAN A DAWSON", match: true },
  { pt: "LINDA GOODMAN", pl: "LINDA A GOODMAN", match: true },
  { pt: "JOHN PARKER", pl: "JOHN M PARKER", match: true },
  { pt: "HUNTER COOPER", pl: "HUNTER M COOPER, JAMES L COOPER", match: true },
  { pt: "KYLE WILSON", pl: "KYLE A WILSON", match: true },
  { pt: "ADAM RICHARDS", pl: "ADAM G RICHARDS, ERIN J RICHARDS", match: true },
  { pt: "PING-WEI HUANG", pl: "YU YIN LEE", match: false },
  { pt: "TROY HUDSON", pl: "TROY B HUDSON", match: true },
  { pt: "MARIO ST PETER", pl: "MARIO ST PETER", match: true },
  { pt: "ANNIE HAMILTON", pl: "ANNIE SOK HAMILTON", match: true },
  { pt: "MATTHEW GRIFFIN", pl: "MATTHEW J GRIFFIN", match: true },
  { pt: "PHOEBE NEWMAN", pl: "PHOEBE H NEWMAN", match: true },
  { pt: "GERALD CROSBY", pl: "GERALD J CROSBY", match: true },
  { pt: "ALAN CARTER", pl: "ALAN L CARTER", match: true },
  { pt: "SHARON WRIGHT", pl: "SHARON A WRIGHT", match: true },
  { pt: "KATIE RAMIREZ", pl: "KATIE M RAMIREZ", match: true },
  { pt: "WILLIAM DAVIS", pl: "WILLIAM J DAVIS", match: true },
  { pt: "STEPHEN ADAMS", pl: "STEPHEN P ADAMS", match: true },
  { pt: "LINDA THOMPSON", pl: "LINDA A THOMPSON", match: true },
  { pt: "ANGELA ROBINSON", pl: "ANGELA L ROBINSON", match: true },
  { pt: "JACKIE GIBSON", pl: "JACKIE M GIBSON", match: true },
  { pt: "MICHAEL BROWN", pl: "MICHAEL L BROWN", match: true },
  { pt: "TOMMY MORGAN", pl: "TOMMY J MORGAN", match: true },
  { pt: "STEVE TAYLOR", pl: "STEVE A TAYLOR", match: true },
  { pt: "DAVID WILSON", pl: "DAVID L WILSON", match: true },
  { pt: "ROBERT JACKSON", pl: "ROBERT A JACKSON", match: true },
  { pt: "SAMUEL RODRIGUEZ", pl: "SAMUEL L RODRIGUEZ", match: true },
  { pt: "JOHN MARTINEZ", pl: "JOHN L MARTINEZ", match: true },
  { pt: "JAMES WHITE", pl: "JAMES A WHITE", match: true },
  { pt: "DANIEL GREEN", pl: "DANIEL L GREEN", match: true },
  { pt: "JENNIFER HARRIS", pl: "JENNIFER A HARRIS", match: true },
  { pt: "BRIAN LEWIS", pl: "BRIAN L LEWIS", match: true },
  { pt: "JERRY KING", pl: "JERRY L KING", match: true },
  { pt: "SARAH SCOTT", pl: "SARAH L SCOTT", match: true },
  { pt: "PATRICIA JONES", pl: "PATRICIA L JONES", match: true },
  { pt: "EDWARD NELSON", pl: "EDWARD A NELSON", match: true },
  { pt: "KEVIN HALL", pl: "KEVIN L HALL", match: true },
  { pt: "JESSICA YOUNG", pl: "JESSICA L YOUNG", match: true },
  { pt: "MICHELLE WALKER", pl: "MICHELLE A WALKER", match: true },
  { pt: "RONALD ALLEN", pl: "RONALD L ALLEN", match: true },
  { pt: "KAREN WRIGHT", pl: "KAREN L WRIGHT", match: true },
  { pt: "JEFFREY ROBERTS", pl: "JEFFREY L ROBERTS", match: true },
];
/**
 * Grades the `bag` metric on a single name pair.
 *
 * The pair is predicted to be a match when `bag(pt, pl)` is strictly below 4.
 *
 * @param {string} pt - first name of the pair
 * @param {string} pl - second name of the pair
 * @param {boolean} match - expected verdict for the pair
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   `hits` is 1 when the prediction equals `match`; `wrongTrues` flags an
 *   expected match that was rejected; `wrongFalses` flags an expected
 *   non-match that was accepted.
 */
const bagResGet = (pt, pl, match) => {
  const maxScore = 4;
  const predicted = bag(pt, pl) < maxScore;
  const expectedMatch = match === true;
  const expectedMiss = match === false;

  return {
    hits: Number(predicted === match),
    wrongTrues: expectedMatch && !predicted,
    wrongFalses: expectedMiss && predicted,
  };
};
/**
 * Grades the `damerauLevenshtein` metric on a single name pair.
 *
 * The pair is predicted to be a match when the returned score is strictly
 * below 6.
 *
 * @param {string} pt - first name of the pair
 * @param {string} pl - second name of the pair
 * @param {boolean} match - expected verdict for the pair
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   1/0 hit counter plus flags for a rejected expected match (`wrongTrues`)
 *   and an accepted expected non-match (`wrongFalses`).
 */
const damerauLevenshteinResGet = (pt, pl, match) => {
  const editLimit = 6;
  const withinLimit = damerauLevenshtein(pt, pl) < editLimit;

  const outcome = { hits: 0, wrongTrues: false, wrongFalses: false };
  if (withinLimit === match) {
    outcome.hits = 1;
  }
  if (match === true) {
    outcome.wrongTrues = !withinLimit;
  }
  if (match === false) {
    outcome.wrongFalses = withinLimit;
  }
  return outcome;
};
/**
 * Grades the `jaroWinkler` metric on a single name pair.
 *
 * The pair is predicted to be a match when the returned score is strictly
 * above 0.8.
 *
 * @param {string} pt - first name of the pair
 * @param {string} pl - second name of the pair
 * @param {boolean} match - expected verdict for the pair
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   1/0 hit counter plus flags for a rejected expected match (`wrongTrues`)
 *   and an accepted expected non-match (`wrongFalses`).
 */
const jaroWinkletResGet = (pt, pl, match) => {
  const minScore = 0.8;
  const accepted = jaroWinkler(pt, pl) > minScore;
  const agrees = accepted === match;

  return {
    hits: agrees ? 1 : 0,
    wrongTrues: match === true && !accepted,
    wrongFalses: match === false && accepted,
  };
};
/**
 * Grades the `lcs.distance` metric on a single name pair.
 *
 * The pair is predicted to be a match when `lcs.distance(pt, pl)` is
 * strictly above 0.1.
 * NOTE(review): treating a larger *distance* as a match looks inverted —
 * confirm against the talisman `lcs` documentation before relying on it.
 *
 * @param {string} pt - first name of the pair
 * @param {string} pl - second name of the pair
 * @param {boolean} match - expected verdict for the pair
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   1/0 hit counter plus flags for a rejected expected match (`wrongTrues`)
 *   and an accepted expected non-match (`wrongFalses`).
 */
const lcsResGet = (pt, pl, match) => {
  const cutoff = 0.1;
  const distance = lcs.distance(pt, pl);
  const accepted = distance > cutoff;

  const hits = accepted === match ? 1 : 0;
  const wrongTrues = match === true && !accepted;
  const wrongFalses = match === false && accepted;
  return { hits, wrongTrues, wrongFalses };
};
/**
 * Grades the `mlipns` metric on a single name pair.
 *
 * The pair is predicted to be a match only when `mlipns(pt, pl)` returns
 * exactly 1.
 *
 * @param {string} pt - first name of the pair
 * @param {string} pl - second name of the pair
 * @param {boolean} match - expected verdict for the pair
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   1/0 hit counter plus flags for a rejected expected match (`wrongTrues`)
 *   and an accepted expected non-match (`wrongFalses`).
 */
const mlipnsResGet = (pt, pl, match) => {
  const accepted = mlipns(pt, pl) === 1;

  return {
    hits: Number(accepted === match),
    wrongTrues: match === true && !accepted,
    wrongFalses: match === false && accepted,
  };
};
/**
 * Grades the symmetric Monge-Elkan metric on a single name pair.
 *
 * Calls `mongeElkan.symmetric` with `identity.similarity` as the inner
 * similarity function; the pair is predicted to be a match when the result
 * is strictly above 0.8.
 *
 * @param {string} pt - first name of the pair
 * @param {string} pl - second name of the pair
 * @param {boolean} match - expected verdict for the pair
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   1/0 hit counter plus flags for a rejected expected match (`wrongTrues`)
 *   and an accepted expected non-match (`wrongFalses`).
 */
const mongeElkanResGet = (pt, pl, match) => {
  const minScore = 0.8;
  const similarity = mongeElkan.symmetric(identity.similarity, pt, pl);
  const accepted = similarity > minScore;

  const outcome = { hits: 0, wrongTrues: false, wrongFalses: false };
  if (accepted === match) {
    outcome.hits = 1;
  }
  if (match === true) {
    outcome.wrongTrues = !accepted;
  }
  if (match === false) {
    outcome.wrongFalses = accepted;
  }
  return outcome;
};
/**
 * Grades the `mra` comparison on a single name pair.
 *
 * The prediction is the `matching` property of the object returned by
 * `mra(pt, pl)`, used as-is.
 *
 * @param {string} pt - first name of the pair
 * @param {string} pl - second name of the pair
 * @param {boolean} match - expected verdict for the pair
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   1/0 hit counter plus flags for a rejected expected match (`wrongTrues`)
 *   and an accepted expected non-match (`wrongFalses`).
 */
const mraResGet = (pt, pl, match) => {
  const { matching } = mra(pt, pl);

  // Strict comparisons are kept because `matching` is used without coercion.
  const hits = matching === match ? 1 : 0;
  const wrongTrues = match === true && matching === false;
  const wrongFalses = match === false && matching === true;
  return { hits, wrongTrues, wrongFalses };
};
/**
 * Grades the `overlap` metric on a single name pair.
 *
 * The pair is predicted to be a match only when `overlap(pt, pl)` returns
 * exactly 1.
 *
 * @param {string} pt - first name of the pair
 * @param {string} pl - second name of the pair
 * @param {boolean} match - expected verdict for the pair
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   1/0 hit counter plus flags for a rejected expected match (`wrongTrues`)
 *   and an accepted expected non-match (`wrongFalses`).
 */
const overlapResGet = (pt, pl, match) => {
  const fullOverlap = overlap(pt, pl) === 1;
  const agrees = fullOverlap === match;

  return {
    hits: agrees ? 1 : 0,
    wrongTrues: match === true && !fullOverlap,
    wrongFalses: match === false && fullOverlap,
  };
};
/**
 * Grades `stringSimilarity.compareTwoStrings` on a single name pair.
 *
 * The pair is predicted to be a match when the returned score is strictly
 * above 0.8.
 *
 * @param {string} pt - first name of the pair
 * @param {string} pl - second name of the pair
 * @param {boolean} match - expected verdict for the pair
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   1/0 hit counter plus flags for a rejected expected match (`wrongTrues`)
 *   and an accepted expected non-match (`wrongFalses`).
 */
const stringSimilarityResGet = (pt, pl, match) => {
  const minScore = 0.8;
  const accepted = stringSimilarity.compareTwoStrings(pt, pl) > minScore;

  return {
    hits: Number(accepted === match),
    wrongTrues: match === true && !accepted,
    wrongFalses: match === false && accepted,
  };
};
/**
 * Grades the `dice` metric on a single name pair.
 *
 * The pair is predicted to be a match when `dice(pt, pl)` is strictly
 * above 0.8.
 *
 * @param {string} pt - first name of the pair
 * @param {string} pl - second name of the pair
 * @param {boolean} match - expected verdict for the pair
 * @returns {{hits: number, wrongTrues: boolean, wrongFalses: boolean}}
 *   1/0 hit counter plus flags for a rejected expected match (`wrongTrues`)
 *   and an accepted expected non-match (`wrongFalses`).
 */
const diceResGet = (pt, pl, match) => {
  const minScore = 0.8;
  const coefficient = dice(pt, pl);
  const accepted = coefficient > minScore;

  const hits = accepted === match ? 1 : 0;
  const wrongTrues = match === true && !accepted;
  const wrongFalses = match === false && accepted;
  return { hits, wrongTrues, wrongFalses };
};
/**
 * Adds objects together key by key:
 * { a: 1, b: 2 } + { a: 2, b: 3 } = { a: 3, b: 5 }.
 *
 * The `name` key is not summed; the value from the last object that has it
 * is copied over. Values are combined with `+`, so booleans count as 0/1.
 *
 * Only own enumerable keys are read (via `Object.entries`), so objects
 * created with `Object.create(null)` are accepted and inherited keys are
 * ignored. `null`/`undefined` arguments are skipped.
 *
 * @param {...Object} objs - objects to combine, left to right
 * @returns {Object} a new object holding the per-key totals; the inputs are
 *   not modified
 */
const sumObjectsByKey = (...objs) => {
  const totals = {};
  for (const obj of objs) {
    if (obj == null) {
      continue;
    }
    for (const [key, value] of Object.entries(obj)) {
      totals[key] = key === "name" ? value : (totals[key] || 0) + value;
    }
  }
  return totals;
};
/**
 * Runs every matcher over `data` and reports how well each one agrees with
 * the labelled `match` values.
 *
 * Each summary line contains:
 *  - match rate: share of pairs whose prediction equals `match`, as a
 *    percentage with two decimals;
 *  - falsePositives: pairs labelled `match: false` that the matcher accepted
 *    (the graders' `wrongFalses`);
 *  - falseNegatives: pairs labelled `match: true` that the matcher rejected
 *    (the graders' `wrongTrues`).
 *
 * @returns {string[]} one summary line per algorithm, in the order listed in
 *   `algorithms`
 */
const compare = () => {
  // Display names are reproduced exactly, including their trailing space.
  const algorithms = [
    { name: "Bag ", grade: bagResGet },
    { name: "Damerau Levenshtein ", grade: damerauLevenshteinResGet },
    { name: "Jaro-Winkler ", grade: jaroWinkletResGet },
    { name: "LCS ", grade: lcsResGet },
    { name: "Mlipns ", grade: mlipnsResGet },
    { name: "MongeElkan ", grade: mongeElkanResGet },
    { name: "MRA ", grade: mraResGet },
    { name: "Overlap ", grade: overlapResGet },
    { name: "String-Similarity ", grade: stringSimilarityResGet },
    { name: "Dice ", grade: diceResGet },
  ];

  return algorithms.map(({ name, grade }) => {
    const totals = data.reduce(
      (acc, { pt, pl, match }) => sumObjectsByKey(acc, grade(pt, pl, match)),
      { name, hits: 0, wrongTrues: 0, wrongFalses: 0 }
    );
    const matchRate = ((totals.hits / data.length) * 100).toFixed(2);
    // wrongFalses = accepted a non-match (false positive);
    // wrongTrues = rejected a real match (false negative).
    return `${totals.name} match rate ${matchRate}%, falsePositives: ${totals.wrongFalses} falseNegatives: ${totals.wrongTrues} `;
  });
};

console.log(compare());
// Results recorded by the original author, restated in the corrected format
// (rate as a percentage, false positives/negatives un-swapped):
// "Bag match rate 71.95%, falsePositives: 0 falseNegatives: 23 ",
// "Damerau Levenshtein match rate 84.15%, falsePositives: 0 falseNegatives: 13 ",
// "Jaro-Winkler match rate 98.78%, falsePositives: 0 falseNegatives: 1 ",
// "LCS match rate 91.46%, falsePositives: 4 falseNegatives: 3 ",
// "Mlipns match rate 8.54%, falsePositives: 0 falseNegatives: 75 ",
// "MongeElkan match rate 97.56%, falsePositives: 2 falseNegatives: 0 ",
// "MRA match rate 97.56%, falsePositives: 2 falseNegatives: 0 ",
// "Overlap match rate 98.78%, falsePositives: 0 falseNegatives: 1 ",
// "String-Similarity match rate 76.83%, falsePositives: 0 falseNegatives: 19 ",
// "Dice match rate 84.15%, falsePositives: 0 falseNegatives: 13 ";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment