Skip to content

Instantly share code, notes, and snippets.

@scpike
Last active November 23, 2015 03:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save scpike/1d21ec18b2baecad5689 to your computer and use it in GitHub Desktop.
Save scpike/1d21ec18b2baecad5689 to your computer and use it in GitHub Desktop.
deduplicate some names
// http://stackoverflow.com/questions/18233874/get-all-the-combinations-of-n-elements-of-multidimensional-array
/**
 * Returns every n-element combination of `arr`, preserving element order.
 *
 * Combinations are emitted in index order: for [a, b, c] and n = 2 the result
 * is [[a, b], [a, c], [b, c]]. The input array is not modified.
 *
 * @param {Array} arr - Source elements.
 * @param {number} n - Number of elements per combination.
 * @returns {Array<Array>} All combinations; empty when n < 1 or n > arr.length.
 */
function getCombinations(arr, n) {
  // Nothing to pick, or not enough elements to fill a combination.
  if (n < 1 || n > arr.length) {
    return [];
  }
  if (n === 1) {
    return arr.map((item) => [item]);
  }

  const combinations = [];
  // Each element that still has at least n - 1 successors may lead a combination.
  // Indexing (rather than shifting elements off `arr`) keeps the loop bound
  // stable and leaves the caller's array untouched.
  for (let i = 0; i <= arr.length - n; i++) {
    const tails = getCombinations(arr.slice(i + 1), n - 1);
    for (const tail of tails) {
      combinations.push([arr[i], ...tail]);
    }
  }
  return combinations;
}
// https://gist.github.com/inactivist/7614182
/**
 * Collects every contiguous window of `size` items from `collection` into a Set.
 *
 * Windows are produced with `collection.slice`, so for a string input they are
 * substrings and repeated substrings collapse to a single Set entry.
 *
 * @param {string|Array} collection - Sliceable sequence to window over.
 * @param {number} size - Window length.
 * @returns {Set} The windows, in first-seen order; empty if the input is shorter than `size`.
 */
function shingle(collection, size) {
  const windows = new Set();
  const limit = collection.length - size + 1;
  let start = 0;
  while (start < limit) {
    windows.add(collection.slice(start, start + size));
    start += 1;
  }
  return windows;
}
/**
 * Jaccard similarity of two sequences, measured over their size-2 shingles
 * (for strings: the sets of 2-character substrings).
 *
 * @param {string} s1 - First sequence.
 * @param {string} s2 - Second sequence.
 * @returns {number} |intersection| / |union| of the two shingle sets, in [0, 1].
 *   Returns 0 when neither input yields a shingle, instead of NaN from 0 / 0.
 */
function jaccard_coefficient(s1, s2) {
  // Declared locally: assigning to undeclared names would leak globals and
  // throws a ReferenceError in strict mode / ES modules.
  const firstShingles = shingle(s1, 2);
  const secondShingles = shingle(s2, 2);
  const union = new Set([...firstShingles, ...secondShingles]);
  if (union.size === 0) {
    return 0;
  }

  let sharedCount = 0;
  for (const piece of firstShingles) {
    if (secondShingles.has(piece)) {
      sharedCount += 1;
    }
  }
  return sharedCount / union.size;
}
/**
 * Lowercases `string` and replaces a fixed set of accented Latin letters and
 * ligatures with unaccented ASCII equivalents (e.g. "é" -> "e", "œ" -> "oe").
 *
 * @param {string} string - Text to fold.
 * @returns {string} Lowercased text with the listed characters replaced.
 */
function strip_special_characters(string) {
  // Applied in order; each pattern is global so every occurrence is replaced.
  const substitutions = [
    [/[àáâãäå]/g, "a"],
    [/æ/g, "ae"],
    [/ç/g, "c"],
    [/[èéêë]/g, "e"],
    [/[ìíîï]/g, "i"],
    [/ñ/g, "n"],
    [/[òóôõö]/g, "o"],
    [/œ/g, "oe"],
    [/[ùúûü]/g, "u"],
    [/[ýÿ]/g, "y"],
  ];
  return substitutions.reduce(
    (text, [accented, plain]) => text.replace(accented, plain),
    string.toLowerCase()
  );
}
/**
 * Replaces every "." and "&" with a space and deletes every double quote.
 *
 * @param {string} s - Text to clean.
 * @returns {string} Text with those punctuation characters removed.
 */
function remove_punct(s) {
  // Global regexes are required: replace() with a string pattern only
  // touches the first occurrence.
  return s.replace(/[.&]/g, " ").replace(/"/g, "");
}
/**
 * Deletes every whole-word occurrence of a fixed list of company and winery
 * filler tokens (llc, inc, ltd, co, winery, the, ...).
 *
 * Matching is case-sensitive against lowercase tokens; surrounding whitespace
 * is left in place.
 *
 * @param {string} s - Text to strip tokens from.
 * @returns {string} Text with the tokens removed.
 */
function remove_tokens(s) {
  // The `g` flag must be on the regex itself; a third "flags" argument to
  // String.prototype.replace is non-standard and ignored by current engines.
  return s.replace(
    /\b(llc|inc|ltd|pte|intl|gmbh|corp|corporation|company|co|sa|sl|winery|wines|bodega|slu|vineyard|winework|cellar|the)\b/g,
    ''
  );
}
/**
 * Produces a comparison key for a name: lowercases it, then passes it through
 * remove_punct, strip_special_characters and remove_tokens, and finally
 * collapses every run of whitespace to a single space and trims the ends.
 *
 * @param {string} s - Raw name.
 * @returns {string} Normalized name.
 */
function normalize(s) {
  const cleaned = remove_tokens(strip_special_characters(remove_punct(s.toLowerCase())));
  // Global flag so all whitespace runs are collapsed, not just the first one.
  return cleaned.replace(/\s+/g, ' ').trim();
}
/**
 * Computes the blocking key for a name: its first two ASCII alphanumeric
 * characters, lowercased. Names sharing a key are compared against each other.
 *
 * @param {string} s - Raw name.
 * @returns {string} Key of up to two characters from [a-z0-9].
 */
function block(s) {
  // Strip ALL non-alphanumerics before taking the prefix. The character class
  // is spelled out because the range A-z also admits [ \ ] ^ _ and `.
  return s.replace(/[^A-Za-z0-9]/g, '').substring(0, 2).toLowerCase();
}
/**
 * Extracts the digits of a string, in order, with everything else removed.
 *
 * @param {string} s - Text to scan.
 * @returns {string} Concatenated digits ("" if there are none).
 */
function numeric_part(s) {
  // The `g` flag must be on the regex; a third "flags" argument to replace()
  // is ignored, which would strip only the first non-digit.
  return s.replace(/[^0-9]/g, '');
}
/**
 * Finds likely duplicate names in a newline-separated list.
 *
 * Names are bucketed by block(); within each bucket every pair whose
 * numeric_part() values are identical is scored with jaccard_coefficient()
 * on the normalize()d names, and pairs scoring above 0.5 are kept.
 *
 * @param {string} x - Newline-separated names.
 * @returns {string} One "score,name1,name2" line per kept pair, ordered by
 *   the reverse of the default (string) sort of those rows.
 */
(function(x) {
  const buckets = {};
  const matches = [];

  // Group names by blocking key so only names in the same bucket are compared.
  for (const name of x.trim().split("\n")) {
    const bucketKey = block(name);
    buckets[bucketKey] = buckets[bucketKey] || [];
    buckets[bucketKey].push(name);
  }

  Object.keys(buckets).forEach((bucketKey) => {
    const candidates = getCombinations(buckets[bucketKey], 2);
    for (const pair of candidates) {
      const first = pair[0];
      const second = pair[1];
      // Names whose digits differ are never treated as duplicates.
      if (numeric_part(first) !== numeric_part(second)) {
        continue;
      }
      const score = jaccard_coefficient(normalize(first), normalize(second));
      if (score > 0.5) {
        matches.push([score, first, second]);
      }
    }
  });

  matches.sort();
  matches.reverse();
  return matches.join("\n")
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment