Skip to content

Instantly share code, notes, and snippets.

@andrewroberts
Created September 8, 2020 09:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andrewroberts/4e848b9670137a2142361e21df7a41d1 to your computer and use it in GitHub Desktop.
Save andrewroberts/4e848b9670137a2142361e21df7a41d1 to your computer and use it in GitHub Desktop.
FuzzySet (https://glench.github.io/fuzzyset.js/) with a bit of re-packaging for use with Google Apps Script
/**
* @param {string} value - value to test
* @param {array} valuesToTest
*
* @return the best match in the test array
*/
function getBestMatch(value, valuesToTest) {
var results = {}
var bestScoreoFar = 0
var bestMatch = ''
valuesToTest.forEach(function(nextValueToTest) {
var nextScore = getMatchValue(value, nextValueToTest)
if (nextScore > bestScoreoFar) {
bestScoreoFar = nextScore
bestMatch = nextValueToTest
}
})
Logger.log('Best match: ' + bestMatch + ' (' + bestScoreoFar + ')')
return bestMatch
}
/**
* Get match percentage
*
* @param {String} firstString
* @param {String} secondString
*
* @return {Number} match as value from 0 to 1
*/
function getMatchValue(firstString, secondString) {
var fuzzy = FuzzySet([firstString]);
var match = fuzzy.get(secondString);
if (match !== null) {
return match[0][0];
} else {
return null;
}
}
// https://glench.github.io/fuzzyset.js/
(function() {
var FuzzySet = function(arr, useLevenshtein, gramSizeLower, gramSizeUpper) {
var fuzzyset = {
};
// default options
arr = arr || [];
fuzzyset.gramSizeLower = gramSizeLower || 2;
fuzzyset.gramSizeUpper = gramSizeUpper || 3;
fuzzyset.useLevenshtein = (typeof useLevenshtein !== 'boolean') ? true : useLevenshtein;
// define all the object functions and attributes
fuzzyset.exactSet = {};
fuzzyset.matchDict = {};
fuzzyset.items = {};
// helper functions
var levenshtein = function(str1, str2) {
var current = [], prev, value;
for (var i = 0; i <= str2.length; i++)
for (var j = 0; j <= str1.length; j++) {
if (i && j)
if (str1.charAt(j - 1) === str2.charAt(i - 1))
value = prev;
else
value = Math.min(current[j], current[j - 1], prev) + 1;
else
value = i + j;
prev = current[j];
current[j] = value;
}
return current.pop();
};
// return an edit distance from 0 to 1
var _distance = function(str1, str2) {
if (str1 === null && str2 === null) throw 'Trying to compare two null values';
if (str1 === null || str2 === null) return 0;
str1 = String(str1); str2 = String(str2);
var distance = levenshtein(str1, str2);
if (str1.length > str2.length) {
return 1 - distance / str1.length;
} else {
return 1 - distance / str2.length;
}
};
var _nonWordRe = /[^a-zA-Z0-9\u00C0-\u00FF, ]+/g;
var _iterateGrams = function(value, gramSize) {
gramSize = gramSize || 2;
var simplified = '-' + value.toLowerCase().replace(_nonWordRe, '') + '-',
lenDiff = gramSize - simplified.length,
results = [];
if (lenDiff > 0) {
for (var i = 0; i < lenDiff; ++i) {
simplified += '-';
}
}
for (var i = 0; i < simplified.length - gramSize + 1; ++i) {
results.push(simplified.slice(i, i + gramSize));
}
return results;
};
var _gramCounter = function(value, gramSize) {
// return an object where key=gram, value=number of occurrences
gramSize = gramSize || 2;
var result = {},
grams = _iterateGrams(value, gramSize),
i = 0;
for (i; i < grams.length; ++i) {
if (grams[i] in result) {
result[grams[i]] += 1;
} else {
result[grams[i]] = 1;
}
}
return result;
};
// the main functions
fuzzyset.get = function(value, defaultValue, minMatchScore) {
// check for value in set, returning defaultValue or null if none found
if (minMatchScore === undefined) {
minMatchScore = .33
}
var result = this._get(value, minMatchScore);
if (!result && typeof defaultValue !== 'undefined') {
return defaultValue;
}
return result;
};
fuzzyset._get = function(value, minMatchScore) {
var normalizedValue = this._normalizeStr(value),
result = this.exactSet[normalizedValue];
if (result) {
return [[1, result]];
}
var results = [];
// start with high gram size and if there are no results, go to lower gram sizes
for (var gramSize = this.gramSizeUpper; gramSize >= this.gramSizeLower; --gramSize) {
results = this.__get(value, gramSize, minMatchScore);
if (results && results.length > 0) {
return results;
}
}
return null;
};
fuzzyset.__get = function(value, gramSize, minMatchScore) {
var normalizedValue = this._normalizeStr(value),
matches = {},
gramCounts = _gramCounter(normalizedValue, gramSize),
items = this.items[gramSize],
sumOfSquareGramCounts = 0,
gram,
gramCount,
i,
index,
otherGramCount;
for (gram in gramCounts) {
gramCount = gramCounts[gram];
sumOfSquareGramCounts += Math.pow(gramCount, 2);
if (gram in this.matchDict) {
for (i = 0; i < this.matchDict[gram].length; ++i) {
index = this.matchDict[gram][i][0];
otherGramCount = this.matchDict[gram][i][1];
if (index in matches) {
matches[index] += gramCount * otherGramCount;
} else {
matches[index] = gramCount * otherGramCount;
}
}
}
}
function isEmptyObject(obj) {
for(var prop in obj) {
if(obj.hasOwnProperty(prop))
return false;
}
return true;
}
if (isEmptyObject(matches)) {
return null;
}
var vectorNormal = Math.sqrt(sumOfSquareGramCounts),
results = [],
matchScore;
// build a results list of [score, str]
for (var matchIndex in matches) {
matchScore = matches[matchIndex];
results.push([matchScore / (vectorNormal * items[matchIndex][0]), items[matchIndex][1]]);
}
var sortDescending = function(a, b) {
if (a[0] < b[0]) {
return 1;
} else if (a[0] > b[0]) {
return -1;
} else {
return 0;
}
};
results.sort(sortDescending);
if (this.useLevenshtein) {
var newResults = [],
endIndex = Math.min(50, results.length);
// truncate somewhat arbitrarily to 50
for (var i = 0; i < endIndex; ++i) {
newResults.push([_distance(results[i][1], normalizedValue), results[i][1]]);
}
results = newResults;
results.sort(sortDescending);
}
var newResults = [];
results.forEach(function(scoreWordPair) {
if (scoreWordPair[0] >= minMatchScore) {
newResults.push([scoreWordPair[0], this.exactSet[scoreWordPair[1]]]);
}
}.bind(this))
return newResults;
};
fuzzyset.add = function(value) {
var normalizedValue = this._normalizeStr(value);
if (normalizedValue in this.exactSet) {
return false;
}
var i = this.gramSizeLower;
for (i; i < this.gramSizeUpper + 1; ++i) {
this._add(value, i);
}
};
fuzzyset._add = function(value, gramSize) {
var normalizedValue = this._normalizeStr(value),
items = this.items[gramSize] || [],
index = items.length;
items.push(0);
var gramCounts = _gramCounter(normalizedValue, gramSize),
sumOfSquareGramCounts = 0,
gram, gramCount;
for (gram in gramCounts) {
gramCount = gramCounts[gram];
sumOfSquareGramCounts += Math.pow(gramCount, 2);
if (gram in this.matchDict) {
this.matchDict[gram].push([index, gramCount]);
} else {
this.matchDict[gram] = [[index, gramCount]];
}
}
var vectorNormal = Math.sqrt(sumOfSquareGramCounts);
items[index] = [vectorNormal, normalizedValue];
this.items[gramSize] = items;
this.exactSet[normalizedValue] = value;
};
fuzzyset._normalizeStr = function(str) {
if (Object.prototype.toString.call(str) !== '[object String]') throw 'Must use a string as argument to FuzzySet functions';
return str.toLowerCase();
};
// return length of items in set
fuzzyset.length = function() {
var count = 0,
prop;
for (prop in this.exactSet) {
if (this.exactSet.hasOwnProperty(prop)) {
count += 1;
}
}
return count;
};
// return is set is empty
fuzzyset.isEmpty = function() {
for (var prop in this.exactSet) {
if (this.exactSet.hasOwnProperty(prop)) {
return false;
}
}
return true;
};
// return list of values loaded into set
fuzzyset.values = function() {
var values = [],
prop;
for (prop in this.exactSet) {
if (this.exactSet.hasOwnProperty(prop)) {
values.push(this.exactSet[prop]);
}
}
return values;
};
// initialization
var i = fuzzyset.gramSizeLower;
for (i; i < fuzzyset.gramSizeUpper + 1; ++i) {
fuzzyset.items[i] = [];
}
// add all the items to the set
for (i = 0; i < arr.length; ++i) {
fuzzyset.add(arr[i]);
}
return fuzzyset;
};
var root = this;
// Export the fuzzyset object for **CommonJS**, with backwards-compatibility
// for the old `require()` API. If we're not in CommonJS, add `_` to the
// global object.
if (typeof module !== 'undefined' && module.exports) {
module.exports = FuzzySet;
if(root)
{
root.FuzzySet = FuzzySet;
}
} else {
root.FuzzySet = FuzzySet;
}
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment