Skip to content

Instantly share code, notes, and snippets.

@gustavonovaes
Created May 1, 2023 15:37
Show Gist options
  • Save gustavonovaes/7bd983bf1b426ee827d7bbcf06b10585 to your computer and use it in GitHub Desktop.
Save gustavonovaes/7bd983bf1b426ee827d7bbcf06b10585 to your computer and use it in GitHub Desktop.
Levenshtein Distance vs Cosine Similarity
const data = [
["The quick brown fox jumps", ""],
["", "The quick brown box jumps"],
["The quick brown fox jumps", " "],
[" ", "The quick brown box jumps"],
["The quick brown fox jumps", " "],
[" ", "The quick brown box jumps"],
["The quick brown fox jumps", "The quick brown box jumps"],
["The quick brown fox jumps", "The fast brown dog jumps"],
["The red car drives fast", "The blue car drives slow"],
["The old man walks slowly", "The young boy walks quickly"],
["The tall tree sways gently", "The short bush sways wildly"],
["The sweet apple tastes good", "The sour lemon tastes bad"],
["The loud music plays softly", "The quiet music plays loudly"],
["The loud music plays softly", "The quiet music plays loudly The quiet music plays loudly The quiet music plays loudly"],
["The loud music plays softly The loud music plays softly The loud music plays softly", "The quiet music plays loudly"],
]
/**
* @param {string} str1
* @param {string} str2
* @returns {number} similarity
* @see https://en.wikipedia.org/wiki/Levenshtein_distance
*/
function levenshteinDistance(str1, str2) {
const len1 = str1.length;
const len2 = str2.length;
const matrix = Array(len1 + 1).fill().map(() => Array(len2 + 1).fill(0));
for (let i = 0; i <= len1; i++) {
matrix[i][0] = i;
}
for (let j = 0; j <= len2; j++) {
matrix[0][j] = j;
}
for (let i = 1; i <= len1; i++) {
for (let j = 1; j <= len2; j++) {
if (str1[i - 1] === str2[j - 1]) {
matrix[i][j] = matrix[i - 1][j - 1];
} else {
matrix[i][j] = Math.min(
matrix[i - 1][j] + 1, // deletion
matrix[i][j - 1] + 1, // insertion
matrix[i - 1][j - 1] + 1 // substitution
);
}
}
}
return 1 - (matrix[len1][len2] / Math.max(len1, len2));
}
/**
* @param {string} str1
* @param {string} str2
* @returns {number} similarity
* @see https://en.wikipedia.org/wiki/Cosine_similarity
* @see https://stackoverflow.com/questions/17388213/find-the-similarity-percent-between-two-strings
*/
function cosineSimilarity(str1, str2) {
const wordSet = new Set([...str1.split(' '), ...str2.split(' ')]);
const vec1 = Array.from(wordSet).map(word => {
const count = (str1.match(new RegExp(word, 'g')) || []).length;
return count;
});
const vec2 = Array.from(wordSet).map(word => {
const count = (str2.match(new RegExp(word, 'g')) || []).length;
return count;
});
const dotProduct = vec1.reduce((acc, val, i) => acc + val * vec2[i], 0);
const vec1Magnitude = Math.sqrt(vec1.reduce((acc, val) => acc + val * val, 0));
const vec2Magnitude = Math.sqrt(vec2.reduce((acc, val) => acc + val * val, 0));
const similarity = dotProduct / (vec1Magnitude * vec2Magnitude);
return similarity;
}
data.forEach(([str1, str2]) => {
console.log(`String 1: ${str1}`);
console.log(`String 2: ${str2}`);
console.log(`String similarity: ${levenshteinDistance(str1, str2).toFixed(1)}`);
console.log(`Cosine similarity: ${cosineSimilarity(str1, str2).toFixed(1)}`);
console.log();
});
for (let i = 0; i < 10e2; i++) {
data.forEach(([str1, str2]) => {
levenshteinDistance(str1, str2);
cosineSimilarity(str1, str2);
});
}
console.time('levenshteinDistance');
for (let i = 0; i < 10e4; i++) {
data.forEach(([str1, str2]) => levenshteinDistance(str1, str2));
}
console.timeEnd('levenshteinDistance');
console.time('cosineSimilarity');
for (let i = 0; i < 10e4; i++) {
data.forEach(([str1, str2]) => cosineSimilarity(str1, str2));
}
console.timeEnd('cosineSimilarity');
// levenshteinDistance: 11.460s
// cosineSimilarity: 6.029s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment