Skip to content

Instantly share code, notes, and snippets.

@jacksonhoose
Last active August 29, 2015 14:12
Show Gist options
  • Save jacksonhoose/3a763a5b2ac2dbab824c to your computer and use it in GitHub Desktop.
Save jacksonhoose/3a763a5b2ac2dbab824c to your computer and use it in GitHub Desktop.
documentDistance
// Original version by Erik D. Demaine on January 31, 2011,
// based on code by Ronald L. Rivest (see docdist[1-7].py).
// Usage:
// node documentDistance.js file1.txt file2.txt
// # This program computes the "distance" between two text files
// # as the angle between their word frequency vectors (in radians).
// #
// # For each input file, a word-frequency vector is computed as follows:
// # (1) the specified file is read in
// # (2) it is converted into a list of alphanumeric "words"
// # Here a "word" is a sequence of consecutive alphanumeric
// # characters. Non-alphanumeric characters are treated as blanks.
// # Case is not significant.
// # (3) for each word, its frequency of occurrence is determined
// # (4) the word/frequency lists are sorted into order alphabetically
// #
// # The "distance" between two vectors is the angle between them.
// # If x = (x1, x2, ..., xn) is the first vector (xi = freq of word i)
// # and y = (y1, y2, ..., yn) is the second vector,
// # then the angle between them is defined as:
// # d(x,y) = arccos(inner_product(x,y) / (norm(x)*norm(y)))
// # where:
// # inner_product(x,y) = x1*y1 + x2*y2 + ... xn*yn
// # norm(x) = sqrt(inner_product(x,x))
var readFileSync = require('fs').readFileSync;
var files = process.argv.slice(2);
function documentDistance (files) {
var file1 = wordFrequenciesForFile(files[0]);
var file2 = wordFrequenciesForFile(files[1]);
return vectorAngle(file1, file2);
}
function readFile (file) {
return readFileSync(file, 'utf8');
}
function getWordsFromDocument (doc) {
return doc.replace(/[^a-zA-Z\d\s:]/g, '').replace(/\n\n/g, ' ');
}
function getWordFrequency (words) {
return words.split(' ').reduce(function(hash, word){
hash[word] = hash[word] ? hash[word] + 1 : 1;
return hash;
}, {});
}
function getInnerProduct (file1, file2) {
var sum = 0.0;
for(var key in file1){
if(file2[key]) {
sum += file1[key] * file2[key];
}
}
return sum;
}
function vectorAngle (file1, file2) {
var numerator = getInnerProduct(file1, file2);
var denominator = Math.sqrt(getInnerProduct(file1, file1) * getInnerProduct(file2, file2));
return Math.acos(numerator/denominator);
}
function wordFrequenciesForFile(file) {
file = readFile(file);
file = getWordsFromDocument(file);
return getWordFrequency(file);
}
console.log(documentDistance(files));
module.exports = documentDistance;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment