Skip to content

Instantly share code, notes, and snippets.

@oiva
Last active August 29, 2015 14:23
Show Gist options
  • Save oiva/14dd1324931d912c3853 to your computer and use it in GitHub Desktop.
Save oiva/14dd1324931d912c3853 to your computer and use it in GitHub Desktop.
Muhku
var fs = require('fs');
var _ = require('lodash');
var originalWords = [];
fs.readFile('alastalon_salissa.txt', 'utf8', function (err,data) {
if (err) {
return console.log(err);
}
process(data);
});
// takes text data and returns all words in an array
function parseWords(data) {
data = data.toLowerCase();
// remove anything not a letter or white space
data = data.replace(/[^a-zåäö\s]/gi, '');
var words = data.split(/\s+/)
// cheat / optimisation: minimum length for words,
// takes 1-2 seconds off the total time.
var minimumLength = 6;
words = words.filter(function(word) {return word && word.length > minimumLength;});
return words;
}
// takes a word and returns its unique letters sorted, for example:
// 'muhku' => 'hkmu'
function mapWordToLetters(word) {
var letters = word.split('');
letters = letters.sort();
letters = _.uniq(letters, true);
reducedWord = letters.join('');
// store original word to global variable
originalWords[reducedWord] = word;
return reducedWord;
}
// removes consecutive words that are each others' substrings. Fast.
// for example: ['abc', 'muhk', 'muhku', 'muhkua'] => ['abc', 'muhkua']
function filterSimilarWords(words) {
var filteredWords = [];
var previousWord = '';
var word;
for (var i = words.length - 1; i >= 0; i--) {
word = words[i]
if (previousWord.indexOf(word) === -1) {
filteredWords.push(word);
}
previousWord = word;
};
return filteredWords;
}
// removes words whose letters are found in other words. Preserves words that
// have only one less letter than a better word.
// ['abcd', 'abcdex', 'abcdefx'] => ['abcdex', 'abcdefx']
// Returns character arrays because we use those later on.
function filterSubsets(words) {
var filteredWords = [], a1, a2, length, discard;
letterArrays = _.map(words, function(word) {
return word.split('');
});
length = letterArrays.length;
for (var i = 0; i < length; i++) {
a1 = letterArrays[i];
discard = false;
for (var j = i + 1; j < length; j++) {
a2 = letterArrays[j];
// don't discard 'muhku' if comparing to 'muhkua'
if (a1.length >= a2.length -1) {
continue;
}
// a1 is subset of a2
if (a1.length === _.intersection(a1, a2).length) {
discard = true;
break;
}
};
if (!discard) {
filteredWords.push(a1);
}
};
return filteredWords;
}
// calculates muhkeus score by counting the unique letters in the two given words.
// input: character arrays
// ['a', 'b', 'c'], ['a', 'e'] => 4
function calculateScore(word1, word2) {
return _.union(word1, word2).length;
}
// go through given words and calculate score for each possible pair. return
// highest scoring pairs
function findPairs(words) {
var word1, word2, pair, pairs = [], bestPairs, score, maxScore = 0;
for (var i = words.length - 1; i >= 0; i--) {
word1 = words[i];
for (var j = i - 1; j >= 0; j--) {
word2 = words[j];
score = calculateScore(word1, word2);
if (score >= maxScore) {
// convert arrays back to string, find original word from global array
pairs.push([score, originalWords[word1.join('')], originalWords[word2.join('')]]);
maxScore = score;
}
};
};
bestPairs = _.filter(pairs, function(pair) {
return pair[0] == maxScore;
});
return bestPairs;
}
function process(data) {
var words = parseWords(data);
words = _.map(words, mapWordToLetters);
words = words.sort();
console.log('1: '+words.length);
words = _.uniq(words, true); // unique for sorted data
console.log('2: '+words.length);
words = filterSimilarWords(words);
console.log('3: '+words.length);
words = filterSubsets(words);
console.log('4: '+words.length);
pairs = findPairs(words);
//console.log(_.slice(words, 0, 100));
console.log('word count before calculating score '+words.length);
console.log(pairs.length + ' pairs ');
console.log(pairs);
}
{
"name": "Muhk",
"description": "Find the muhkiest word",
"version": "1.0.0",
"dependencies": {
"lodash": "^3.9.3"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment