Skip to content

Instantly share code, notes, and snippets.

@matthewr6
Created February 4, 2017 21:45
Show Gist options
  • Save matthewr6/56300fdb1d4c17a1e811d1eed8213732 to your computer and use it in GitHub Desktop.
Save matthewr6/56300fdb1d4c17a1e811d1eed8213732 to your computer and use it in GitHub Desktop.
markov chain data generator I made a while ago
const fs = require('fs');
/**
 * Generate a space-separated word sequence by walking a Markov model.
 *
 * Starting from `initial`, the next word is repeatedly drawn from
 * `data[previousWord].probabilities` using a cumulative-probability roll.
 * A roll that lands on the previous word itself is rejected (and re-rolled)
 * unless the cumulative probability has reached 1, which discourages
 * immediate repetition when alternatives exist.
 *
 * Generation stops when the end-of-post marker (the key 'null') is drawn,
 * when `maxWords` words have been produced, or when the current word is a
 * dead end (no entry in `data`, or no transition that could ever be chosen).
 *
 * @param {Object} data - word -> { probabilities: { nextWord: number } },
 *   e.g. the value returned by learnFromPosts.
 * @param {string} initial - word to start the sequence with.
 * @param {number} [maxWords=25] - upper bound on the number of words returned.
 * @returns {string|undefined} the generated words joined by single spaces,
 *   or undefined when `initial` has no entry in `data`.
 */
function simulate(data, initial, maxWords = 25) {
  // JSON cannot store a null key, so the end-of-post marker is the string 'null'.
  const END_MARKER = 'null';

  // Own-property checks only: a plain `data[word]` lookup would also find
  // inherited members such as `constructor` or `toString`.
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

  // Transition table for `word`, or null when the model has none.
  const probabilitiesFor = word => {
    if (!hasOwn(data, word) || !data[word]) {
      return null;
    }
    return data[word].probabilities || null;
  };

  // First word whose cumulative probability exceeds `roll`, skipping an
  // immediate repeat of `prevWord` unless the cumulative total has reached 1.
  // Returns undefined when the roll selects nothing.
  const pick = (probs, prevWord, roll) => {
    let cumulative = 0;
    for (const word of Object.keys(probs)) {
      cumulative += probs[word];
      if (cumulative > roll && (word !== prevWord || cumulative >= 1)) {
        return word;
      }
    }
    return undefined;
  };

  if (!hasOwn(data, initial) || !data[initial]) {
    return undefined;
  }

  const generatedWords = [initial];
  while (generatedWords.length < maxWords) {
    const prevWord = generatedWords[generatedWords.length - 1];
    const probs = probabilitiesFor(prevWord);
    if (!probs) {
      // The model knows nothing about this word; treat it as the end.
      break;
    }

    const nextWord = pick(probs, prevWord, Math.random());
    if (nextWord === undefined) {
      // A roll of 0 is the most permissive one possible: if even that selects
      // nothing, no roll ever will, so stop instead of re-rolling forever.
      if (pick(probs, prevWord, 0) === undefined) {
        break;
      }
      continue;
    }
    if (nextWord === END_MARKER) {
      break;
    }
    generatedWords.push(nextWord);
  }
  return generatedWords.join(' ');
}
/**
 * Build a first-order Markov model of word transitions from a list of posts.
 *
 * Each post's text is reduced to ASCII letters, split on whitespace, and every
 * word is recorded together with the word that follows it. The end of a post
 * is recorded as a transition to the key 'null' (object keys are strings, so
 * a literal word "null" in a post is indistinguishable from that marker).
 *
 * @param {Array<{content: string}|string>} posts - posts to learn from; each
 *   item is either an object with a string `content` field or a plain string.
 * @returns {Object} map of word -> {
 *     frequencies:   { nextWord: count },
 *     total:         number of times the word occurred,
 *     probabilities: { nextWord: count / total }
 *   }
 */
function learnFromPosts(posts) {
  // Key used for "this word ended the post".
  const END_MARKER = 'null';

  // Own-property checks only, so words such as "constructor" or "toString"
  // are not mistaken for existing entries inherited from Object.prototype.
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

  const words = {};

  posts.forEach(post => {
    const content = typeof post === 'string' ? post : post.content;

    // Strip everything except letters and whitespace, then split on runs of
    // whitespace. Keeping whitespace until the split stops words separated
    // only by a newline or tab from being fused together; the filter drops
    // the empty tokens produced by leading/trailing whitespace or empty posts.
    const postWords = content
      .replace(/[^a-zA-Z\s]/g, '')
      .split(/\s+/)
      .filter(token => token !== '');

    postWords.forEach((word, index) => {
      if (!hasOwn(words, word)) {
        words[word] = {
          frequencies: {},
          total: 0,
          probabilities: {}
        };
      }
      const entry = words[word];
      entry.total++;

      const isLast = index + 1 >= postWords.length;
      const follower = isLast ? END_MARKER : postWords[index + 1];

      // Count every transition, including end-of-post, so that the
      // frequencies add up to `total` and the probabilities add up to 1.
      entry.frequencies[follower] = hasOwn(entry.frequencies, follower)
        ? entry.frequencies[follower] + 1
        : 1;
    });
  });

  Object.keys(words).forEach(word => {
    const entry = words[word];
    Object.keys(entry.frequencies).forEach(nextWord => {
      entry.probabilities[nextWord] = entry.frequencies[nextWord] / entry.total;
    });
  });

  return words;
}
@an-OK-squirrel
Copy link

🤔

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment