Skip to content

Instantly share code, notes, and snippets.

@dariusk
Last active August 29, 2015 14:13
Show Gist options
  • Save dariusk/3c395f28aaab4181716b to your computer and use it in GitHub Desktop.
Save dariusk/3c395f28aaab4181716b to your computer and use it in GitHub Desktop.
var _ = require('underscore');
var fs = require('fs');
var pos = require('pos');
/**
 * Build a space-separated string from the second element of every entry
 * the `pos` tagger produces for a sentence.
 *
 * The sentence is first tokenised with `pos.Lexer`, the tokens are handed to
 * `pos.Tagger`, and index 1 of each tagged entry is kept (presumably the
 * part-of-speech tag, with the word at index 0 -- confirm against the `pos`
 * package documentation).
 *
 * @param {string} sentence - Text to lex and tag.
 * @returns {string} The collected entries joined with single spaces.
 */
function getPos(sentence) {
  var tokens = new pos.Lexer().lex(sentence);
  var taggedPairs = new pos.Tagger().tag(tokens);

  return _.chain(taggedPairs)
    .map(function(pair) {
      return pair[1];
    })
    .flatten()
    .value()
    .join(' ');
}
// Read the corpus, split it into rough sentences, keep only short sentences
// that contain no quotes or personal pronouns, then keep only those whose
// part-of-speech string contains a noun tag followed by a VBZ/VBP verb tag.
// Prints the counts before and after filtering plus a random sample of ten.
fs.readFile('corpus.txt', 'utf8', function(error, data) {
  // Surface I/O failures (missing/unreadable corpus) as the real error rather
  // than a confusing TypeError on `data` further down.
  if (error) {
    throw error;
  }

  // Matches any of the pronouns we want to exclude, as whole words,
  // case-insensitively.
  var pronounPattern = /\b(?:my|me|he|she|you|his|her)\b/i;

  // Break our text file up into individual sentences (approximate).
  // `match` returns null when nothing matches (e.g. an empty corpus), so
  // fall back to an empty list.
  var sentences = data.match( /[^\.!\?]+[\.!\?]+/g ) || [];
  console.log('Number of sentences:', sentences.length);

  // Filter our list of sentences so we only keep ones
  // matching certain criteria
  sentences = _.filter(sentences, function(el) {
    var text = el.trim();
    // `match` returns null when the sentence has no apostrophe at all, which
    // is the common case; treat that as zero occurrences instead of crashing.
    var apostrophes = text.match(/'/g) || [];

    return text.length < 50
      && text.length > 20
      && text.indexOf('"') === -1
      && apostrophes.length < 2
      && text.indexOf('I ') === -1
      && !pronounPattern.test(text);
  });

  // Now we filter by part of speech: keep sentences whose tag string has a
  // tag starting with NN immediately followed by VBZ or VBP.
  sentences = _.filter(sentences, function(el) {
    return !_.isNull(getPos(el).match(/NN.? VB[ZP]/));
  });
  console.log('Number after filter:', sentences.length);

  // print a sampling of ten results
  console.log(_.sample(sentences, 10));

  // Print the tag string for one fixed example sentence.
  console.log(getPos(' Malachi Mulligan is coming too.'));
});
@dariusk
Copy link
Author

dariusk commented Jan 20, 2015

(also, weird that gists don't seem to do @-replies?)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment