Last active
August 29, 2015 14:13
-
-
Save dariusk/3c395f28aaab4181716b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var _ = require('underscore'); | |
var fs = require('fs'); | |
var pos = require('pos'); | |
/**
 * Reduce a sentence to a space-separated string of its part-of-speech tags.
 *
 * The sentence is tokenized with `pos.Lexer` and tagged with `pos.Tagger`;
 * only the element at index 1 of each tagged entry is kept.
 * NOTE(review): each entry is presumably a [word, tag] pair - confirm
 * against the `pos` package documentation.
 *
 * @param {string} sentence - Text to tag.
 * @returns {string} The kept tag values joined by single spaces.
 */
function getPos(sentence) {
  var tokens = new pos.Lexer().lex(sentence);
  var tagged = new pos.Tagger().tag(tokens);

  // Keep only the second slot of every tagged entry.
  var tags = [];
  for (var i = 0; i < tagged.length; i++) {
    tags.push(tagged[i][1]);
  }

  return _.flatten(tags).join(' ');
}
/**
 * Break text into individual sentences (approximate): each sentence is a run
 * of non-terminator characters followed by one or more of `.`, `!`, `?`.
 *
 * @param {string} text - Raw corpus text.
 * @returns {string[]} The untrimmed sentences; an empty array when the text
 *   contains no terminated sentence.
 */
function splitSentences(text) {
  // String#match returns null (not an empty array) when nothing matches.
  return text.match(/[^\.!\?]+[\.!\?]+/g) || [];
}

/**
 * Decide whether a sentence passes the surface-level filter: between 21 and
 * 49 characters after trimming, no double quotes, at most one apostrophe,
 * no 'I ' and none of the listed personal pronouns.
 *
 * @param {string} sentence - A single sentence, possibly with surrounding whitespace.
 * @returns {boolean} True when the sentence should be kept.
 */
function isCandidateSentence(sentence) {
  var el = sentence.trim();
  // match() yields null for a sentence with no apostrophe at all, so fall
  // back to an empty list instead of reading `.length` off null.
  var apostrophes = el.match(/'/g) || [];

  return el.length < 50
    && el.length > 20
    && el.indexOf('"') === -1
    && apostrophes.length < 2
    && el.indexOf('I ') === -1
    && !/\b(?:my|me|he|she|you|his|her)\b/i.test(el);
}

/**
 * Check whether the sentence's part-of-speech string (from getPos) contains
 * an `NN`-prefixed tag directly followed by a `VBZ` or `VBP` tag.
 *
 * @param {string} sentence - A single sentence.
 * @returns {boolean} True when the tag pattern is present.
 */
function hasNounVerbPattern(sentence) {
  return /NN.? VB[ZP]/.test(getPos(sentence));
}

fs.readFile('corpus.txt', 'utf8', function(error, data) {
  // Without this guard a missing/unreadable file surfaces as an unrelated
  // TypeError on `data.match`.
  if (error) {
    console.error('Could not read corpus.txt:', error.message);
    return;
  }

  // Break our text file up into individual sentences (approximate).
  var sentences = splitSentences(data);
  console.log('Number of sentences:', sentences.length);

  // Filter our list of sentences so we only keep ones
  // matching certain criteria
  sentences = sentences.filter(isCandidateSentence);

  // Now we filter by part of speech
  sentences = sentences.filter(hasNounVerbPattern);
  console.log('Number after filter:', sentences.length);

  // print a sampling of ten results
  console.log(_.sample(sentences, 10));
  console.log(getPos(' Malachi Mulligan is coming too.'));
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
(also, weird that gists don't seem to do @-replies?)