Last active
August 29, 2015 14:13
-
-
Save dariusk/3c395f28aaab4181716b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var _ = require('underscore'); | |
var fs = require('fs'); | |
var pos = require('pos'); | |
/**
 * Lex and tag a sentence, then return only the tags as one string.
 *
 * @param {string} sentence - Text handed to the `pos` lexer.
 * @returns {string} The second element of every tagger entry, joined by
 *   single spaces (for example "NNP NNP VBZ VBG RB .").
 */
function getPos(sentence) {
  var tokens = new pos.Lexer().lex(sentence);
  var tagged = new pos.Tagger().tag(tokens);

  // Each tagger entry is indexed as [?, tag]; only index 1 is kept.
  return _.chain(tagged)
    .map(function(entry) {
      return entry[1];
    })
    .flatten()
    .value()
    .join(' ');
}
// Read corpus.txt, split it into approximate sentences, keep the short ones
// that avoid quotes and a fixed list of pronouns, keep those whose tag
// sequence matches a noun-tag/verb-tag pattern, and print a random sample.
fs.readFile('corpus.txt', 'utf8', function(error, data) {
  // Report a read failure directly instead of crashing later on `undefined`.
  if (error) {
    console.error('Could not read corpus.txt:', error.message);
    process.exitCode = 1;
    return;
  }

  // Break our text file up into individual sentences (approximate).
  // `match` returns null when nothing matches, so fall back to an empty list.
  var sentences = data.match( /[^\.!\?]+[\.!\?]+/g ) || [];
  console.log('Number of sentences:', sentences.length);

  // Filter our list of sentences so we only keep ones
  // matching certain criteria
  sentences = _.filter(sentences, function(el) {
    el = el.trim();
    // Count apostrophes; a sentence with none yields null from `match`,
    // which must count as zero rather than throw.
    // (match-based counting courtesy @jeresig)
    var apostrophes = el.match(/'/g) || [];
    return el.length < 50
      && el.length > 20
      && el.indexOf('"') === -1
      && apostrophes.length < 2
      && el.indexOf('I ') === -1
      && _.isNull(el.match(/\bmy\b/i))
      && _.isNull(el.match(/\bme\b/i))
      && _.isNull(el.match(/\bhe\b/i))
      && _.isNull(el.match(/\bshe\b/i))
      && _.isNull(el.match(/\byou\b/i))
      && _.isNull(el.match(/\bhis\b/i))
      && _.isNull(el.match(/\bher\b/i));
  });

  // Now we filter by part of speech: keep sentences whose tag string has a
  // tag starting with "NN" immediately followed by "VBZ" or "VBP".
  sentences = _.filter(sentences, function(el) {
    var nounVerbs = !_.isNull(getPos(el).match(/NN.? VB[ZP]/));
    return nounVerbs;
  });
  console.log('Number after filter:', sentences.length);

  // print a sampling of ten results
  console.log(_.sample(sentences, 10));

  // Print the tag string for one fixed example sentence.
  console.log(getPos(' Malachi Mulligan is coming too.'));
});
(also, weird that gists don't seem to do @-replies?)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks @jeresig. I'm of two minds about this stuff. I love showing people good code, but also I try to be as honest as I can about the quality of my code when I'm doing creative coding, which is frankly not that great and is kind of on purpose. Good code is succinct, beautiful, and packs a lot of meaning in a small package: unfortunately this also makes it difficult for non-experts to unpack. There's also something I love about somewhat inefficient slightly-bad code leading to emergent properties (which would be bugs in an application, but I actually try to encourage in creative coding).
Using
match
instead of split
is definitely a good change that I'll probably make when I have a moment. I'm inclined to leave the my/he/me/etc regexes as-is, though. I think many small regexes connected by logic are more intelligible to beginners than a very big regex, even if the "bigness" is just due to a whole bunch of OR operators that don't add much complexity.
Related: I wish I didn't have to include the regex on line 16 for splitting sentences naively.
But anyway, thanks for the feedback!