dariusk/aphorism-detection.js

## aphorism-detection.js
var _ = require('underscore');
var fs = require('fs');
var pos = require('pos');

/**
 * Build the part-of-speech signature of a sentence.
 *
 * The sentence is tokenised with a fresh `pos.Lexer`, tagged with a fresh
 * `pos.Tagger`, and the element at index 1 of every tagged entry is kept
 * (presumably the tag of a [word, tag] pair -- confirm against the `pos`
 * README).
 *
 * @param {string} sentence - Text to analyse.
 * @returns {string} The kept elements joined by single spaces.
 */
function getPos(sentence) {
  var lexer = new pos.Lexer();
  var tokens = lexer.lex(sentence);
  var tagger = new pos.Tagger();

  // Picks index 1 out of one tagged entry.
  var tagOf = function(entry) {
    return entry[1];
  };

  var tags = _.flatten(_.map(tagger.tag(tokens), tagOf));
  return tags.join(' ');
}

/**
 * Decide whether a sentence passes the surface checks for an aphorism
 * candidate.
 *
 * After trimming, the sentence must be 21 to 49 characters long, contain no
 * double quote, at most one apostrophe, no "I " and none of the words
 * my / me / he / she / you / his / her (whole words, case-insensitive).
 *
 * @param {string} sentence - Raw sentence as split out of the corpus.
 * @returns {boolean} true when every check passes.
 */
function isAphorismCandidate(sentence) {
  var el = sentence.trim();

  // String#match returns null (not an empty array) when nothing matches, so
  // a sentence without any apostrophe must be counted as zero occurrences
  // instead of dereferencing null.
  var apostrophes = el.match(/'/g) || [];

  return el.length < 50
    && el.length > 20
    && el.indexOf('"') === -1
    && apostrophes.length < 2
    && el.indexOf('I ') === -1
    && !/\b(?:my|me|he|she|you|his|her)\b/i.test(el);
}

// Read the corpus, keep the sentences that look like aphorisms, and print a
// random sample of them.
fs.readFile('corpus.txt', 'utf8', function(error, data) {
  // Surface the real I/O failure rather than crashing later on undefined data.
  if (error) {
    throw error;
  }

  // Break our text file up into individual sentences (approximate).
  // match() yields null when the corpus has no sentence terminator at all.
  var sentences = data.match(/[^\.!\?]+[\.!\?]+/g) || [];
  console.log('Number of sentences:', sentences.length);

  // Filter our list of sentences so we only keep ones
  // matching certain criteria
  sentences = _.filter(sentences, function(el) {
    return isAphorismCandidate(el);
  });

  // Now we filter by part of speech: keep sentences whose tag string has an
  // "NN" tag (optionally one more character) directly followed by VBZ or VBP.
  sentences = _.filter(sentences, function(el) {
    return /NN.? VB[ZP]/.test(getPos(el));
  });

  console.log('Number after filter:', sentences.length);

  // print a sampling of ten results
  console.log(_.sample(sentences, 10));
  console.log(getPos(' Malachi Mulligan is coming too.'));
});
	var _ = require('underscore');
	var fs = require('fs');
	var pos = require('pos');

	/**
	 * Build the part-of-speech signature of a sentence.
	 *
	 * The sentence is tokenised with a fresh `pos.Lexer`, tagged with a fresh
	 * `pos.Tagger`, and the element at index 1 of every tagged entry is kept
	 * (presumably the tag of a [word, tag] pair -- confirm against the `pos`
	 * README).
	 *
	 * @param {string} sentence - Text to analyse.
	 * @returns {string} The kept elements joined by single spaces.
	 */
	function getPos(sentence) {
		var lexer = new pos.Lexer();
		var tokens = lexer.lex(sentence);
		var tagger = new pos.Tagger();

		// Picks index 1 out of one tagged entry.
		var tagOf = function(entry) {
			return entry[1];
		};

		var tags = _.flatten(_.map(tagger.tag(tokens), tagOf));
		return tags.join(' ');
	}

	/**
	 * Decide whether a sentence passes the surface checks for an aphorism
	 * candidate.
	 *
	 * After trimming, the sentence must be 21 to 49 characters long, contain no
	 * double quote, at most one apostrophe, no "I " and none of the words
	 * my / me / he / she / you / his / her (whole words, case-insensitive).
	 *
	 * @param {string} sentence - Raw sentence as split out of the corpus.
	 * @returns {boolean} true when every check passes.
	 */
	function isAphorismCandidate(sentence) {
		var el = sentence.trim();

		// String#match returns null (not an empty array) when nothing matches, so
		// a sentence without any apostrophe must be counted as zero occurrences
		// instead of dereferencing null.
		var apostrophes = el.match(/'/g) || [];

		return el.length < 50
			&& el.length > 20
			&& el.indexOf('"') === -1
			&& apostrophes.length < 2
			&& el.indexOf('I ') === -1
			&& !/\b(?:my|me|he|she|you|his|her)\b/i.test(el);
	}

	// Read the corpus, keep the sentences that look like aphorisms, and print a
	// random sample of them.
	fs.readFile('corpus.txt', 'utf8', function(error, data) {
		// Surface the real I/O failure rather than crashing later on undefined data.
		if (error) {
			throw error;
		}

		// Break our text file up into individual sentences (approximate).
		// match() yields null when the corpus has no sentence terminator at all.
		var sentences = data.match(/[^\.!\?]+[\.!\?]+/g) || [];
		console.log('Number of sentences:', sentences.length);

		// Filter our list of sentences so we only keep ones
		// matching certain criteria
		sentences = _.filter(sentences, function(el) {
			return isAphorismCandidate(el);
		});

		// Now we filter by part of speech: keep sentences whose tag string has an
		// "NN" tag (optionally one more character) directly followed by VBZ or VBP.
		sentences = _.filter(sentences, function(el) {
			return /NN.? VB[ZP]/.test(getPos(el));
		});

		console.log('Number after filter:', sentences.length);

		// print a sampling of ten results
		console.log(_.sample(sentences, 10));
		console.log(getPos(' Malachi Mulligan is coming too.'));
	});