// nlp_compromise metrics proposal as standalone example
// gist by @redaktor (last active February 12, 2024)
var blacklist = {
weaks: [
"be",
"am",
"is",
"are",
"wa",
"were",
"been",
"have",
"do",
"say",
"go",
"see",
"give",
"know",
"want",
"put",
"seem",
"stay",
"speak",
"find",
"come",
"think",
"leav",
"take",
"feel",
"watch",
"begin",
"hope",
"exist",
"work",
"produc",
"occur",
"understand",
"receiv",
"appear",
"serv",
"need",
"maintain",
"chang",
"introduc",
"creat",
"open",
"consider",
"hear",
"finish",
"convert",
"form",
"bring",
"achiev",
"suppos",
"get",
"got",
"reach",
"run",
"ran",
"use",
"help",
"show",
"move",
"happen",
"fix",
"set"
],
fillers: [
"absolutely",
"actual",
"actually",
"anyway",
"apparently",
"approximately",
"badly",
"basically",
"begin",
"certainly",
"clearly",
"completely",
"definitely",
"easily",
"effectively",
"entirely",
"especially",
"essentially",
"exactly",
"extremely",
"fairly",
"frankly",
"frequently",
"fully",
"generally",
"hardly",
"heavily",
"highly",
"hopefully",
"just",
"largely",
"like",
"literally",
"maybe",
"might",
"most",
"mostly",
"much",
"necessarily",
"nicely",
"obviously",
"ok",
"okay",
"particularly",
"perhaps",
"possibly",
"practically",
"primarily",
"probably",
"precisely",
"quite",
"rather",
"real",
"really",
"relatively",
"right",
"seriously",
"significantly",
"simply",
"slightly",
"so",
"specifically",
"start",
"strongly",
"surely",
"too",
"totally",
"truly",
"try",
"typically",
"ultimately",
"usually",
"very",
"virtually",
"whatever",
"well",
"whenever",
"wherever",
"whoever",
"widely"
],
vulgars: [
"anal",
"anus",
"arabush",
"arse",
"arsehole",
"ass",
"asshole",
"ballsack",
"balls",
"bastard",
"bitch",
"biatch",
"bloody",
"blowjob",
"blow job",
"bluegum",
"bollock",
"bollok",
"boner",
"boob",
"bugger",
"bum",
"butt",
"buttcrack",
"buttplug",
"chinaman",
"clit",
"clitoris",
"cock",
"cocksucker",
"coon",
"crap",
"cunt",
"damn",
"dick",
"dickhead",
"dildo",
"dyke",
"fag",
"feck",
"fellate",
"fellatio",
"felching",
"fuck",
"fuckhead",
"f u c k",
"fudgepacker",
"fudge packer",
"flange",
"goddamn",
"gable",
"god damn",
"handjob",
"hell",
"homo",
"jerk",
"jizz",
"knobend",
"knob end",
"labia",
"lmao",
"lmfao",
"muff",
"nigger",
"nigga",
"niggar",
"omg",
"penis",
"piss",
"poop",
"prick",
"pube",
"pussy",
"queer",
"scrotum",
"shit",
"s hit",
"sh1t",
"slut",
"smegma",
"spunk",
"sucker",
"tit",
"tosser",
"turd",
"twat",
"vagina",
"wank",
"whore",
"wtf"
]
};
var main = {};
// NOTE: 'i' only - a 'g' flag would make .test() stateful via lastIndex
// and unreliable when called repeatedly on different tokens
main.weak = new RegExp( '^'.concat(blacklist.weaks.join('|^')), 'i' );
main.filler = new RegExp( '^'.concat(blacklist.fillers.join('$|^'), '$'), 'i' );
main.vulgar = new RegExp( '^'.concat(blacklist.vulgars.join('|^')), 'i' );
if (typeof module !== "undefined" && module.exports) {
module.exports = main;
}
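// Usage sketch (illustrative, not part of the gist's API surface):
// "weak" and "vulgar" are anchored prefix matchers over stems, while
// "filler" only matches whole words because every alternative is wrapped in ^...$
// main.weak.test('seems');       // true  - prefix match on the stem "seem"
// main.filler.test('actually');  // true  - exact match
// main.filler.test('actualize'); // false - anchored with $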
// TODO - move logic_negate and the abbreviations into the lexicon as a resource file (i18n, language aware, separate data and logic)
// the best way might be a dictionary with flags, from which we can easily derive the lexicon via Object.keys and map, like
/* dictionary: {
"CP": [
{v:'is', weak: 1},
...
],
...
};
*/
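// A minimal sketch of the flagged-dictionary idea above (hypothetical shape,
// not implemented anywhere yet): derive the weak-verb list from the flags
// via Object.keys, filter and map.
/*
var dictionary = { CP: [ {v: 'is', weak: 1}, {v: 'seems', weak: 1} ] };
var weaks = Object.keys(dictionary).reduce(function(list, tag) {
return list.concat(dictionary[tag].filter(function(entry) { return entry.weak; }).map(function(entry) { return entry.v; }));
}, []); // -> ['is', 'seems']
*/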
var nlp = require('nlp_compromise');
var util = require('util');
//var TEST = 'The cats we saw, e.g. tigers or leopards, are nice. I am a perfect second sentence for them. This is actually not. We\'re exclamative! Let us look back. They were beaten.';
var TEST = 'He was told that they have been hardly wounded.';
/* TODO - options, like "optimize metrics for"
// example ticks, e.g. LONGSENTENCE:
news / mobile       18
story / desktop     25
longread            30
scientific text     45
*/
/*
TODO - use important rules from the stylebooks of AP, APA (en) and dpa (de)
e.g.:
+ (related to dates) : ages:
For ages, always use figures. If the age is used as an adjective or as a substitute for a noun, then it should be hyphenated. Don't use apostrophes when describing an age range.
Examples: A 21-year-old student. The student is 21 years old. The girl, 8, has a brother, 11. The contest is for 18-year-olds. He is in his 20s.
Please note that medical and political titles only need to be used on first reference when they appear outside of a direct quote.
For courtesy titles, use these on second reference or when specifically requested.
Other acronyms and abbreviations are acceptable but not required (i.e. FBI, CIA, GOP). The context should govern such decisions. Avoid "alphabet soup" ...
Use quotation marks around the titles of books, songs, television shows, computer games, poems, lectures, speeches and works of art.
Examples: Author Porter Shreve read from his new book, "When the White House Was Ours." They sang "The Star-Spangled Banner" before the game.
Do not use quotation marks around the names of magazines, newspapers, the Bible or books that are catalogues of reference materials.
Examples: The Washington Post first reported the story. He reads the Bible every morning.
When used with a date, abbreviate only the following months: Jan., Feb., Aug., Sept., Oct., Nov. and Dec.
*/
var c = {
LONGSENTENCE: 40,
SHORTSENTENCE: 5,
};
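// Hedged sketch of the "optimize metrics for" TODO above (profile names and
// values are illustrative, not an existing API): pick the long-sentence tick
// by target medium, falling back to the defaults in c.
var tickProfiles = { news: 18, story: 25, longread: 30, scientific: 45 };
function constantsFor(medium) {
return {
LONGSENTENCE: tickProfiles[medium] || c.LONGSENTENCE,
SHORTSENTENCE: c.SHORTSENTENCE
};
}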
// TODO FIXME - should go to lexicon
/*
NOTE: better performance when we use the following additional tagging already when stemming:
'AUX': 'auxiliary verbs'
'WDT': 'wh-determiner', // WHICH, WHAT, WHOSE
'WP': 'wh-pronoun', // WHICH, WHAT, WHO, WHOM
'WRB': 'wh-adverb', // HOW, WHEN, WHENCE, WHERE, WHY
'TO': 'to', // ?
'RP': 'Particle', // it would be useful if there is RPP for positive particles and RPN for negative
// and if there would be an "opposite" mapping ...
// note currently only "not" is handled and it stems as a "CC"
'LS': 'List item marker',
'PDT': 'Predeterminer',
'POS': 'Possessive ending',
'SYM': 'Symbol (mathematical or scientific)',
':': 'colon',
'(': 'open parenthesis',
'``': 'open quote',
"''": 'close quote',
'#': 'pound sign (currency marker)',
'$': 'dollar sign (currency marker)',
')': 'close parenthesis',
',': 'comma',
'.': 'period'
// ?
'WP$': 'Possessive wh-pronoun', // how about demonstrativePronouns ?
*/
// auxiliary verbs
var auxVerbs = ['do', 'does', 'did', 'have', 'has', 'had', 'having', 'be', 'is', 'am', 'are', 'was', 'were', 'been', 'being', 'shall', 'will', 'should', 'would', 'can', 'could', 'may', 'might', 'must'];
// auxiliary verbs and other verbs in verb groups
var verbGroups = [
// first item is already known as any verb or auxVerb
// TODO better: pos_reason VB verb ed
{
aux: ['have', 'has', 'had', 'having'],
verbs: /(en$)|(ed$)/
},
{
aux: ['is', 'am', 'are', 'was', 'were', 'been', 'be', 'being', 'to be'],
verbs: /ing$/
},
{
aux: ['is', 'am', 'are', 'was', 'were', 'been', 'be', 'to be'],
verbs: /(en$)|(ed$)/
}
// last item SHOULD be a verb except auxVerbs or 'copula-adjective' - TODO - How to express in lexicon ?
];
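// Example against the TEST sentence: per this table, "have" + "been" fits the
// first group (aux "have" with an /en$/ participle) and "been" + "wounded"
// fits the third (/ed$/); irregular participles like "told" escape these
// suffix patterns.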
// passive voice
var passiveVoiceAux = ["am", "is", "are", "was", "were", "be", "been", "being"];
// subset of determiners
var demonstrativePronouns = ['this', 'that', 'these', 'those', 'such', 'none', 'neither'];
var specialDemonstrativePronouns = ['this', 'that'];
var whDeterminers = ['which', 'what', 'whose'];
// other wh-stuff, see http://www.garfixia.nl/k/news/view/1442/15/the-what-why-and-how-of-wh-words.html
var whPronouns = ['which', 'what', 'who', 'whom'];
var whAdverbs = ['how', 'when', 'whence', 'where', 'why'];
// entity substitutions
var entitySubstitutions = ['it', 'he', 'him', 'she', 'her', 'i', 'me', 'we', 'us', 'they', 'them', 'you', 'there', 'here', 'thing', 'stuff', 'fact', 'this', 'that'];
// nominalizations
var nominalizationRe = new RegExp('(?:ion|ions|ism|isms|ty|ties|ment|ments|ness|nesses|ance|ances|ence|ences)$');
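// Example: nominalizationRe matches "government" (-ment), "happiness" (-ness)
// and "creation" (-ion), but not the underlying verb "create".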
// end ^ TODO FIXME - should go to lexicon
// EXTEND ARRAY PROTOTYPE
Array.prototype.average = function() {
// TODO - in other contexts we MUST handle values other than typeof 'number' !!!
var r = {mean: 0, variance: 0, deviation: 0}, t = this.length;
// first pass: sum the values to get the mean
for(var m, s = 0, l = t; l--; s += this[l]);
// second pass: sum the squared deviations from the mean
for(m = r.mean = s / t, l = t, s = 0; l--; s += Math.pow(this[l] - m, 2));
// population variance and standard deviation
return r.deviation = Math.sqrt(r.variance = s / t), r;
};
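// Usage example (population statistics over sentence word counts):
// [4, 8, 6].average() -> { mean: 6, variance: ~2.667, deviation: ~1.633 }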
Array.prototype.unique = function() {
return this.reduce(function(p, c) {
if (p.indexOf(c) < 0) p.push(c);
return p;
}, []);
};
Array.prototype.sequences = function() {
// split an ascending list of integers into runs of consecutive values
var lastI = -1;
var results = [[]];
this.forEach(function(v) {
if (v != lastI + 1 && lastI > -1) results.push([]);
results[results.length - 1].push(v);
lastI = v;
});
return results;
};
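// Usage example (runs of consecutive token indices):
// [1, 2, 3, 7, 8, 12].sequences() -> [[1, 2, 3], [7, 8], [12]]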
function decimals(f, dec) {
// round a float to `dec` decimal places (default 2)
// stub, currently used for toPercent which will become readable with
// decimals rounded and percent values and value/unit etc. in v2
if (!dec) dec = 2;
return parseFloat(f.toFixed(dec));
}
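// Usage example:
// decimals(3.14159)    -> 3.14
// decimals(3.14159, 3) -> 3.142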
function calculateMetrics(txt) {
var processed = nlp.pos(txt);
var metrics = {
sentenceCount: 0,
wordCount: 0,
characterCount: 0,
characterCountTrimmed: 0,
uselessBoundaries: 0,
vocabularySize: 0,
wordsPerSentence: 0,
wordsPerSentenceStd: -1,
longSentencesRatio: 0,
shortSentencesRatio: 0,
declarativeRatio: 0,
interrogativeRatio: 0,
exclamativeRatio: 0,
charactersPerWords: 0,
syllablesPerWord: 0,
negationsPerSentence: 0,
stopwordRatio: 0,
nounRatio: 0,
nounClusterRatio: 0,
pronounRatio: 0,
verbRatio: 0,
adjectiveRatio: 0,
adverbRatio: 0,
otherPosRatio: 0,
modalRatio: 0,
nominalizationRatio: 0,
entitySubstitutionRatio: 0,
weakVerbRatio: 0,
vulgarWordRatio: 0,
verbGroupsPerSentence: 0,
passiveVoicePerSentence: 0,
fillerRatio: 0,
readability: 0
};
var sentences = processed.sentences;
// count number of sentences
// sentenceCount
metrics.sentenceCount = sentences.length;
var stems = [];
var sentencesCounts = [];
var charactersPerWordsCounts = [];
var syllablesCount = 0;
var negationsCount = 0;
// depends on other nouns
var nounClusterCount = 0;
// depends on wordCount
var tCounts = {
noun: 0,
pronoun: 0,
pronounNonpossessive: 0,
verb: 0,
adverb: 0,
adjective: 0,
modalVerb: 0,
weakVerb: 0,
vulgarWord: 0,
filler: 0
};
// question: we have 1 minor issue with the TAGS:
// "CP" is a copula, but still a verb. We think it is e.g. different from the noun/pronoun relation - SHOULD it be called VCP ???
var _types = { N: 'noun', P: 'pronoun', V: 'verb', C: 'verb', R: 'adverb', J: 'adjective', M: 'modalVerb' };
// for further calculation purposes
var data = {
nominalizations: [],
entitySubstitutions:[]
};
var nounCluster = function(token, _nounsCount) {
if (!_nounsCount || _nounsCount < 1) _nounsCount = token.normalised.match(/\S+/g).length;
// count clustered nouns (3 or more, possibly joined by 'of')
var n = token.analysis.next;
if (n && _nounsCount < 10 && (n.pos.tag.slice(0,1) === 'N' || n.normalised === 'of')) {
if (n.normalised != 'of') _nounsCount++;
// walk forward to the next token and propagate its result
return nounCluster(n, _nounsCount);
} else if (_nounsCount > 2) {
return _nounsCount;
} else {
return 0;
}
}
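// Example: starting at "water" in "the water treatment plant design" and
// assuming all four content tokens are tagged as nouns, the walk reaches
// "design" with a count of 4 and returns 4; clusters of fewer than 3 nouns
// return 0.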
var verbGroupBegin = function(o) {
return (o.hasOwnProperty('pos') && o.analysis.next && (o.pos.parent === 'verb' || auxVerbs.indexOf(o.normalised) > -1));
}
var verbGroupEnd = function(o) {
return (o.hasOwnProperty('pos') && ((o.pos.parent === 'verb' && auxVerbs.indexOf(o.normalised) < 0) || o.pos_reason === 'copula-adjective'));
}
sentences.forEach(function(sentence, sI) {
//var sText = sentence.text();
console.log( '!s', sentence.text() );
sentences[sI].metrics = {};
// count number of words
// wordCount
data.nominalizations[sI] = [];
data.entitySubstitutions[sI] = [];
if (!(sentences[sI].hasOwnProperty('groupTokens'))) sentences[sI].metrics.groupTokens = [];
metrics.wordCount = metrics.wordCount+sentence.tokens.length;
// count verb groups
// currently handled rule-group id and the last token of the group
var l = 0;
var groupId = 0;
var last = {i:0};
var missingEnd = false;
stems = stems.concat(sentence.tokens.map(function(token, i){
if (!(sentences[sI].metrics.groupTokens.length)) sentences[sI].metrics.groupTokens.push([]);
l = (sentences[sI].metrics.groupTokens.length);
// count verb groups
if ( (!(last.i) || last.i < i) && verbGroupBegin(token)) {
// could be a normalized verb group
// note: does not cover phrasal verbs
var next = token.analysis.next;
var iNext = i+1;
verbGroups.every(function(group, gI) {
if (gI >= groupId) {
if ((group.aux.indexOf(next.normalised) > -1 || group.verbs.test(next.normalised) || next.pos_reason === 'copula-adjective')) {
groupId = gI;
sentences[sI].metrics.groupTokens[l-1].push(i);
sentences[sI].metrics.groupTokens[l-1].push(iNext);
last = sentence.tokens[iNext];
last.i = iNext;
return false;
}
}
// keep iterating: every() stops on the first falsy return
return true;
});
}
console.log( last.i, i );
// separate multiple verb groups TODO TEST - "special clusters"
l = (sentences[sI].metrics.groupTokens.length);
if (last.i === i && verbGroupEnd(token)) {
groupId = 0;
sentences[sI].metrics.groupTokens.push([]);
} else if (i > 0 && last.i != i && !verbGroupEnd(last)) {
console.log( 'hasEnd', verbGroupEnd(last), token.text );
if (verbGroupEnd(token)) {
console.log( 'Could be End: ', token.text );
sentences[sI].metrics.groupTokens[l-1].push(i);
groupId = 0;
sentences[sI].metrics.groupTokens.push([]);
}
}
//console.log(token.pos.tag, token.normalised, token.pos_reason/*, token*/);
// TODO - ISSUE with negation: logic_negate only works in one direction FIXME CONTRIB
// test http://rawgit.com/spencermountain/nlp_compromise/master/client_side/basic_demo/index.html :
// example: joe never swims in the pool.
if (token.analysis.negative) negationsCount++;
//console.log( 'token: ', token );
data.entitySubstitutions[sI][i] = (token.normalised != 'i' && (entitySubstitutions.indexOf(token.normalised) > -1) && !(token.capitalised));
if (data.entitySubstitutions[sI][i] && specialDemonstrativePronouns.indexOf(token.normalised) > -1) {
if (token.analysis.last) {
var firsttwo = token.analysis.last.pos.tag.slice(0,2);
if (['NN', 'PR'].indexOf(firsttwo) > -1) data.entitySubstitutions[sI][i] = false;
}
if (token.analysis.next) {
var firsttwo = token.analysis.next.pos.tag.slice(0,2);
if (['NN', 'PR', 'JJ', 'DT'].indexOf(firsttwo) > -1) data.entitySubstitutions[sI][i] = false;
/*, 'WD', 'WP' // see above TODO, handled below*/
if (whDeterminers.concat(whPronouns).indexOf(token.normalised) > -1) data.entitySubstitutions[sI][i] = false;
}
}
if (data.entitySubstitutions[sI][i]) sentences[sI].tokens[i].metrics.entitySubstitution = true;
// count number of different parts of speech
var typeId = token.pos.tag.slice(0,1);
console.log( 'token: ', token.text, token.pos.tag, token.pos.parent, token.pos_reason );
//console.log( 'token3: ',typeId, _types[typeId] );
if (_types.hasOwnProperty(typeId)) tCounts[_types[typeId]]++;
// count characters per words
charactersPerWordsCounts.push(token.text.length);
data.nominalizations[sI][i] = false;
if (typeId === 'N') {
// count clustered nouns
var curClusterCount = nounCluster(token);
if (curClusterCount) nounClusterCount += curClusterCount;
// count nominalizations
var isNNP = (token.pos.tag.indexOf('NNP') === 0);
if (isNNP) data.nominalizations[sI][i] = (token.text.length > 7) && (token.normalised.search(nominalizationRe) > -1);
}
if (data.nominalizations[sI][i]) sentences[sI].tokens[i].metrics.nominalization = true;
if (typeId === 'V') {
// count weak verbs
var check = (token.pos.tense === 'present') ? token.normalised : token.analysis.conjugate().infinitive;
if (nlp.blacklist.weak.test(check)) tCounts.weakVerb++;
}
// count vulgar words, fillers etc.
if (nlp.blacklist.vulgar.test(token.normalised)) tCounts.vulgarWord++;
if (nlp.blacklist.filler.test(token.normalised)) tCounts.filler++;
var syllables = nlp.syllables(token.text);
if (syllables) syllablesCount = syllablesCount + syllables.length;
return token.normalised;
}));
if (sentences[sI].metrics.groupTokens.length) sentences[sI].metrics.groupTokens = sentences[sI].metrics.groupTokens.filter(function(ts) {
return (ts.length);
});
if (sentences[sI].metrics.groupTokens.length) {
// we found verb groups ...
var readableTokens = sentences[sI].metrics.groupTokens.map(function(ts) {
return ts.map(function(tId) { return sentences[sI].tokens[tId].normalised }).join(' ');
});
sentences[sI].metrics.passiveVoiceTokens = [];
sentences[sI].metrics.groupTokens.forEach(function(ts, i) {
var isPassive = -1;
ts.forEach(function(tId) {
sentences[sI].tokens[tId].verbGroup = i;
console.log( sentences[sI].tokens[tId].pos_reason );
if (passiveVoiceAux.indexOf(sentences[sI].tokens[tId].normalised) > -1 ||
sentences[sI].tokens[tId].pos_reason === 'copula-adjective' ||
sentences[sI].tokens[tId].pos_reason === 'ed'
) isPassive++;
});
if (isPassive > 0) sentences[sI].metrics.passiveVoiceTokens.push(ts);
})
console.log( 'sentence end, verb groups raw: ', sentences[sI].metrics.groupTokens );
console.log( 'verb groups: ', readableTokens );
console.log( 'passive groups: ', sentences[sI].metrics.passiveVoiceTokens );
}
sentencesCounts.push(sentence.tokens.length);
});
// TODO - find phrasal verbs
if (sentencesCounts.length > 0) {
// count number of words per sentence and its standard deviation
if (metrics.sentenceCount) {
// wordsPerSentence
if (sentencesCounts.length > 0) metrics.wordsPerSentence = (sentencesCounts.reduce(function(a, b) { return a + b; })) / metrics.sentenceCount;
// negationsPerSentence
if (negationsCount) metrics.negationsPerSentence = negationsCount / metrics.sentenceCount;
}
if (sentences.length >= 10) {
// wordsPerSentenceStd
metrics.wordsPerSentenceStd = sentencesCounts.average().deviation;
}
}
// find extra long and short sentences
if (sentences.length) {
// longSentencesRatio
var _longs = sentencesCounts.filter(function(sCount){
if (sCount >= c.LONGSENTENCE) return 1;
});
metrics.longSentencesRatio = _longs.length / sentencesCounts.length;
// shortSentencesRatio
var _shorts = sentencesCounts.filter(function(sCount){
if (sCount <= c.SHORTSENTENCE) return 1;
});
metrics.shortSentencesRatio = _shorts.length / sentencesCounts.length;
if (metrics.sentenceCount) {
// count sentence types based on ending punctuation mark
// declarativeRatio, interrogativeRatio, exclamativeRatio
var types = sentences.map(function(s){ return s.type; });
['declarative', 'interrogative', 'exclamative'].forEach(function(type){
var typeCount = types.filter(function(v) { return v === type; }).length;
metrics[type.concat('Ratio')] = typeCount / metrics.sentenceCount;
});
}
}
// find vocabulary size
// vocabularySize
metrics.vocabularySize = stems.unique().length;
// count number of characters in the whole RAW text
// characterCount
var d = txt.trim();
metrics.characterCount = d.length;
var uselessBoundaries = d.match(/\s{2,}/g);
if (uselessBoundaries) {
var ub = uselessBoundaries.map(function(b) { return b.length; });
metrics.uselessBoundaries = ub.length;
metrics.characterCountTrimmed = d.length - (ub.reduce(function(a, b) { return a + b; }) - ub.length);
} else {
metrics.characterCountTrimmed = d.length;
}
// counts per sentence
if (metrics.sentenceCount) {
// count verb Groups
// verbGroupsPerSentence
var groupsCount = sentences.map(function(s){return s.metrics.groupTokens.length||0;}).reduce(function(a, b) {return a+b;});
metrics.verbGroupsPerSentence = groupsCount / metrics.sentenceCount;
// count passive voice cases
// passiveVoicePerSentence (special verb groups)
var passiveVoiceCount = sentences.map(function(s){return s.metrics.passiveVoiceTokens.length||0;}).reduce(function(a, b) {return a+b;});
metrics.passiveVoicePerSentence = passiveVoiceCount / metrics.sentenceCount;
}
// counts per word
if (metrics.wordCount) {
// count number of syllables per word
// syllablesPerWord
if (syllablesCount) metrics.syllablesPerWord = syllablesCount/metrics.wordCount;
// count number of characters per word
// charactersPerWords
if (charactersPerWordsCounts.length) metrics.charactersPerWords = (charactersPerWordsCounts.reduce(function(a, b) {return a+b;})) / metrics.wordCount;
// ratio for types of words, weak and vulgar words
['noun', 'pronoun', 'verb', 'adverb', 'adjective', 'modalVerb', 'weakVerb', 'vulgarWord', 'filler'].forEach(function(d) {
if (tCounts[d]) metrics[d.concat('Ratio')] = tCounts[d] / metrics.wordCount;
});
metrics.otherPosRatio = 1 - metrics.nounRatio - metrics.pronounRatio - metrics.verbRatio - metrics.adjectiveRatio - metrics.adverbRatio;
}
// counts per nouns
if (tCounts.noun) {
// nounRatio
if (nounClusterCount) metrics.nounClusterRatio = nounClusterCount / tCounts.noun;
// nominalizationRatio and entitySubstitutionRatio :
// TODO - make sure tCounts.noun contains what the python NLTK version calls "pronoun_nonpossessive"
var nominCount = 0;
data.nominalizations.forEach(function(n, sI) { nominCount += n.filter(function(v){ return (v); }).length });
metrics.nominalizationRatio = nominCount / tCounts.noun;
var entitySubCount = 0;
data.entitySubstitutions.forEach(function(n, sI) { entitySubCount += n.filter(function(v){ return (v); }).length });
metrics.entitySubstitutionRatio = entitySubCount / tCounts.noun;
}
// estimate text readability using the Flesch-Kincaid Grade Level test
// TODO short texts ...
if (/*(metrics.wordCount >= 100) &&*/ metrics.wordsPerSentence && metrics.syllablesPerWord) {
metrics.readability = 0.39 * metrics.wordsPerSentence + 11.8 * metrics.syllablesPerWord - 15.59;
}
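// Worked example (illustrative numbers): 9 words per sentence and 1.5
// syllables per word give 0.39*9 + 11.8*1.5 - 15.59 = 5.62, i.e. roughly
// a US 5th-6th grade reading level.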
// count number of stopwords
// stopwordRatio
/* TODO
+ Named-Entities (dynamic) !!!
? rare words / rareWordsRatio
//
# count number of stopwords
data['stopwords'] = [None] * len(tokens)
for idx, word in enumerate(words):
    if word in stopset:
        metrics['stopword_ratio'] += 1
        data['stopwords'][word2token_map[idx]] = True
    else:
        data['stopwords'][word2token_map[idx]] = False
if metrics['wordCount']:
    metrics['stopword_ratio'] /= metrics['wordCount']
# count rare words
if len(words):
    metrics['rare_word_ratio'] = data['expected_word_frequencies'].count(0) / len(words)
else:
    metrics['rare_word_ratio'] = 0
# count word, bigram, and trigram frequencies
// ...
// ???
# fix some verbs ending in -ing being counted as nouns
for idx, token in enumerate(tokens):
    if (token[-3:] == 'ing') and (idx < len(tokens)) and (data['parts_of_speech'][idx+1] == 'IN'):
        data['parts_of_speech'][idx] = 'VBG'
// ??? see below
# find auxiliary verbs
for i in range(verb_group_count):
    verb_group_stack = [idx for idx in range(len(tokens)) if data['verb_groups'][idx] == i+1]
    for j in verb_group_stack[:-1]:
        auxiliary_verbs[j] = True
// ???
data['weak_verbs'][idx] = (data['parts_of_speech'][idx][:2] == 'VB') and (data['stems'][idx] in dict_weak_verbs)
if data['weak_verbs'][idx] and auxiliary_verbs[idx]:
    data['weak_verbs'][idx] = False
*/
stems = null;
return metrics;
}
var toPercent = function(o) {
var percentMetrics = {};
for (var k in o) {
percentMetrics[k.replace('Ratio', 'Percent')] = decimals( (k.indexOf('Ratio')<0) ? o[k] : ((o[k]) ? o[k]*100 : 0) );
}
return percentMetrics;
}
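// Usage example: "...Ratio" keys are renamed to "...Percent" and scaled.
// toPercent({ nounRatio: 0.25, wordCount: 12 }) -> { nounPercent: 25, wordCount: 12 }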
var metrics = calculateMetrics(TEST);
console.log( metrics );
console.log( toPercent(metrics) );
/* appendix, reasoning
NN
"before a modal" //if it's before a modal verb, it's a noun -> lkjsdf would
"determiner-verb" //if it's after a determiner, it's not a verb -> the walk
"capitalised" //it has a capital and isn't first word
"need one verb" //if there no verb in the sentence, there needs to be.
VB
"after an adverb" //if it's after an adverb, it's not a noun -> quickly acked
"ed" //set ambiguous 'ed' endings as either verb/adjective
RB
"consecutive_adjectives" //no consecutive, unpunctuated adjectives -> real good
JJ
"copula-adjective" //copulas are followed by a determiner ("are a .."), or an adjective ("are good")
"copula-adverb-adjective" //copula, adverb, verb -> copula adverb adjective -> is very lkjsdf
UH
"wordless_string" //punctuation - like ' -- ' etc.
CD
"parsefloat" //see if it's a number
---
lex
"lexicon" //known words list
parts_of_speech[wordnet_suffixes[suffix]]
"wordnet suffix" //suffix pos signals from wordnet
r
"regex suffix" // suffix regexes for words
// + last pass
sentence.tokens = sentence.tokens.map(function(token, i) {
var next = sentence.tokens[i + 1]
var prev = sentence.tokens[i - 1]
if (token.pos) {
//suggest noun after determiners (a|the), possessive pronouns (her|my|its)
if (token.pos.tag == "DT" || token.pos.tag == "PP") {
need = 'NN'
reason = token.pos.name
}
//suggest verb after personal pronouns (he|she|they), modal verbs (would|could|should)
if (token.pos.tag == "PRP" || token.pos.tag == "MD") {
need = 'VB'
reason = token.pos.name
}
}
if (need && !token.pos) {
token.pos = parts_of_speech[need]
token.pos_reason = "signal from " + reason
}
if (need == 'VB' && token.pos.parent == 'verb') {
need = null
}
if (need == 'NN' && token.pos.parent == 'noun') {
need = null
}
return token
})
*/
// nlp_compromise by @spencermountain in 2014
// most files are self-contained modules that optionally export for nodejs
// this file loads them all together
// if we're server-side, grab files, otherwise assume they're prepended already
if (typeof module !== "undefined" && module.exports) {
var parents = require("./src/parents/parents")
var sentence_parser = require('./src/methods/tokenization/sentence').sentences;
var tokenize = require('./src/methods/tokenization/tokenize').tokenize;
var ngram = require('./src/methods/tokenization/ngram').ngram;
//tokenize
var normalize = require('./src/methods/transliteration/unicode_normalisation')
var syllables = require('./src/methods/syllables/syllable');
//localization
var local = require('./src/methods/localization/britishize')
var americanize = local.americanize;
var britishize = local.britishize;
//part of speech tagging
var pos = require('./src/pos');
//named_entity_recognition
var spot = require('./src/spot');
//weak verbs, vulgar words etc. TODO - goes to metrics ...
var bl = require('./src/data/blacklist');
}
///
// api
var nlp = {
noun: parents.noun,
adjective: parents.adjective,
verb: parents.verb,
adverb: parents.adverb,
value: parents.value,
sentences: sentence_parser,
ngram: ngram,
tokenize: tokenize,
americanize: americanize,
britishize: britishize,
syllables: syllables,
normalize: normalize.normalize,
denormalize: normalize.denormalize,
pos: pos,
spot: spot,
blacklist: bl
// tests: tests,
};
//export it for server-side
if (typeof module !== "undefined" && module.exports) {
module.exports = nlp;
}
// bump bower
// git tag -a v0.3.5 -m "tag bower release"
// git push origin master --tags
// console.log( nlp.pos('she sells seashells by the seashore').sentences[0].negate().text() )
// console.log( nlp.pos('i will slouch').to_past().text() )