// nlp_compromise metrics proposal as standalone example
// gist by @redaktor (last active February 12, 2024)
var blacklist = {
weaks: [
"be",
"am",
"is",
"are",
"wa",
"were",
"been",
"have",
"do",
"say",
"go",
"see",
"give",
"know",
"want",
"put",
"seem",
"stay",
"speak",
"find",
"come",
"think",
"leav",
"take",
"feel",
"watch",
"begin",
"hope",
"exist",
"work",
"produc",
"occur",
"understand",
"receiv",
"appear",
"serv",
"need",
"maintain",
"chang",
"introduc",
"creat",
"open",
"consider",
"hear",
"finish",
"convert",
"form",
"bring",
"achiev",
"suppos",
"get",
"got",
"reach",
"run",
"ran",
"use",
"help",
"show",
"move",
"happen",
"fix",
"set"
],
fillers: [
"absolutely",
"actual",
"actually",
"anyway",
"apparently",
"approximately",
"badly",
"basically",
"begin",
"certainly",
"clearly",
"completely",
"definitely",
"easily",
"effectively",
"entirely",
"especially",
"essentially",
"exactly",
"extremely",
"fairly",
"frankly",
"frequently",
"fully",
"generally",
"hardly",
"heavily",
"highly",
"hopefully",
"just",
"largely",
"like",
"literally",
"maybe",
"might",
"most",
"mostly",
"much",
"necessarily",
"nicely",
"obviously",
"ok",
"okay",
"particularly",
"perhaps",
"possibly",
"practically",
"primarily",
"probably",
"precisely",
"quite",
"rather",
"real",
"really",
"relatively",
"right",
"seriously",
"significantly",
"simply",
"slightly",
"so",
"specifically",
"start",
"strongly",
"surely",
"too",
"totally",
"truly",
"try",
"typically",
"ultimately",
"usually",
"very",
"virtually",
"whatever",
"well",
"whenever",
"wherever",
"whoever",
"widely"
],
vulgars: [
"anal",
"anus",
"arabush",
"arse",
"arsehole",
"ass",
"asshole",
"ballsack",
"balls",
"bastard",
"bitch",
"biatch",
"bloody",
"blowjob",
"blow job",
"bluegum",
"bollock",
"bollok",
"boner",
"boob",
"bugger",
"bum",
"butt",
"buttcrack",
"buttplug",
"chinaman",
"clit",
"clitoris",
"cock",
"cocksucker",
"coon",
"crap",
"cunt",
"damn",
"dick",
"dickhead",
"dildo",
"dyke",
"fag",
"feck",
"fellate",
"fellatio",
"felching",
"fuck",
"fuckhead",
"f u c k",
"fudgepacker",
"fudge packer",
"flange",
"goddamn",
"gable",
"god damn",
"handjob",
"hell",
"homo",
"jerk",
"jizz",
"knobend",
"knob end",
"labia",
"lmao",
"lmfao",
"muff",
"nigger",
"nigga",
"niggar",
"omg",
"penis",
"piss",
"poop",
"prick",
"pube",
"pussy",
"queer",
"scrotum",
"shit",
"s hit",
"sh1t",
"slut",
"smegma",
"spunk",
"sucker",
"tit",
"tosser",
"turd",
"twat",
"vagina",
"wank",
"whore",
"wtf"
]
};
var main = {};
// NOTE: 'i' only - a 'g' flag would make .test() stateful via lastIndex
// and unreliable when called repeatedly on different tokens
main.weak = new RegExp( '^'.concat(blacklist.weaks.join('|^')), 'i' );
main.filler = new RegExp( '^'.concat(blacklist.fillers.join('$|^'), '$'), 'i' );
main.vulgar = new RegExp( '^'.concat(blacklist.vulgars.join('|^')), 'i' );
if (typeof module !== "undefined" && module.exports) {
module.exports = main;
}
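// Usage sketch (illustrative, not part of the gist's API surface):
// "weak" and "vulgar" are anchored prefix matchers over stems, while
// "filler" only matches whole words because every alternative is wrapped in ^...$
// main.weak.test('seems');       // true  - prefix match on the stem "seem"
// main.filler.test('actually');  // true  - exact match
// main.filler.test('actualize'); // false - anchored with $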
// TODO - move logic_negate and the abbreviations into the lexicon as a resource file (i18n, language aware, separate data and logic)
// the best way might be a dictionary with flags, from which we can easily derive the lexicon via Object.keys and map, like
/* dictionary: {
"CP": [
{v:'is', weak: 1},
...
],
...
};
*/
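// A minimal sketch of the flagged-dictionary idea above (hypothetical shape,
// not implemented anywhere yet): derive the weak-verb list from the flags
// via Object.keys, filter and map.
/*
var dictionary = { CP: [ {v: 'is', weak: 1}, {v: 'seems', weak: 1} ] };
var weaks = Object.keys(dictionary).reduce(function(list, tag) {
return list.concat(dictionary[tag].filter(function(entry) { return entry.weak; }).map(function(entry) { return entry.v; }));
}, []); // -> ['is', 'seems']
*/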
var nlp = require('nlp_compromise');
var util = require('util');
//var TEST = 'The cats we saw, e.g. tigers or leopards, are nice. I am a perfect second sentence for them. This is actually not. We\'re exclamative! Let us look back. They were beaten.';
var TEST = 'He was told that they have been hardly wounded.';
/* TODO - options, like "optimize metrics for"
// example ticks, e.g. LONGSENTENCE:
news / mobile       18
story / desktop     25
longread            30
scientific text     45
*/
/*
TODO - use important rules from the stylebooks of AP, APA (en) and dpa (de)
e.g.:
+ (related to dates) : ages:
For ages, always use figures. If the age is used as an adjective or as a substitute for a noun, then it should be hyphenated. Don't use apostrophes when describing an age range.
Examples: A 21-year-old student. The student is 21 years old. The girl, 8, has a brother, 11. The contest is for 18-year-olds. He is in his 20s.
Please note that medical and political titles only need to be used on first reference when they appear outside of a direct quote.
For courtesy titles, use these on second reference or when specifically requested.
Other acronyms and abbreviations are acceptable but not required (i.e. FBI, CIA, GOP). The context should govern such decisions. Avoid "alphabet soup" ...
Use quotation marks around the titles of books, songs, television shows, computer games, poems, lectures, speeches and works of art.
Examples: Author Porter Shreve read from his new book, "When the White House Was Ours." They sang "The Star-Spangled Banner" before the game.
Do not use quotation marks around the names of magazines, newspapers, the Bible or books that are catalogues of reference materials.
Examples: The Washington Post first reported the story. He reads the Bible every morning.
When used with a date, abbreviate only the following months: Jan., Feb., Aug., Sept., Oct., Nov. and Dec.
*/
var c = {
LONGSENTENCE: 40,
SHORTSENTENCE: 5,
};
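// Hedged sketch of the "optimize metrics for" TODO above (profile names and
// values are illustrative, not an existing API): pick the long-sentence tick
// by target medium, falling back to the defaults in c.
var tickProfiles = { news: 18, story: 25, longread: 30, scientific: 45 };
function constantsFor(medium) {
return {
LONGSENTENCE: tickProfiles[medium] || c.LONGSENTENCE,
SHORTSENTENCE: c.SHORTSENTENCE
};
}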
// TODO FIXME - should go to lexicon
/*
NOTE: better performance when we use the following additional tagging already when stemming:
'AUX': 'auxiliary verbs'
'WDT': 'wh-determiner', // WHICH, WHAT, WHOSE
'WP': 'wh-pronoun', // WHICH, WHAT, WHO, WHOM
'WRB': 'wh-adverb', // HOW, WHEN, WHENCE, WHERE, WHY
'TO': 'to', // ?
'RP': 'Particle', // it would be useful if there is RPP for positive particles and RPN for negative
// and if there would be an "opposite" mapping ...
// note currently only "not" is handled and it stems as a "CC"
'LS': 'List item marker',
'PDT': 'Predeterminer',
'POS': 'Possessive ending',
'SYM': 'Symbol (mathematical or scientific)',
':': 'colon',
'(': 'open parenthesis',
'``': 'open quote',
"''": 'close quote',
'#': 'pound sign (currency marker)',
'$': 'dollar sign (currency marker)',
')': 'close parenthesis',
',': 'comma',
'.': 'period'
// ?
'WP$': 'Possessive wh-pronoun', // how about demonstrativePronouns ?
*/
// auxiliary verbs
var auxVerbs = ['do', 'does', 'did', 'have', 'has', 'had', 'having', 'be', 'is', 'am', 'are', 'was', 'were', 'been', 'being', 'shall', 'will', 'should', 'would', 'can', 'could', 'may', 'might', 'must'];
// auxiliary verbs and other verbs in verb groups
var verbGroups = [
// first item is already known as any verb or auxVerb
// TODO better: pos_reason VB verb ed
{
aux: ['have', 'has', 'had', 'having'],
verbs: /(en$)|(ed$)/
},
{
aux: ['is', 'am', 'are', 'was', 'were', 'been', 'be', 'being', 'to be'],
verbs: /ing$/
},
{
aux: ['is', 'am', 'are', 'was', 'were', 'been', 'be', 'to be'],
verbs: /(en$)|(ed$)/
}
// last item SHOULD be a verb except auxVerbs or 'copula-adjective' - TODO - How to express in lexicon ?
];
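// Example against the TEST sentence: per this table, "have" + "been" fits the
// first group (aux "have" with an /en$/ participle) and "been" + "wounded"
// fits the third (/ed$/); irregular participles like "told" escape these
// suffix patterns.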
// passive voice
var passiveVoiceAux = ["am", "is", "are", "was", "were", "be", "been", "being"];
// subset of determiners
var demonstrativePronouns = ['this', 'that', 'these', 'those', 'such', 'none', 'neither'];
var specialDemonstrativePronouns = ['this', 'that'];
var whDeterminers = ['which', 'what', 'whose'];
// other wh-stuff, see http://www.garfixia.nl/k/news/view/1442/15/the-what-why-and-how-of-wh-words.html
var whPronouns = ['which', 'what', 'who', 'whom'];
var whAdverbs = ['how', 'when', 'whence', 'where', 'why'];
// entity substitutions
var entitySubstitutions = ['it', 'he', 'him', 'she', 'her', 'i', 'me', 'we', 'us', 'they', 'them', 'you', 'there', 'here', 'thing', 'stuff', 'fact', 'this', 'that'];
// nominalizations
var nominalizationRe = new RegExp('(?:ion|ions|ism|isms|ty|ties|ment|ments|ness|nesses|ance|ances|ence|ences)$');
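// Example: nominalizationRe matches "government" (-ment), "happiness" (-ness)
// and "creation" (-ion), but not the underlying verb "create".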
// end ^ TODO FIXME - should go to lexicon
// EXTEND ARRAY PROTOTYPE
Array.prototype.average = function() {
// TODO - in other contexts we MUST handle values other than typeof 'number' !!!
var r = {mean: 0, variance: 0, deviation: 0}, t = this.length;
// first pass: sum the values to get the mean
for(var m, s = 0, l = t; l--; s += this[l]);
// second pass: sum the squared deviations from the mean
for(m = r.mean = s / t, l = t, s = 0; l--; s += Math.pow(this[l] - m, 2));
// population variance and standard deviation
return r.deviation = Math.sqrt(r.variance = s / t), r;
};
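// Usage example (population statistics over sentence word counts):
// [4, 8, 6].average() -> { mean: 6, variance: ~2.667, deviation: ~1.633 }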
Array.prototype.unique = function() {
return this.reduce(function(p, c) {
if (p.indexOf(c) < 0) p.push(c);
return p;
}, []);
};
Array.prototype.sequences = function() {
// split an ascending list of integers into runs of consecutive values
var lastI = -1;
var results = [[]];
this.forEach(function(v) {
if (v != lastI + 1 && lastI > -1) results.push([]);
results[results.length - 1].push(v);
lastI = v;
});
return results;
};
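// Usage example (runs of consecutive token indices):
// [1, 2, 3, 7, 8, 12].sequences() -> [[1, 2, 3], [7, 8], [12]]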
function decimals(f, dec) {
// round a float to `dec` decimal places (default 2)
// stub, currently used for toPercent which will become readable with
// decimals rounded and percent values and value/unit etc. in v2
if (!dec) dec = 2;
return parseFloat(f.toFixed(dec));
}
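// Usage example:
// decimals(3.14159)    -> 3.14
// decimals(3.14159, 3) -> 3.142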
function calculateMetrics(txt) {
var processed = nlp.pos(txt);
var metrics = {
sentenceCount: 0,
wordCount: 0,
characterCount: 0,
characterCountTrimmed: 0,
uselessBoundaries: 0,
vocabularySize: 0,
wordsPerSentence: 0,
wordsPerSentenceStd: -1,
longSentencesRatio: 0,
shortSentencesRatio: 0,
declarativeRatio: 0,
interrogativeRatio: 0,
exclamativeRatio: 0,
charactersPerWords: 0,
syllablesPerWord: 0,
negationsPerSentence: 0,
stopwordRatio: 0,
nounRatio: 0,
nounClusterRatio: 0,
pronounRatio: 0,
verbRatio: 0,
adjectiveRatio: 0,
adverbRatio: 0,
otherPosRatio: 0,
modalRatio: 0,
nominalizationRatio: 0,
entitySubstitutionRatio: 0,
weakVerbRatio: 0,
vulgarWordRatio: 0,
verbGroupsPerSentence: 0,
passiveVoicePerSentence: 0,
fillerRatio: 0,
readability: 0
};
var sentences = processed.sentences;
// count number of sentences
// sentenceCount
metrics.sentenceCount = sentences.length;
var stems = [];
var sentencesCounts = [];
var charactersPerWordsCounts = [];
var syllablesCount = 0;
var negationsCount = 0;
// depends on other nouns
var nounClusterCount = 0;
// depends on wordCount
var tCounts = {
noun: 0,
pronoun: 0,
pronounNonpossessive: 0,
verb: 0,
adverb: 0,
adjective: 0,
modalVerb: 0,
weakVerb: 0,
vulgarWord: 0,
filler: 0
};
// question: we have 1 minor issue with the TAGS:
// "CP" is a copula, but still a verb. We think it is e.g. different from the noun/pronoun relation - SHOULD it be called VCP ???
var _types = { N: 'noun', P: 'pronoun', V: 'verb', C: 'verb', R: 'adverb', J: 'adjective', M: 'modalVerb' };
// for further calculation purposes
var data = {
nominalizations: [],
entitySubstitutions:[]
};
var nounCluster = function(token, _nounsCount) {
if (!_nounsCount || _nounsCount < 1) _nounsCount = token.normalised.match(/\S+/g).length;
// count clustered nouns (3 or more, possibly joined by 'of')
var n = token.analysis.next;
if (n && _nounsCount < 10 && (n.pos.tag.slice(0,1) === 'N' || n.normalised === 'of')) {
if (n.normalised != 'of') _nounsCount++;
// walk forward to the next token and propagate its result
return nounCluster(n, _nounsCount);
} else if (_nounsCount > 2) {
return _nounsCount;
} else {
return 0;
}
}
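// Example: starting at "water" in "the water treatment plant design" and
// assuming all four content tokens are tagged as nouns, the walk reaches
// "design" with a count of 4 and returns 4; clusters of fewer than 3 nouns
// return 0.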
var verbGroupBegin = function(o) {
return (o.hasOwnProperty('pos') && o.analysis.next && (o.pos.parent === 'verb' || auxVerbs.indexOf(o.normalised) > -1));
}
var verbGroupEnd = function(o) {
return (o.hasOwnProperty('pos') && ((o.pos.parent === 'verb' && auxVerbs.indexOf(o.normalised) < 0) || o.pos_reason === 'copula-adjective'));
}
sentences.forEach(function(sentence, sI) {
//var sText = sentence.text();
console.log( '!s', sentence.text() );
sentences[sI].metrics = {};
// count number of words
// wordCount
data.nominalizations[sI] = [];
data.entitySubstitutions[sI] = [];
if (!(sentences[sI].hasOwnProperty('groupTokens'))) sentences[sI].metrics.groupTokens = [];
metrics.wordCount = metrics.wordCount+sentence.tokens.length;
// count verb groups
// currently handled rule-group id and the last token of the group
var l = 0;
var groupId = 0;
var last = {i:0};
var missingEnd = false;
stems = stems.concat(sentence.tokens.map(function(token, i){
if (!(sentences[sI].metrics.groupTokens.length)) sentences[sI].metrics.groupTokens.push([]);
l = (sentences[sI].metrics.groupTokens.length);
// count verb groups
if ( (!(last.i) || last.i < i) && verbGroupBegin(token)) {
// could be a normalized verb group
// note: does not cover phrasal verbs
var next = token.analysis.next;
var iNext = i+1;
verbGroups.every(function(group, gI) {
if (gI >= groupId) {
if ((group.aux.indexOf(next.normalised) > -1 || group.verbs.test(next.normalised) || next.pos_reason === 'copula-adjective')) {
groupId = gI;
sentences[sI].metrics.groupTokens[l-1].push(i);
sentences[sI].metrics.groupTokens[l-1].push(iNext);
last = sentence.tokens[iNext];
last.i = iNext;
return false;
}
}
// keep iterating: every() stops on the first falsy return
return true;
});
}
console.log( last.i, i );
// separate multiple verb groups TODO TEST - "special clusters"
l = (sentences[sI].metrics.groupTokens.length);
if (last.i === i && verbGroupEnd(token)) {
groupId = 0;
sentences[sI].metrics.groupTokens.push([]);
} else if (i > 0 && last.i != i && !verbGroupEnd(last)) {
console.log( 'hasEnd', verbGroupEnd(last), token.text );
if (verbGroupEnd(token)) {
console.log( 'Could be End: ', token.text );
sentences[sI].metrics.groupTokens[l-1].push(i);
groupId = 0;
sentences[sI].metrics.groupTokens.push([]);
}
}
//console.log(token.pos.tag, token.normalised, token.pos_reason/*, token*/);
// TODO - ISSUE with negation: logic_negate only works in one direction FIXME CONTRIB
// test http://rawgit.com/spencermountain/nlp_compromise/master/client_side/basic_demo/index.html :
// example: joe never swims in the pool.
if (token.analysis.negative) negationsCount++;
//console.log( 'token: ', token );
data.entitySubstitutions[sI][i] = (token.normalised != 'i' && (entitySubstitutions.indexOf(token.normalised) > -1) && !(token.capitalised));
if (data.entitySubstitutions[sI][i] && specialDemonstrativePronouns.indexOf(token.normalised) > -1) {
if (token.analysis.last) {
var firsttwo = token.analysis.last.pos.tag.slice(0,2);
if (['NN', 'PR'].indexOf(firsttwo) > -1) data.entitySubstitutions[sI][i] = false;
}
if (token.analysis.next) {
var firsttwo = token.analysis.next.pos.tag.slice(0,2);
if (['NN', 'PR', 'JJ', 'DT'].indexOf(firsttwo) > -1) data.entitySubstitutions[sI][i] = false;
/*, 'WD', 'WP' // see above TODO, handled below*/
if (whDeterminers.concat(whPronouns).indexOf(token.normalised) > -1) data.entitySubstitutions[sI][i] = false;
}
}
if (data.entitySubstitutions[sI][i]) sentences[sI].tokens[i].metrics.entitySubstitution = true;
// count number of different parts of speech
var typeId = token.pos.tag.slice(0,1);
console.log( 'token: ', token.text, token.pos.tag, token.pos.parent, token.pos_reason );
//console.log( 'token3: ',typeId, _types[typeId] );
if (_types.hasOwnProperty(typeId)) tCounts[_types[typeId]]++;
// count characters per words
charactersPerWordsCounts.push(token.text.length);
data.nominalizations[sI][i] = false;
if (typeId === 'N') {
// count clustered nouns
var curClusterCount = nounCluster(token);
if (curClusterCount) nounClusterCount += curClusterCount;
// count nominalizations
var isNNP = (token.pos.tag.indexOf('NNP') === 0);
if (isNNP) data.nominalizations[sI][i] = (token.text.length > 7) && (token.normalised.search(nominalizationRe) > -1);
}
if (data.nominalizations[sI][i]) sentences[sI].tokens[i].metrics.nominalization = true;
if (typeId === 'V') {
// count weak verbs
var check = (token.pos.tense === 'present') ? token.normalised : token.analysis.conjugate().infinitive;
if (nlp.blacklist.weak.test(check)) tCounts.weakVerb++;
}
// count vulgar words, fillers etc.
if (nlp.blacklist.vulgar.test(token.normalised)) tCounts.vulgarWord++;
if (nlp.blacklist.filler.test(token.normalised)) tCounts.filler++;
var syllables = nlp.syllables(token.text);
if (syllables) syllablesCount = syllablesCount + syllables.length;
return token.normalised;
}));
if (sentences[sI].metrics.groupTokens.length) sentences[sI].metrics.groupTokens = sentences[sI].metrics.groupTokens.filter(function(ts) {
return (ts.length);
});
if (sentences[sI].metrics.groupTokens.length) {
// we found verb groups ...
var readableTokens = sentences[sI].metrics.groupTokens.map(function(ts) {
return ts.map(function(tId) { return sentences[sI].tokens[tId].normalised }).join(' ');
});
sentences[sI].metrics.passiveVoiceTokens = [];
sentences[sI].metrics.groupTokens.forEach(function(ts, i) {
var isPassive = -1;
ts.forEach(function(tId) {
sentences[sI].tokens[tId].verbGroup = i;
console.log( sentences[sI].tokens[tId].pos_reason );
if (passiveVoiceAux.indexOf(sentences[sI].tokens[tId].normalised) > -1 ||
sentences[sI].tokens[tId].pos_reason === 'copula-adjective' ||
sentences[sI].tokens[tId].pos_reason === 'ed'
) isPassive++;
});
if (isPassive > 0) sentences[sI].metrics.passiveVoiceTokens.push(ts);
})
console.log( 'sentence end, verb groups raw: ', sentences[sI].metrics.groupTokens );
console.log( 'verb groups: ', readableTokens );
console.log( 'passive groups: ', sentences[sI].metrics.passiveVoiceTokens );
}
sentencesCounts.push(sentence.tokens.length);
});
// TODO - find phrasal verbs
if (sentencesCounts.length > 0) {
// count number of words per sentence and its standard deviation
if (metrics.sentenceCount) {
// wordsPerSentence
if (sentencesCounts.length > 0) metrics.wordsPerSentence = (sentencesCounts.reduce(function(a, b) { return a + b; })) / metrics.sentenceCount;
// negationsPerSentence
if (negationsCount) metrics.negationsPerSentence = negationsCount / metrics.sentenceCount;
}
if (sentences.length >= 10) {
// wordsPerSentenceStd
metrics.wordsPerSentenceStd = sentencesCounts.average().deviation;
}
}
// find extra long and short sentences
if (sentences.length) {
// longSentencesRatio
var _longs = sentencesCounts.filter(function(sCount){
if (sCount >= c.LONGSENTENCE) return 1;
});
metrics.longSentencesRatio = _longs.length / sentencesCounts.length;
// shortSentencesRatio
var _shorts = sentencesCounts.filter(function(sCount){
if (sCount <= c.SHORTSENTENCE) return 1;
});
metrics.shortSentencesRatio = _shorts.length / sentencesCounts.length;
if (metrics.sentenceCount) {
// count sentence types based on ending punctuation mark
// declarativeRatio, interrogativeRatio, exclamativeRatio
var types = sentences.map(function(s){ return s.type; });
['declarative', 'interrogative', 'exclamative'].forEach(function(type){
var typeCount = types.filter(function(v) { return v === type; }).length;
metrics[type.concat('Ratio')] = typeCount / metrics.sentenceCount;
});
}
}
// find vocabulary size
// vocabularySize
metrics.vocabularySize = stems.unique().length;
// count number of characters in the whole RAW text
// characterCount
var d = txt.trim();
metrics.characterCount = d.length;
var uselessBoundaries = d.match(/\s{2,}/g);
if (uselessBoundaries) {
var ub = uselessBoundaries.map(function(b) { return b.length; });
metrics.uselessBoundaries = ub.length;
metrics.characterCountTrimmed = d.length - (ub.reduce(function(a, b) { return a + b; }) - ub.length);
} else {
metrics.characterCountTrimmed = d.length;
}
// counts per sentence
if (metrics.sentenceCount) {
// count verb Groups
// verbGroupsPerSentence
var groupsCount = sentences.map(function(s){return s.metrics.groupTokens.length||0;}).reduce(function(a, b) {return a+b;});
metrics.verbGroupsPerSentence = groupsCount / metrics.sentenceCount;
// count passive voice cases
// passiveVoicePerSentence (special verb groups)
var passiveVoiceCount = sentences.map(function(s){return s.metrics.passiveVoiceTokens.length||0;}).reduce(function(a, b) {return a+b;});
metrics.passiveVoicePerSentence = passiveVoiceCount / metrics.sentenceCount;
}
// counts per word
if (metrics.wordCount) {
// count number of syllables per word
// syllablesPerWord
if (syllablesCount) metrics.syllablesPerWord = syllablesCount/metrics.wordCount;
// count number of characters per word
// charactersPerWords
if (charactersPerWordsCounts.length) metrics.charactersPerWords = (charactersPerWordsCounts.reduce(function(a, b) {return a+b;})) / metrics.wordCount;
// ratio for types of words, weak and vulgar words
['noun', 'pronoun', 'verb', 'adverb', 'adjective', 'modalVerb', 'weakVerb', 'vulgarWord', 'filler'].forEach(function(d) {
if (tCounts[d]) metrics[d.concat('Ratio')] = tCounts[d] / metrics.wordCount;
});
metrics.otherPosRatio = 1 - metrics.nounRatio - metrics.pronounRatio - metrics.verbRatio - metrics.adjectiveRatio - metrics.adverbRatio;
}
// counts per nouns
if (tCounts.noun) {
// nounRatio
if (nounClusterCount) metrics.nounClusterRatio = nounClusterCount / tCounts.noun;
// nominalizationRatio and entitySubstitutionRatio :
// TODO - make sure tCounts.noun contains what the python NLTK version calls "pronoun_nonpossessive"
var nominCount = 0;
data.nominalizations.forEach(function(n, sI) { nominCount += n.filter(function(v){ return (v); }).length });
metrics.nominalizationRatio = nominCount / tCounts.noun;
var entitySubCount = 0;
data.entitySubstitutions.forEach(function(n, sI) { entitySubCount += n.filter(function(v){ return (v); }).length });
metrics.entitySubstitutionRatio = entitySubCount / tCounts.noun;
}
// estimate text readability using the Flesch-Kincaid Grade Level test
// TODO short texts ...
if (/*(metrics.wordCount >= 100) &&*/ metrics.wordsPerSentence && metrics.syllablesPerWord) {
metrics.readability = 0.39 * metrics.wordsPerSentence + 11.8 * metrics.syllablesPerWord - 15.59;
}
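// Worked example (illustrative numbers): 9 words per sentence and 1.5
// syllables per word give 0.39*9 + 11.8*1.5 - 15.59 = 5.62, i.e. roughly
// a US 5th-6th grade reading level.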
// count number of stopwords
// stopwordRatio
/* TODO
+ Named-Entities (dynamic) !!!
? rare words / rareWordsRatio
//
# count number of stopwords
data['stopwords'] = [None] * len(tokens)
for idx, word in enumerate(words):
    if word in stopset:
        metrics['stopword_ratio'] += 1
        data['stopwords'][word2token_map[idx]] = True
    else:
        data['stopwords'][word2token_map[idx]] = False
if metrics['wordCount']:
    metrics['stopword_ratio'] /= metrics['wordCount']
# count rare words
if len(words):
    metrics['rare_word_ratio'] = data['expected_word_frequencies'].count(0) / len(words)
else:
    metrics['rare_word_ratio'] = 0
# count word, bigram, and trigram frequencies
// ...
// ???
# fix some verbs ending in -ing being counted as nouns
for idx, token in enumerate(tokens):
    if (token[-3:] == 'ing') and (idx < len(tokens)) and (data['parts_of_speech'][idx+1] == 'IN'):
        data['parts_of_speech'][idx] = 'VBG'
// ??? see below
# find auxiliary verbs
for i in range(verb_group_count):
    verb_group_stack = [idx for idx in range(len(tokens)) if data['verb_groups'][idx] == i+1]
    for j in verb_group_stack[:-1]:
        auxiliary_verbs[j] = True
// ???
data['weak_verbs'][idx] = (data['parts_of_speech'][idx][:2] == 'VB') and (data['stems'][idx] in dict_weak_verbs)
if data['weak_verbs'][idx] and auxiliary_verbs[idx]:
    data['weak_verbs'][idx] = False
*/
stems = null;
return metrics;
}
var toPercent = function(o) {
var percentMetrics = {};
for (var k in o) {
percentMetrics[k.replace('Ratio', 'Percent')] = decimals( (k.indexOf('Ratio')<0) ? o[k] : ((o[k]) ? o[k]*100 : 0) );
}
return percentMetrics;
}
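// Usage example: "...Ratio" keys are renamed to "...Percent" and scaled.
// toPercent({ nounRatio: 0.25, wordCount: 12 }) -> { nounPercent: 25, wordCount: 12 }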
var metrics = calculateMetrics(TEST);
console.log( metrics );
console.log( toPercent(metrics) );
/* appendix, reasoning
NN
"before a modal" //if it's before a modal verb, it's a noun -> lkjsdf would
"determiner-verb" //if it's after a determiner, it's not a verb -> the walk
"capitalised" //it has a capital and isn't first word
"need one verb" //if there no verb in the sentence, there needs to be.
VB
"after an adverb" //if it's after an adverb, it's not a noun -> quickly acked
"ed" //set ambiguous 'ed' endings as either verb/adjective
RB
"consecutive_adjectives" //no consecutive, unpunctuated adjectives -> real good
JJ
"copula-adjective" //copulas are followed by a determiner ("are a .."), or an adjective ("are good")
"copula-adverb-adjective" //copula, adverb, verb -> copula adverb adjective -> is very lkjsdf
UH
"wordless_string" //punctuation - like ' -- ' etc.
CD
"parsefloat" //see if it's a number
---
lex
"lexicon" //known words list
parts_of_speech[wordnet_suffixes[suffix]]
"wordnet suffix" //suffix pos signals from wordnet
r
"regex suffix" // suffix regexes for words
// + last pass
sentence.tokens = sentence.tokens.map(function(token, i) {
var next = sentence.tokens[i + 1]
var prev = sentence.tokens[i - 1]
if (token.pos) {
//suggest noun after determiners (a|the), possessive pronouns (her|my|its)
if (token.pos.tag == "DT" || token.pos.tag == "PP") {
need = 'NN'
reason = token.pos.name
}
//suggest verb after personal pronouns (he|she|they), modal verbs (would|could|should)
if (token.pos.tag == "PRP" || token.pos.tag == "MD") {
need = 'VB'
reason = token.pos.name
}
}
if (need && !token.pos) {
token.pos = parts_of_speech[need]
token.pos_reason = "signal from " + reason
}
if (need == 'VB' && token.pos.parent == 'verb') {
need = null
}
if (need == 'NN' && token.pos.parent == 'noun') {
need = null
}
return token
})
*/
// nlp_compromise by @spencermountain in 2014
// most files are self-contained modules that optionally export for nodejs
// this file loads them all together
// if we're server-side, grab files, otherwise assume they're prepended already
if (typeof module !== "undefined" && module.exports) {
var parents = require("./src/parents/parents")
var sentence_parser = require('./src/methods/tokenization/sentence').sentences;
var tokenize = require('./src/methods/tokenization/tokenize').tokenize;
var ngram = require('./src/methods/tokenization/ngram').ngram;
//tokenize
var normalize = require('./src/methods/transliteration/unicode_normalisation')
var syllables = require('./src/methods/syllables/syllable');
//localization
var local = require('./src/methods/localization/britishize')
var americanize = local.americanize;
var britishize = local.britishize;
//part of speech tagging
var pos = require('./src/pos');
//named_entity_recognition
var spot = require('./src/spot');
//weak verbs, vulgar words etc. TODO - goes to metrics ...
var bl = require('./src/data/blacklist');
}
///
// api
var nlp = {
noun: parents.noun,
adjective: parents.adjective,
verb: parents.verb,
adverb: parents.adverb,
value: parents.value,
sentences: sentence_parser,
ngram: ngram,
tokenize: tokenize,
americanize: americanize,
britishize: britishize,
syllables: syllables,
normalize: normalize.normalize,
denormalize: normalize.denormalize,
pos: pos,
spot: spot,
blacklist: bl
// tests: tests,
};
//export it for server-side
if (typeof module !== "undefined" && module.exports) {
module.exports = nlp;
}
// bump bower
// git tag -a v0.3.5 -m "tag bower release"
// git push origin master --tags
// console.log( nlp.pos('she sells seashells by the seashore').sentences[0].negate().text() )
// console.log( nlp.pos('i will slouch').to_past().text() )