@bmcminn
Last active March 3, 2020 21:32
WIP: This script makes some sweeping generalizations about your page content, but it gives you a good rule-of-thumb approach to determining the prevalence of certain keywords in your writing.

Keyword Density Estimator

This is an overly simplistic and highly opinionated take on keyword density; it does not reflect any actual research into the nuances of SEO content optimization. You have been warned.

Getting started

Eventually this will be an NPM module, so things like npm i keyword-checker will work :P
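
Until then, the simplest way to try it is to save the module below as keywords.js next to your script and require it by relative path, exactly as the usage example does:

// assumes you saved the module shown below as keywords.js in the same folder
var keywords = require('./keywords');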

Settings

  • topWordCount: Returns the N most used words in your content. (default: 0, which returns all words)
  • ignore: An array of words to ignore in the calculations; these are appended to the built-in ignore list. (default: [])
  • ignoreDefaults: Flag to skip the built-in ignore word list and use only your own words. (default: false; see the example below)
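
For example (a sketch only; pageContent stands in for whatever string of text you want to analyze), a call that uses all three settings might look like this:

var keywords = require('./keywords');

// pageContent is assumed to be a string containing your page's text
var stats = keywords(pageContent, {
    topWordCount: 10        // only return the 10 most used words
,   ignoreDefaults: true    // drop the built-in ignore list...
,   ignore: [               // ...and ignore only these words instead
        'lorem'
    ,   'ipsum'
    ]
});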

Usage

Here is a basic use case wherein I read a sample file and parse its contents to determine its keyword density; the shape of each logged entry is shown after the code.

// devise our callback for when we read the file
function parseFileKeywords(err, content) {
    if (err) {
        console.error(err);
        return;
    }

    // run the keyword parser on our file contents
    var keywordsList = keywords(content, {
        ignore: [
            'word'
        ,   'pants'
        ,   'html5'
        ,   '2013'
        ,   'after'
        ]
    });

    // echo the keyword stats to the console
    keywordsList.forEach(function(word) {
        console.log(JSON.stringify(word));
    });
}


// require our module dependencies
var fs          = require('fs')
,   keywords    = require('./keywords')
;


fs.readFile('./readme.md', { encoding: 'utf8' }, parseFileKeywords);
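
Each entry in keywordsList is a plain object built by keywords.js below, so every logged line should look roughly like this (the numbers are made up for illustration):

{"word":"keyword","count":12,"density":"1.85%","relativeDensity":"4.32%"}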

TODO

  • Come up with a better name for this module
keywords.js

var _       = require('lodash')
,   chalk   = require('chalk')
;


var keywordAnalysis = function keywordAnalysis(content, options) {

    // configure options
    options = options || {};

    var BASE_OPTIONS = {
        topWordCount: 0         // returns top N words from parsed list, 0 for all words
    ,   ignore: []              // array list of words to ignore (appended to end of default ignore list)
    ,   ignoreDefaults: false   // forces module to only use custom ignore word list
    };

    // compose new options object with overrides
    options = Object.assign({}, BASE_OPTIONS, options);

    // configure ignored words list
    var BASE_IGNORE_WORDS = [
        'a'
    ,   'an'
    ,   'and'
    ,   'are'
    ,   'as'
    ,   'at'
    ,   'be'
    ,   'buy'
    ,   'by'
    ,   'can'
    ,   'cant'
    ,   'didnt'
    ,   'do'
    ,   'does'
    ,   'dont'
    ,   'each'
    ,   'else'
    ,   'etc'
    ,   'for'
    ,   'did'
    ,   'from'
    ,   'gave'
    ,   'get'
    ,   'h1'
    ,   'h2'
    ,   'h3'
    ,   'h4'
    ,   'h5'
    ,   'h6'
    ,   'h7'
    ,   'has'
    ,   'have'
    ,   'he'
    ,   'how'
    ,   'if'
    ,   'in'
    ,   'is'
    ,   'isnt'
    ,   'it'
    ,   'its'
    ,   'me'
    ,   'most'
    ,   'much'
    ,   'my'
    ,   'not'
    ,   'of'
    ,   'on'
    ,   'or'
    ,   'other'
    ,   'our'
    ,   'ours'
    ,   'out'
    ,   'own'
    ,   'p'
    ,   'put'
    ,   'puts'
    ,   'set'
    ,   'she'
    ,   'should'
    ,   'so'
    ,   'some'
    ,   'span'
    ,   't'
    ,   'than'
    ,   'that'
    ,   'the'
    ,   'their'
    ,   'theirs'
    ,   'then'
    ,   'they'
    ,   'theyre'
    ,   'to'
    ,   'us'
    ,   'use'
    ,   'via'
    ,   'was'
    ,   'we'
    ,   'were'
    ,   'what'
    ,   'who'
    ,   'whose'
    ,   'wont'
    ,   'you'
    ,   'your'
    ,   'youre'
    ];

    // if we're overriding the default ignore word list
    if (options.ignore.length > 0 && options.ignoreDefaults) {
        BASE_IGNORE_WORDS = [];
    }

    // merge our ignore lists
    options.ignore = BASE_IGNORE_WORDS.concat(options.ignore);

    /**
     * Removes duplicates, ensuring unique values.
     * @sauce  http://stackoverflow.com/a/1961068
     * @return {Array} the list with duplicate entries removed
     */
    function uniq(list) {
        var u = {}, a = [];
        for (var i = 0, l = list.length; i < l; ++i) {
            if (u.hasOwnProperty(list[i])) {
                continue;
            }
            a.push(list[i]);
            u[list[i]] = 1;
        }
        return a;
    }

    // strip duplicate words
    options.ignore = uniq(options.ignore);

    // we always need to ignore empty "words" upfront
    options.ignore.unshift('', '');
    options.ignore.push('');

    // convert our ignored word list to a string with each word wrapped by a pipe (|)
    // the pipe (|) makes string partial checks much more accurate to avoid greedy matches
    // (ex: |use|user| === |use| vs |user|)
    options.ignore = options.ignore.join('|');

    // parse content into a flat list of words
    var contentWords = content
        .replace(/https?:\/\/\S+/gi, ' ')                           // strip URLs before slashes get normalized away
        .replace(/[\s\t\\\/]{1,}/gi, ' ')                           // normalize white space, slashes, and backslashes
        .replace(/(\S)-(\S)/gi, '$1 $2')                            // break up hyphenated words
        .replace(/[(){}[\]+\—\-_!@#$%^&*.,?'":“”‘’><;`~]/gi, '')    // strip punctuation
        .replace(/\s{2,}/gi, ' ')                                   // collapse repeated spaces
        .split(' ')
    ;

    var wordCounts = {};

    // for each word in our content
    for (var i = contentWords.length - 1; i >= 0; i--) {
        var word = contentWords[i].toLowerCase();

        // ignore it if it's in our ignore list
        if (options.ignore.indexOf('|' + word + '|') > -1) {
            continue;
        }

        // add the word to our index if it doesn't exist already
        if (!wordCounts[word]) {
            wordCounts[word] = {
                word: word
            ,   count: 0
            };
        }

        // increment word count
        wordCounts[word].count++;
    }

    // determine each word's density (share of all parsed words) and its density
    // relative to the number of distinct, non-ignored words
    _.map(wordCounts, function(word) {
        var count = word.count;
        word.density         = (count / contentWords.length * 100).toFixed(2) + '%';
        word.relativeDensity = (count / _.size(wordCounts) * 100).toFixed(2) + '%';
        return word;
    });

    // sort our data by descending count
    wordCounts = _.reverse(_.sortBy(wordCounts, ['count']));

    if (options.topWordCount > 0) {
        wordCounts = _.slice(wordCounts, 0, options.topWordCount);
    }

    return wordCounts;
};

module.exports = exports = keywordAnalysis;

Readability estimators

/**
 * Estimates the total syllable count of a string of text using regex heuristics.
 * @sauce  https://github.com/EndaHallahan/syllabificate/blob/master/index.js
 * @param  {string} inString  text to analyze
 * @return {number}           total syllable count
 */
function countSyllables(inString) {
    let syllablesTotal = 0;
    let wordList = inString.match(/(?:(?:\w-\w)|[\wÀ-ÿ'’])+/g);
    if (wordList) {wordList.forEach((word) => {
        if (word === "'"||word==="’") {return;} //bandaid solution.
        if (word.length <= 2) {syllablesTotal += 1; return;} //quick return on short words
        let syllables = 0;
        if (word.endsWith("s'")||word.endsWith("s’")) {word = word.slice(0, -1);} //ending with s'
        if (word.endsWith("s's")||word.endsWith("s’s")) {word = word.slice(0, -2);} //ending with s's
        const cEndings = word.match(/(?<=\w{3})(side|\wess|(?<!ed)ly|ment|ship|board|ground|(?<![^u]de)ville|port|ful(ly)?|berry|box|nesse?|such|m[ae]n|wom[ae]n|anne)s?$/mi);
        if (cEndings) {word = word.replace(cEndings[0],"\n" + cEndings[0]);} //Splits into two words and evaluates them as such
        const cBeginnings = word.match(/^(ware|side(?![sd]$)|p?re(?!ach|agan|al|au)|[rf]ace(?!([sd]|tte)$)|place[^nsd])/mi);
        if (cBeginnings) {word = word.replace(cBeginnings[0],""); syllables++;}
        const esylp = word.match(/ie($|l|t|rg)|([cb]|tt|pp)le$|phe$|kle(s|$)|[^n]scien|sue|aybe$|[^aeiou]shed|[^lsoai]les$|([^e]r|g)ge$|(gg|ck|yw|etch)ed$|(sc|o)he$|seer|^re[eiuy]/gmi);
        if (esylp) {syllables += esylp.length;} //E clustered positive
        const esylm = word.match(/every|some([^aeiouyr]|$)|[^trb]ere(?!d|$|o|r|t|a[^v]|n|s|x)|[^g]eous|niet/gmi);
        if (esylm) {syllables -= esylm.length;} //E clustered negative
        const isylp = word.match(/rie[^sndfvtl]|(?<=^|[^tcs]|st)ia|siai|[^ct]ious|quie|[lk]ier|settli|[^cn]ien[^d]|[aeio]ing$|dei[tf]|isms?$/gmi);
        if (isylp) {syllables += isylp.length;} //I clustered positive
        const osylp = word.match(/nyo|osm(s$|$)|oinc|ored(?!$)|(^|[^ts])io|oale|[aeiou]yoe|^m[ia]cro([aiouy]|e)|roe(v|$)|ouel|^proa|oolog/gmi);
        if (osylp) {syllables += osylp.length;} //O clustered positive
        const osylm = word.match(/[^f]ore(?!$|[vcaot]|d$|tte)|fore|llio/gmi);
        if (osylm) {syllables -= osylm.length;} //O clustered negative
        const asylp = word.match(/asm(s$|$)|ausea|oa$|anti[aeiou]|raor|intra[ou]|iae|ahe$|dais|(?<!p)ea(l(?!m)|$)|(?<!j)ean|(?<!il)eage/gmi);
        if (asylp) {syllables += asylp.length;} //A clustered positive
        const asylm = word.match(/aste(?!$|ful|s$|r)|[^r]ared$/gmi);
        if (asylm) {syllables -= asylm.length;} //A clustered negative
        const usylp = word.match(/uo[^y]|[^gq]ua(?!r)|uen|[^g]iu|uis(?![aeiou]|se)|ou(et|ille)|eu(ing|er)|uye[dh]|nuine|ucle[aeiuy]/gmi);
        if (usylp) {syllables += usylp.length;} //U clustered positive
        const usylm = word.match(/geous|busi|logu(?!e|i)/gmi);
        if (usylm) {syllables -= usylm.length;} //U clustered negative
        const ysylp = word.match(/[ibcmrluhp]ya|nyac|[^e]yo|[aiou]y[aiou]|[aoruhm]ye(tt|l|n|v|z)|pye|dy[ae]|oye[exu]|lye[nlrs]|olye|aye(k|r|$|u[xr]|da)|saye\w|iye|wy[ae]|[^aiou]ying/gmi);
        if (ysylp) {syllables += ysylp.length;} //Y clustered positive
        const ysylm = word.match(/arley|key|ney$/gmi);
        if (ysylm) {syllables -= ysylm.length;} //Y clustered negative
        const essuffix = word.match(/((?<!c[hrl]|sh|[iszxgej]|[niauery]c|do)es$)/gmi);
        if (essuffix) {syllables--;} //es suffix
        const edsuffix = word.match(/([aeiouy][^aeiouyrdt]|[^aeiouy][^laeiouyrdtbm]|ll|bb|ield|[ou]rb)ed$|[^cbda]red$/gmi);
        if (edsuffix) {syllables--;} //ed suffix
        const csylp = word.match(/chn[^eai]|mc|thm/gmi);
        if (csylp) {syllables += csylp.length;} //Consonant clustered positive
        const eVowels = word.match(/[aiouy](?![aeiouy])|ee|e(?!$|-|[iua])/gmi);
        if (eVowels) {syllables += eVowels.length;} //Applicable vowel count (all but e at end of word)
        if (syllables <= 0) {syllables = 1;} //catch-all
        if (word.match(/[^aeiou]n['’]t$/i)) {syllables ++;} //ending in n't, but not en't
        if (word.match(/en['’]t$/i)) {syllables --;} //ending in en't
        syllablesTotal += syllables;
    });}
    return syllablesTotal;
}

/**
 * Counts the polysyllabic words (three or more syllables) in a string of text.
 * @sauce  https://github.com/EndaHallahan/syllabificate/blob/master/index.js
 * @param  {string} inString  text to analyze
 * @return {number}           count of polysyllabic words
 */
function countPolys(inString) {
    let polysTotal = 0;
    let wordList = inString.match(/(?:(?:\w-\w)|[\wÀ-ÿ'’])+/g);
    if (wordList) {wordList.forEach((word) => {
        if (word === "'"||word==="’") {return;} //bandaid solution.
        if (word.length <= 3) {return;} //quick return on short words
        let syllables = 0;
        if (word.endsWith("s'")||word.endsWith("s’")) {word = word.slice(0, -1);} //ending with s'
        if (word.endsWith("s's")||word.endsWith("s’s")) {word = word.slice(0, -2);} //ending with s's
        const cEndings = word.match(/(?<=\w{3})(side|\wess|(?<!ed)ly|ment|ship|board|ground|(?<![^u]de)ville|port|ful(ly)?|berry|box|nesse?|such|m[ae]n|wom[ae]n|horse|anne)s?$/mi);
        if (cEndings) {word = word.replace(cEndings[0],"\n" + cEndings[0]);} //Splits into two words and evaluates them as such
        const cBeginnings = word.match(/^(ware|side|p?re(?!ach|agan|al|au))/mi);
        if (cBeginnings) {word = word.replace(cBeginnings[0],""); syllables++;}
        const esylp = word.match(/ie($|l|t|rg)|([cb]|tt|pp)le$|phe$|kle(s|$)|[^n]scien|sue|aybe$|[^aeiou]shed|[^lsoai]les$|([^e]r|g)ge$|(gg|ck|yw|etch)ed$|(sc|o)he$|seer|^re[eiuy]/gmi);
        if (esylp) {syllables += esylp.length;} //E clustered positive
        const esylm = word.match(/every|some([^aeiouyr]|$)|[^trb]ere(?!d|$|o|r|t|a[^v]|n|s|x)|[^g]eous|niet/gmi);
        if (esylm) {syllables -= esylm.length;} //E clustered negative
        const isylp = word.match(/rie[^sndfvtl]|(?<=^|[^tcs]|st)ia|siai|[^ct]ious|quie|[lk]ier|settli|[^cn]ien[^d]|[aeio]ing$|dei[tf]|isms?$/gmi);
        if (isylp) {syllables += isylp.length;} //I clustered positive
        const osylp = word.match(/nyo|osm(s$|$)|oinc|ored(?!$)|(^|[^ts])io|oale|[aeiou]yoe|^m[ia]cro([aiouy]|e)|roe(v|$)|ouel|^proa|oolog/gmi);
        if (osylp) {syllables += osylp.length;} //O clustered positive
        const osylm = word.match(/[^f]ore(?!$|[vcaot]|d$|tte)|fore|llio/gmi);
        if (osylm) {syllables -= osylm.length;} //O clustered negative
        const asylp = word.match(/asm(s$|$)|ausea|oa$|anti[aeiou]|raor|intra[ou]|iae|ahe$|dais|(?<!p)ea(l(?!m)|$)|(?<!j)ean|(?<!il)eage/gmi);
        if (asylp) {syllables += asylp.length;} //A clustered positive
        const asylm = word.match(/aste(?!$|ful|s$|r)|[^r]ared$/gmi);
        if (asylm) {syllables -= asylm.length;} //A clustered negative
        const usylp = word.match(/uo[^y]|[^gq]ua(?!r)|uen|[^g]iu|uis(?![aeiou]|se)|ou(et|ille)|eu(ing|er)|uye[dh]|nuine|ucle[aeiuy]/gmi);
        if (usylp) {syllables += usylp.length;} //U clustered positive
        const usylm = word.match(/geous|busi|logu(?!e|i)/gmi);
        if (usylm) {syllables -= usylm.length;} //U clustered negative
        const ysylp = word.match(/[ibcmrluhp]ya|nyac|[^e]yo|[aiou]y[aiou]|[aoruhm]ye(tt|l|n|v|z)|pye|dy[ae]|oye[exu]|lye[nlrs]|olye|aye(k|r|$|u[xr]|da)|saye\w|iye|wy[ae]|[^aiou]ying/gmi);
        if (ysylp) {syllables += ysylp.length;} //Y clustered positive
        const ysylm = word.match(/arley|key|ney$/gmi);
        if (ysylm) {syllables -= ysylm.length;} //Y clustered negative
        const essuffix = word.match(/((?<!c[hrl]|sh|[iszxgej]|[niauery]c|do)es$)/gmi);
        if (essuffix) {syllables--;} //es suffix
        const edsuffix = word.match(/([aeiouy][^aeiouyrdt]|[^aeiouy][^laeiouyrdtbm]|ll|bb|ield|[ou]rb)ed$|[^cbda]red$/gmi);
        if (edsuffix) {syllables--;} //ed suffix
        const csylp = word.match(/chn[^eai]|mc|thm/gmi);
        if (csylp) {syllables += csylp.length;} //Consonant clustered positive
        const eVowels = word.match(/[aiouy](?![aeiouy])|ee|e(?!$|-|[iua])/gmi);
        if (eVowels) {syllables += eVowels.length;} //Applicable vowel count (all but e at end of word)
        if (syllables <= 0) {syllables = 1;} //catch-all
        if (word.match(/[^aeiou]n['’]t$/i)) {syllables ++;} //ending in n't, but not en't
        if (word.match(/en['’]t$/i)) {syllables --;} //ending in en't
        if (syllables >= 3) {polysTotal++;}
    });}
    return polysTotal;
}

function countLetters(text) {
    return text.replace(/[^\w\d]/gi, '').length
}

function countWords(text) {
    // split on runs of whitespace to count words
    return text.trim().split(/\s+/).length
}

function countSentences(text) {
    // rough heuristic: split on sentence-ending punctuation and skip empty trailing segments
    return text.split(/[.!?]+/).filter(s => s.trim().length > 0).length
}

function countComplexWords(text) {
    let words = text.trim().split(/\s+/)
    // words of 3+ syllables, not including common suffixes (-es, -ed, -ing)
    return words.filter(el => countSyllables(el) > 2).length
}

/**
 * Automated Readability Index: approximates the US grade level needed to comprehend the text.
 * @sauce  https://en.wikipedia.org/wiki/Automated_readability_index
 * @param  {string} text  text to analyze
 * @return {Array}        [ score, grade ]
 */
function ARI(text) {
    let letters = countLetters(text)
    let words = countWords(text)
    let sentences = countSentences(text)
    const A = (letters / words)
    const B = (words / sentences)
    const score = Math.round((4.71 * A) + (0.5 * B) - 21.43)
    const grade = score // ARI reports a grade level directly
    return [ score, grade ]
}

/**
 * Flesch-Kincaid reading ease score, mapped to an approximate US grade level.
 * @sauce  https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
 * @param  {string} text  text to analyze
 * @return {Array}        [ score, grade ]
 */
function FleschKincaid(text) {
    let words = countWords(text)
    let sentences = countSentences(text)
    let syllables = countSyllables(text)
    const A = (words / sentences)
    const B = (syllables / words)
    const score = Math.round(206.835 - (1.015 * A) - (84.6 * B))
    let grade = 0
    if (score >= 90) { grade = 5 }
    else if ( 90 > score && score >= 80) { grade = 6 }
    else if ( 80 > score && score >= 70) { grade = 7 }
    else if ( 70 > score && score >= 65) { grade = 8 }
    else if ( 65 > score && score >= 60) { grade = 9 }
    else if ( 60 > score && score >= 57) { grade = 10 }
    else if ( 57 > score && score >= 53) { grade = 11 }
    else if ( 53 > score && score >= 50) { grade = 12 }
    else if ( 50 > score && score >= 40) { grade = 13 }
    else if ( 40 > score && score >= 30) { grade = 14 }
    else if ( 30 > score && score >= 20) { grade = 15 }
    else if ( 20 > score && score >= 10) { grade = 16 }
    else if ( 10 > score && score >= 0) { grade = 17 }
    return [ score, grade ]
}

/**
 * Gunning fog index: estimates the years of formal education needed to understand the text.
 * @sauce  https://en.wikipedia.org/wiki/Gunning_fog_index
 * @param  {string} text  text to analyze
 * @return {Array}        [ score, grade ]
 */
function GunningFogIndex(text) {
    let sentences = countSentences(text)
    let words = countWords(text)
    let complexWords = countComplexWords(text)
    // TODO: resolve missing criteria of max 100 words with complete sentences
    const A = (words / sentences)
    const B = (complexWords / words)
    const score = Math.round(0.4 * (A + (100 * B)))
    const grade = score
    return [ score, grade ]
}

/**
 * SMOG grade: estimates the years of education needed to understand the text.
 * @sauce  https://en.wikipedia.org/wiki/SMOG
 * @param  {string} text  text to analyze
 * @return {Array}        [ score, grade ]
 */
function SMOG(text) {
    const polysyllables = countPolys(text)
    const sentences = countSentences(text)
    // TODO: resolve missing criteria of max 100 words with complete sentences
    const A = (30 / sentences)
    const score = Math.round(1.0430 * Math.sqrt(polysyllables * A) + 3.1291)
    const grade = score
    return [ score, grade ]
}

/**
 * Fry readability formula (not yet implemented).
 * @sauce  https://en.wikipedia.org/wiki/Fry_readability_formula
 * @param  {string} text  text to analyze
 * @return {Array}        [ score, grade ]
 */
function FryIndex(text) {
    let letters = countLetters(text)
    let words = countWords(text)
    let sentences = countSentences(text)
    const L = (letters / words * 100)
    const S = (sentences / words * 100)
    // TODO: implement this algorithm; need some calculus to derive the overlaps in the Fry graph
    const score = null
    const grade = score
    return [ score, grade ]
}

/**
 * Coleman-Liau index: approximates the US grade level needed to comprehend the text.
 * @sauce  https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
 * @param  {string} text  text to analyze
 * @return {Array}        [ score, grade ]
 */
function ColemanLiauIndex(text) {
    let letters = countLetters(text)
    let words = countWords(text)
    let sentences = countSentences(text)
    const L = (letters / words * 100)   // average number of letters per 100 words
    const S = (sentences / words * 100) // average number of sentences per 100 words
    const score = Math.round( (0.0588 * L) - (0.296 * S) - 15.8 )
    const grade = score
    return [ score, grade ]
}

var text = "The automated readability index (ARI) is a readability test for English texts, designed to gauge the understandability of a text. Like the Flesch–Kincaid grade level, Gunning fog index, SMOG index, Fry readability formula, and Coleman–Liau index, it produces an approximate representation of the US grade level needed to comprehend the text."
console.log('ARI :', ARI(text))
console.log('FleschKincaid :', FleschKincaid(text))
console.log('GunningFogIndex :', GunningFogIndex(text))
console.log('SMOG :', SMOG(text))
console.log('ColemanLiauIndex :', ColemanLiauIndex(text))
console.log('FryIndex :', FryIndex(text))