Skip to content

Instantly share code, notes, and snippets.

@dsottimano
Last active October 29, 2019 16:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dsottimano/2af808b30de1bec51506e0855141dceb to your computer and use it in GitHub Desktop.
Save dsottimano/2af808b30de1bec51506e0855141dceb to your computer and use it in GitHub Desktop.
Keyword frequency table generator - apps script - google sheets
/**
* Returns a table of n-grams (word sequences) and how often they occur in a text.
*
* The text is lower-cased and every character other than a-z, apostrophe and
* hyphen is treated as a word separator. For each n-gram length from 1 up to
* numberOfWords, the n-grams occurring at least numberOccurances times are
* listed by descending count, under a header row of the form
* ["2 words", "Count", "Relativity"]. "Relativity" is the count divided by the
* total number of words, as a percentage rounded to two decimals.
*
* @param {"cars are the best"} textArray REQUIRED The corpus you want statistics from (a string or a range of cells)
* @param {"3"} numberOccurances OPTIONAL Show results with at least X occurrences. Default is 2
* @param {"4"} numberOfWords OPTIONAL Show statistics for one to X words. Default is 5
* @param {"false"} removeStopWords OPTIONAL true or false. False by default
* @return A two-dimensional array of rows (three columns each), the string
*         "Sorry, not enough data" when nothing reaches the threshold, or the
*         caught error object if processing fails.
* @customfunction
*/
function KEYWORD_FREQUENCY_TABLE(textArray, numberOccurances, numberOfWords, removeStopWords) {
  try {
    // A range arrives as a (nested) array of cell values. Join with a space so
    // the last word of one cell does not fuse with the first word of the next.
    let rawText;
    if (Array.isArray(textArray)) {
      rawText = textArray.flat(Infinity).join(' ');
    } else {
      // Numbers, booleans and empty arguments are coerced instead of throwing.
      rawText = textArray == null ? '' : String(textArray);
    }

    // Falsy or non-numeric thresholds fall back to the default of 2.
    const atLeast = Number(numberOccurances) || 2;

    // Fractions round up; anything below 1 (or non-numeric) falls back to 5.
    const requestedWords = Math.ceil(Number(numberOfWords));
    let maxWords = requestedWords >= 1 ? requestedWords : 5;

    // Anything that is not a letter, apostrophe or hyphen becomes a separator.
    const disallowedChars = /[^a-zA-Z'\-]+/g;
    let cleaned = rawText.replace(disallowedChars, ' ').trim().toLowerCase();

    // Stop words are removed AFTER normalisation so that "The" and "the,"
    // are recognised as the stop word "the".
    if (removeStopWords) cleaned = String(remove_stopwords(cleaned));

    // Drop empty strings so that empty input does not produce a phantom "" token.
    const tokens = cleaned.split(/\s+/).filter(Boolean);
    const totalWords = tokens.length;
    if (totalWords === 0) return "Sorry, not enough data";

    // An n-gram cannot be longer than the text itself.
    maxWords = Math.min(maxWords, totalWords);

    // counts[n] maps each n-gram of length n to its number of occurrences.
    // Maps are used instead of plain objects so that tokens such as
    // "constructor" do not collide with properties inherited from Object.prototype.
    const counts = [null]; // index 0 is unused: there is no zero-word n-gram
    for (let size = 1; size <= maxWords; size++) {
      counts.push(new Map());
    }

    for (let start = 0; start < totalWords; start++) {
      let gram = '';
      // Grow the n-gram one word at a time until the size limit or the end of the text.
      for (let size = 1; size <= maxWords && start + size <= totalWords; size++) {
        gram = size === 1 ? tokens[start] : gram + ' ' + tokens[start + size - 1];
        counts[size].set(gram, (counts[size].get(gram) || 0) + 1);
      }
    }

    const byCountDescending = (x, y) => y.count - x.count;
    const output = [];

    for (let size = 1; size <= maxWords; size++) {
      const rows = [];
      counts[size].forEach((count, word) => {
        if (count >= atLeast) rows.push({ word: word, count: count });
      });
      rows.sort(byCountDescending);

      if (rows.length) {
        // Blank spacer rows separate the sections; each is a fresh array of empty cells.
        if (size > 1) output.push([null, null, null]);
        output.push([size + ' word' + (size === 1 ? "" : "s"), "Count", "Relativity"]);
        output.push([null, null, null]);
      }
      for (const row of rows) {
        output.push([row.word, row.count, Math.round(row.count / totalWords * 10000) / 100]);
      }
    }

    if (output.length < 1) return "Sorry, not enough data";
    return output;
  } catch (e) {
    // Failures are logged and handed back to the caller rather than rethrown.
    Logger.log(e);
    return e;
  }
}
//https://stackoverflow.com/a/57153507/2121455
/**
 * Removes common English stop words from a piece of text.
 *
 * The text is split on any whitespace (spaces, tabs, newlines). A word is
 * dropped when it equals a stop word, ignoring case; the words that are kept
 * retain their original casing. Punctuation attached to a word (e.g. "the,")
 * prevents a match, so callers should strip punctuation first if needed.
 *
 * @param {*} str Text to filter. Non-string values are converted to a string;
 *                null and undefined are treated as empty text.
 * @return {string} The remaining words joined by single spaces.
 */
function remove_stopwords(str) {
  // A Set gives constant-time membership checks for every word tested.
  const stopwords = new Set(['i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves','he','him','his','himself','she','her','hers','herself','it','its','itself','they','them','their','theirs','themselves','what','which','who','whom','this','that','these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same','so','than','too','very','s','t','can','will','just','don','should','now']);
  const text = str == null ? '' : String(str);
  return text
    .split(/\s+/)
    // Skip the empty strings produced by leading/trailing whitespace.
    .filter((word) => word !== '' && !stopwords.has(word.toLowerCase()))
    .join(' ');
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment