Skip to content

Instantly share code, notes, and snippets.

@psinger
Created September 19, 2015 22:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save psinger/a65b00daf31dc66fa4fa to your computer and use it in GitHub Desktop.
Save psinger/a65b00daf31dc66fa4fa to your computer and use it in GitHub Desktop.
/**
 * UDF entry point: strips URLs from a row's `body` text and emits four
 * readability scores computed on the remaining text.
 *
 * @param {Object} row - Input row; only `row.body` is read. A null or
 *     undefined body is treated as an empty string so that a single missing
 *     value does not raise a TypeError.
 * @param {function(Object)} emit - Callback invoked exactly once with an
 *     object holding `flesch_reading_ease`, `flesch_kincaid_grade`,
 *     `smog_index` and `gunning_fog_index`.
 */
function features(row, emit) {
  // NOTE(review): the "." in the "www." alternative is unescaped, so it
  // matches any character after "www". The pattern is kept byte-identical
  // to preserve the scores produced so far.
  var urlPattern = /(https:[\/][\/]|http:[\/][\/]|www.)[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?\/?([a-zA-Z0-9\-\._\?\,\'\/\\\+&%\$#\=~])*/g;

  var text = row.body == null ? "" : String(row.body);

  // replace() leaves the string untouched when nothing matches, so no
  // separate match() test is needed beforehand.
  var text_no_url = text.replace(urlPattern, "");

  emit({
    flesch_reading_ease: flesch_reading_ease(text_no_url),
    flesch_kincaid_grade: flesch_kincaid_grade(text_no_url),
    smog_index: smog_index(text_no_url),
    gunning_fog_index: gunning_fog_index(text_no_url)
  });
}
/**
 * Counts the words in a string.
 *
 * The string is split on single space characters only; a token counts as a
 * word when something remains after stripping leading and trailing
 * non-word characters (so bare punctuation and empty tokens are ignored).
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Number of non-empty words.
 */
function word_count(s) {
  // All state is local: the previous version assigned to undeclared
  // variables, which leaks globals and throws in strict mode.
  var tokens = s.split(" ");
  var total = 0;
  for (var idx = 0; idx < tokens.length; idx++) {
    if (tokens[idx].replace(/^\W+|\W+$/gm, "").length !== 0) {
      total += 1;
    }
  }
  return total;
}
/**
 * Counts the sentences in a string.
 *
 * A sentence boundary is a ".", "?" or "!" (optionally followed by closing
 * quotes or brackets) that is followed by at least one space. Empty pieces
 * produced by the split are not counted.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Number of non-empty sentences.
 */
function sentence_count(s) {
  // Locals only: the previous version wrote to undeclared globals.
  var pieces = s.split(/ *[\.\?!][\'"\)\]]* +/);
  var total = 0;
  for (var idx = 0; idx < pieces.length; idx++) {
    if (pieces[idx].length !== 0) {
      total += 1;
    }
  }
  return total;
}
// http://eayd.in/?p=232
/**
 * Estimates the number of syllables in one word with a rule-based
 * heuristic: count the vowels (a, e, i, o, u), subtract "discarded" vowels
 * (silent endings, vowel groups) and add syllables for special patterns.
 *
 * Callers in this file pass a lower-cased word with surrounding
 * punctuation already removed; the rules compare against lower-case
 * literals.
 *
 * @param {string} s - The word to analyse.
 * @returns {number} Estimated syllable count; always 1 for words of three
 *     characters or fewer.
 */
function syllable_count_word(s) {
  var word = s;
  var VOWELS = "aeoui";

  // Words whose rule-based count is one too low / one too high.
  var exception_add = ['serious', 'crucial'];
  var exception_del = ['fortunately', 'unfortunately'];
  // "co" + vowel prefixes pronounced as one syllable / as two syllables.
  var co_one = ['cool', 'coach', 'coat', 'coal', 'count', 'coin', 'coarse', 'coup', 'coif', 'cook', 'coign', 'coiffe', 'coof', 'court'];
  var co_two = ['coapt', 'coed', 'coinci'];
  var pre_one = ['preach'];
  var le_except = ['whole', 'mobile', 'pole', 'male', 'female', 'hale', 'pale', 'tale', 'sale', 'aisle', 'whale', 'while'];
  var negative = ["doesn't", "isn't", "shouldn't", "couldn't", "wouldn't"];

  var syls = 0; // syllables added by special-case rules
  var disc = 0; // vowels that must not be counted as syllables

  if (word.length <= 3) {
    return 1;
  }

  // Number of non-overlapping matches of a global regex in the word.
  function countMatches(pattern) {
    var found = word.match(pattern);
    return found === null ? 0 : found.length;
  }

  // Last n characters of the word (the word has at least 4 characters here).
  function ending(n) {
    return word.slice(word.length - n, word.length);
  }

  function isVowel(ch) {
    return VOWELS.indexOf(ch) > -1;
  }

  // True when any of the 4-, 5- or 6-character prefixes is in the list.
  function prefixIn(list) {
    return list.indexOf(word.slice(0, 4)) > -1 ||
      list.indexOf(word.slice(0, 5)) > -1 ||
      list.indexOf(word.slice(0, 6)) > -1;
  }

  var end2 = ending(2);
  var end3 = ending(3);
  var lastChar = word[word.length - 1];
  var vowelPairs = countMatches(/[eaoui][eaoui]/g);
  var vowelTriples = countMatches(/[eaoui][eaoui][eaoui]/g);

  // A trailing "es"/"ed" is silent unless it follows t/s/i ("ted", "tes",
  // "ses", "ied", "ies") or the word has no other vowel group.
  if (end2 == "es" || end2 == "ed") {
    var hasOtherVowelGroup = vowelPairs > 1 || countMatches(/[eaoui][^eaoui]/g) > 1;
    var endingIsVoiced = end3 == "ted" || end3 == "tes" || end3 == "ses" ||
      end3 == "ied" || end3 == "ies";
    if (hasOtherVowelGroup && !endingIsVoiced) {
      disc = disc + 1;
    }
  }

  // A trailing "e" is silent, except in "-le" endings that are not listed
  // in le_except.
  if (lastChar == "e") {
    if (end2 != "le" || le_except.indexOf(word) > -1) {
      disc = disc + 1;
    }
  }

  // Consecutive vowels count as a single syllable.
  disc = disc + vowelPairs + vowelTriples;

  var numVowels = countMatches(/[eaoui]/g);

  if (word.slice(0, 2) == "mc") {
    syls = syls + 1;
  }

  // A final "y" after a consonant forms its own syllable.
  if (lastChar == "y" && !isVowel(word[word.length - 2])) {
    syls = syls + 1;
  }

  // An inner "y" with consonants on both sides acts as a vowel.
  for (var i = 1; i < word.length - 1; i++) {
    if (word[i] == "y" && !isVowel(word[i - 1]) && !isVowel(word[i + 1])) {
      syls = syls + 1;
    }
  }

  // "tri"/"bi" followed by a vowel: prefix and vowel are separate syllables.
  if (word.slice(0, 3) == "tri" && isVowel(word[3])) {
    syls = syls + 1;
  }
  if (word.slice(0, 2) == "bi" && isVowel(word[2])) {
    syls = syls + 1;
  }

  // "-ian" is two syllables except in "-cian" and "-tian".
  if (end3 == "ian") {
    var end4 = ending(4);
    if (end4 != "cian" && end4 != "tian") {
      syls = syls + 1;
    }
  }

  // "co" + vowel adds a syllable unless the prefix is a known one-syllable
  // form; the two-syllable list takes precedence over the one-syllable list.
  if (word.slice(0, 2) == "co" && isVowel(word[2])) {
    if (prefixIn(co_two) || !prefixIn(co_one)) {
      syls = syls + 1;
    }
  }

  // "pre" + vowel adds a syllable unless listed in pre_one.
  if (word.slice(0, 3) == "pre" && isVowel(word[3])) {
    if (pre_one.indexOf(word.slice(0, 6)) == -1) {
      syls = syls + 1;
    }
  }

  // Listed "-n't" contractions carry a syllable with no written vowel.
  if (negative.indexOf(word) > -1) {
    syls = syls + 1;
  }

  // Fix: exception_del words are over-counted by the rules above, so one
  // syllable is removed. The previous code added one instead, giving 6 for
  // "fortunately" rather than 4.
  if (exception_del.indexOf(word) > -1) {
    disc = disc + 1;
  }
  if (exception_add.indexOf(word) > -1) {
    syls = syls + 1;
  }

  return numVowels - disc + syls;
}
/**
 * Sums the estimated syllables of every word in a string.
 *
 * The string is split on single spaces; each token is lower-cased and
 * stripped of leading/trailing non-word characters before it is passed to
 * syllable_count_word. Tokens that become empty are skipped.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Total syllable estimate for the text.
 */
function syllable_count(s) {
  // Locals only: the previous version leaked `syllables`, `i` and `word`
  // as globals.
  var tokens = s.split(" ");
  var total = 0;
  for (var idx = 0; idx < tokens.length; idx++) {
    var cleaned = tokens[idx].toLowerCase().replace(/^\W+|\W+$/gm, "");
    if (cleaned.length !== 0) {
      total = total + syllable_count_word(cleaned);
    }
  }
  return total;
}
/**
 * Counts the words in a string whose estimated syllable count is three or
 * more.
 *
 * Tokenisation matches syllable_count: split on single spaces, lower-case,
 * strip leading/trailing non-word characters, skip empty tokens.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Number of words with at least three syllables.
 */
function polysyllable_count(s) {
  // Locals only: the previous version leaked `polysyllables`, `i` and
  // `word` as globals.
  var tokens = s.split(" ");
  var total = 0;
  for (var idx = 0; idx < tokens.length; idx++) {
    var cleaned = tokens[idx].toLowerCase().replace(/^\W+|\W+$/gm, "");
    if (cleaned.length !== 0 && syllable_count_word(cleaned) >= 3) {
      total = total + 1;
    }
  }
  return total;
}
/**
 * Average number of words per sentence.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} word_count(s) / sentence_count(s); NaN when both counts
 *     are zero (the metric functions in this file test for NaN).
 */
function avg_sentence_length(s) {
  // Locals only: the previous version wrote to the globals `wc` and `sc`.
  var words = word_count(s);
  var sentences = sentence_count(s);
  return words / sentences;
}
/**
 * Average number of estimated syllables per word.
 *
 * @param {string} s - Text to analyse.
 * @returns {number} syllable_count(s) / word_count(s); NaN when both counts
 *     are zero (the metric functions in this file test for NaN).
 */
function avg_syllables_per_word(s) {
  // Locals only: the previous version wrote to the globals `sc` and `wc`,
  // and `sc` is also the name avg_sentence_length used for a different
  // quantity.
  var syllables = syllable_count(s);
  var words = word_count(s);
  return syllables / words;
}
/**
 * Flesch reading-ease score of a text:
 *   206.835 - 1.015 * (words per sentence) - 84.6 * (syllables per word)
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Score clamped to the range [0, 120]. When the score is
 *     NaN (e.g. the text has no words), 120 is returned.
 */
function flesch_reading_ease(s) {
  // Locals only: the previous version wrote to the globals `asl`, `asw`
  // and `flesch`.
  var wordsPerSentence = avg_sentence_length(s);
  var syllablesPerWord = avg_syllables_per_word(s);
  var score = 206.835 - 1.015 * wordsPerSentence - 84.6 * syllablesPerWord;

  if (isNaN(score) || score > 120) {
    return 120;
  }
  if (score < 0) {
    return 0;
  }
  return score;
}
/**
 * Flesch-Kincaid grade level of a text:
 *   0.39 * (words per sentence) + 11.8 * (syllables per word) - 15.59
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Grade clamped to the range [0, 22]. When the grade is
 *     NaN (e.g. the text has no words), 0 is returned.
 */
function flesch_kincaid_grade(s) {
  // Locals only: the previous version wrote to the globals `asl`, `asw`
  // and `flesch_kincaid`.
  var wordsPerSentence = avg_sentence_length(s);
  var syllablesPerWord = avg_syllables_per_word(s);
  var grade = 0.39 * wordsPerSentence + 11.8 * syllablesPerWord - 15.59;

  if (isNaN(grade) || grade < 0) {
    return 0;
  }
  if (grade > 22) {
    return 22;
  }
  return grade;
}
/**
 * SMOG index of a text:
 *   1.043 * sqrt(polysyllables * (30 / sentences)) + 3.1291
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Index capped at 22. When the value is NaN (e.g. zero
 *     polysyllables and zero sentences), 0 is returned.
 */
function smog_index(s) {
  // Locals only: the previous version wrote to the globals `polys`, `ns`
  // and `smog`.
  var polysyllables = polysyllable_count(s);
  var sentences = sentence_count(s);
  var index = 1.043 * Math.sqrt(polysyllables * (30 / sentences)) + 3.1291;

  if (isNaN(index)) {
    return 0;
  }
  if (index > 22) {
    return 22;
  }
  return index;
}
/**
 * Gunning fog index of a text:
 *   0.4 * ((words per sentence) + 100 * (polysyllables / words))
 *
 * @param {string} s - Text to analyse.
 * @returns {number} Index capped at 22. When the value is NaN (e.g. the
 *     text has no words), 0 is returned.
 */
function gunning_fog_index(s) {
  // Locals only: the previous version wrote to the globals `polys`, `nw`,
  // `asl` and `gunning_fog`.
  var polysyllables = polysyllable_count(s);
  var words = word_count(s);
  var wordsPerSentence = avg_sentence_length(s);
  var index = 0.4 * (wordsPerSentence + 100 * (polysyllables / words));

  if (isNaN(index)) {
    return 0;
  }
  if (index > 22) {
    return 22;
  }
  return index;
}
// Register `features` as a UDF. The arguments are, in order: the name of
// the function exported to SQL, the names of the input columns, the output
// columns, and a reference to the JavaScript UDF. The output column names
// match the keys of the object that `features` passes to `emit`.
bigquery.defineFunction(
  'features',
  ['body'],
  [
    { name: 'flesch_reading_ease', type: 'float' },
    { name: 'flesch_kincaid_grade', type: 'float' },
    { name: 'smog_index', type: 'float' },
    { name: 'gunning_fog_index', type: 'float' }
  ],
  features
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment