@alexritter96
Last active March 15, 2018 06:25
import math
from nltk.corpus import cmudict
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize
d = cmudict.dict()
tokenizer = RegexpTokenizer(r'\w+')
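
# NOTE: cmudict.dict() and sent_tokenize() rely on the NLTK 'cmudict' and
# 'punkt' resources; if they are not installed yet, download them once with
# nltk.download('cmudict') and nltk.download('punkt').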

def syl(word):
    # Returns the number of syllables in a word, using the CMU pronouncing
    # dictionary: each phoneme ending in a digit marks a vowel sound.
    try:
        syllable = [len([y for y in x if y[-1].isdigit()])
                    for x in d[word.lower()]]
        return syllable[0]
    except KeyError:
        # Word is not in the CMU dictionary; count it as zero syllables so
        # callers can still sum the list.
        print("'%s' is not in the dictionary" % word)
        return 0

def list_to_word(txt):
    # Tokenizes all words. For each token, the syl function is called.
    # Returns the number of syllables for each token in a list.
    len_syl = []
    words = tokenizer.tokenize(txt)
    for w in words:
        len_syl.append(syl(w))
    return len_syl

def flesch_kincaid(txt, ease=True):
    # Flesch-Kincaid readability. With ease=True, returns the reading-ease
    # score (higher means easier to read); with ease=False, returns the
    # grade-level score (higher means harder, roughly the U.S. school grade
    # needed to understand the text).
    word_count = len(tokenizer.tokenize(txt))
    sent_count = len(sent_tokenize(txt))
    syl_count = sum(list_to_word(txt))
    words_per_sent = word_count / sent_count
    syl_per_word = syl_count / word_count
    if ease:
        return 206.835 - 1.015 * words_per_sent - 84.6 * syl_per_word
    else:
        return 0.39 * words_per_sent + 11.8 * syl_per_word - 15.59

def gunning_fog(txt):
    # Gunning fog index: words with three or more syllables count as "complex".
    word_count = len(tokenizer.tokenize(txt))
    sent_count = len(sent_tokenize(txt))
    complex_count = 0
    for cw in list_to_word(txt):
        if cw >= 3:
            complex_count += 1
    words_per_sent = word_count / sent_count
    complex_per_word = complex_count / word_count
    return 0.4 * (words_per_sent + 100 * complex_per_word)

def smog_index(txt):
    # SMOG grade. For accuracy, the text should contain at least 30 sentences.
    sent_count = len(sent_tokenize(txt))
    poly_count = 0
    for poly in list_to_word(txt):
        if poly >= 3:
            poly_count += 1
    f = poly_count * (30 / sent_count)
    return 1.0430 * math.sqrt(f) + 3.1291

def coleman_liau(txt):
    # Coleman-Liau index: L is the average number of characters per 100 words,
    # S is the average number of sentences per 100 words.
    sent_count = len(sent_tokenize(txt))
    words = tokenizer.tokenize(txt)
    word_count = len(words)
    char_count = sum(len(word) for word in words)
    L = char_count / word_count * 100
    S = sent_count / word_count * 100
    return 0.0588 * L - 0.296 * S - 15.8

def ari(txt):
    # Automated readability index, based on characters per word and words per
    # sentence.
    sent_count = len(sent_tokenize(txt))
    words = tokenizer.tokenize(txt)
    word_count = len(words)
    char_count = sum(len(word) for word in words)
    return 4.71 * (char_count / word_count) + 0.5 * (word_count / sent_count) - 21.43

with open('data.txt', 'r') as f:
    # Join lines with a space so words are not glued together across breaks.
    data = f.read().replace('\n', ' ')

print('Flesch reading ease :', flesch_kincaid(data, ease=True))
print('Gunning fog index   :', gunning_fog(data))
print('SMOG index          :', smog_index(data))
print('Coleman-Liau index  :', coleman_liau(data))
print('ARI                 :', ari(data))
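
# A quick check on an inline sample (a minimal sketch; the sentences below are
# illustrative and are not taken from data.txt):
sample = ("The cat sat on the warm mat. "
          "The old dog ran across the green field and barked at the moon.")
print('sample reading ease :', flesch_kincaid(sample))
print('sample grade level  :', flesch_kincaid(sample, ease=False))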