import math
from nltk.corpus import cmudict
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize

# Requires NLTK data: nltk.download('cmudict') and nltk.download('punkt')
d = cmudict.dict()
tokenizer = RegexpTokenizer(r'\w+')
def syl(word):
    # Returns the number of syllables in a word, using the CMU Pronouncing
    # Dictionary: vowel phonemes end in a stress digit, so counting them
    # counts syllables. Only the first listed pronunciation is used.
    try:
        syllable = [len(list(y for y in x if y[-1].isdigit()))
                    for x in d[word.lower()]]
        return syllable[0]
    except KeyError:
        # Return 0 instead of None so sum() over the results doesn't crash.
        print("'{}' is not in the CMU dictionary".format(word))
        return 0
def list_to_word(txt):
    # Tokenizes the text into words and calls syl() on each token.
    # Returns a list with the syllable count of every token.
    len_syl = []
    words = tokenizer.tokenize(txt)
    for w in words:
        len_syl.append(syl(w))
    return len_syl
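# Example (my own illustration, assuming these words are in cmudict):
#   list_to_word("Python is fun")  ->  [2, 1, 1]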
def flesch_kincaid(txt, ease=True):
    # With ease=True, returns the Flesch Reading Ease score: higher scores
    # indicate easier text, lower scores more complex text.
    # With ease=False, returns the Flesch-Kincaid Grade Level, an estimate
    # of the US school grade needed to understand the text.
    word_count = len(tokenizer.tokenize(txt))
    sent_count = len(sent_tokenize(txt))
    syl_count = sum(list_to_word(txt))
    TWS = word_count / sent_count   # average words per sentence
    TSW = syl_count / word_count    # average syllables per word
    if ease:
        return 206.835 - 1.015 * TWS - 84.6 * TSW
    else:
        return 0.39 * TWS + 11.8 * TSW - 15.59
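# Quick sanity check (illustrative numbers, not from the original gist):
# a text with 100 words, 5 sentences, and 140 syllables gives
#   TWS = 100/5 = 20.0, TSW = 140/100 = 1.4
#   Reading Ease = 206.835 - 1.015*20 - 84.6*1.4  ≈ 68.1  (plain English)
#   Grade Level  = 0.39*20 + 11.8*1.4 - 15.59     ≈ 8.7   (about 9th grade)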
def gunning_fog(txt):
    # Gunning fog index: estimates the years of formal education needed to
    # understand the text. Here any word of 3+ syllables counts as "complex";
    # the full definition also excludes proper nouns, compound words, and
    # common suffixes such as -es and -ed.
    word_count = len(tokenizer.tokenize(txt))
    sent_count = len(sent_tokenize(txt))
    complex_words = [cw for cw in list_to_word(txt) if cw >= 3]
    TWS = word_count / sent_count            # average words per sentence
    CWW = len(complex_words) / word_count    # share of complex words
    return 0.4 * (TWS + 100 * CWW)
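# Illustration (my numbers): 100 words, 5 sentences, 10 complex words gives
#   0.4 * (100/5 + 100 * 10/100) = 0.4 * (20 + 10) = 12.0,
# i.e. roughly a high-school-senior reading level.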
def smog_index(txt):
    # SMOG grade: based on the count of polysyllabic (3+ syllable) words,
    # normalized to a 30-sentence sample. For accuracy the text should
    # contain at least 30 sentences.
    sent_count = len(sent_tokenize(txt))
    poly_syl = [poly for poly in list_to_word(txt) if poly >= 3]
    f = len(poly_syl) * (30 / sent_count)
    return 1.0430 * math.sqrt(f) + 3.1291
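# Illustration (my numbers): 30 sentences with 45 polysyllabic words gives
#   f = 45 * (30/30) = 45, so 1.0430 * sqrt(45) + 3.1291 ≈ 10.1,
# i.e. roughly a 10th-grade reading level.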
def coleman_liau(txt):
    # Coleman-Liau index: uses characters per word instead of syllables,
    # so it does not depend on the CMU dictionary.
    sent_count = len(sent_tokenize(txt))
    words = tokenizer.tokenize(txt)
    word_count = len(words)
    char_count = sum(len(word) for word in words)
    L = char_count / word_count * 100    # letters per 100 words
    S = sent_count / word_count * 100    # sentences per 100 words
    return 0.0588 * L - 0.296 * S - 15.8
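# Illustration (my numbers): 450 characters, 100 words, 5 sentences gives
#   L = 450, S = 5, so 0.0588*450 - 0.296*5 - 15.8 ≈ 9.2 (about 9th grade).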
def ari(txt):
    # Automated Readability Index: character-based like Coleman-Liau;
    # approximates the US grade level needed to read the text.
    sent_count = len(sent_tokenize(txt))
    words = tokenizer.tokenize(txt)
    word_count = len(words)
    char_count = sum(len(word) for word in words)
    return 4.71 * (char_count / word_count) + 0.5 * (word_count / sent_count) - 21.43
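# Illustration (my numbers): 450 characters, 100 words, 5 sentences gives
#   4.71*(450/100) + 0.5*(100/5) - 21.43 ≈ 9.8 (about 10th grade).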
with open('data.txt', 'r') as f:
    # Replace newlines with spaces (not '') so words on adjacent lines
    # don't get glued together.
    data = f.read().replace('\n', ' ')

print(flesch_kincaid(data, ease=True))
print(gunning_fog(data))
print(smog_index(data))
print(coleman_liau(data))
print(ari(data))