@alexritter96
Last active March 15, 2018 06:25
import math
from nltk.corpus import cmudict
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize
d = cmudict.dict()
tokenizer = RegexpTokenizer(r'\w+')
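
# NOTE: cmudict.dict() and sent_tokenize() rely on the NLTK 'cmudict' and
# 'punkt' resources; if they are not installed yet, download them once with
# nltk.download('cmudict') and nltk.download('punkt').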

def syl(word):
    # Returns the number of syllables in a word, using the CMU pronouncing
    # dictionary: each phoneme ending in a digit marks a vowel sound.
    try:
        syllable = [len([y for y in x if y[-1].isdigit()])
                    for x in d[word.lower()]]
        return syllable[0]
    except KeyError:
        # Word is not in the CMU dictionary; count it as zero syllables so
        # callers can still sum the list.
        print("'%s' is not in the dictionary" % word)
        return 0

def list_to_word(txt):
    # Tokenizes all words. For each token, the syl function is called.
    # Returns the number of syllables for each token in a list.
    len_syl = []
    words = tokenizer.tokenize(txt)
    for w in words:
        len_syl.append(syl(w))
    return len_syl

def flesch_kincaid(txt, ease=True):
    # Flesch-Kincaid readability. With ease=True, returns the reading-ease
    # score (higher means easier to read); with ease=False, returns the
    # grade-level score (higher means harder, roughly the U.S. school grade
    # needed to understand the text).
    word_count = len(tokenizer.tokenize(txt))
    sent_count = len(sent_tokenize(txt))
    syl_count = sum(list_to_word(txt))
    words_per_sent = word_count / sent_count
    syl_per_word = syl_count / word_count
    if ease:
        return 206.835 - 1.015 * words_per_sent - 84.6 * syl_per_word
    else:
        return 0.39 * words_per_sent + 11.8 * syl_per_word - 15.59

def gunning_fog(txt):
    # Gunning fog index: words with three or more syllables count as "complex".
    word_count = len(tokenizer.tokenize(txt))
    sent_count = len(sent_tokenize(txt))
    complex_count = 0
    for cw in list_to_word(txt):
        if cw >= 3:
            complex_count += 1
    words_per_sent = word_count / sent_count
    complex_per_word = complex_count / word_count
    return 0.4 * (words_per_sent + 100 * complex_per_word)

def smog_index(txt):
    # SMOG grade. For accuracy, the text should contain at least 30 sentences.
    sent_count = len(sent_tokenize(txt))
    poly_count = 0
    for poly in list_to_word(txt):
        if poly >= 3:
            poly_count += 1
    f = poly_count * (30 / sent_count)
    return 1.0430 * math.sqrt(f) + 3.1291

def coleman_liau(txt):
    # Coleman-Liau index: L is the average number of characters per 100 words,
    # S is the average number of sentences per 100 words.
    sent_count = len(sent_tokenize(txt))
    words = tokenizer.tokenize(txt)
    word_count = len(words)
    char_count = sum(len(word) for word in words)
    L = char_count / word_count * 100
    S = sent_count / word_count * 100
    return 0.0588 * L - 0.296 * S - 15.8

def ari(txt):
    # Automated readability index, based on characters per word and words per
    # sentence.
    sent_count = len(sent_tokenize(txt))
    words = tokenizer.tokenize(txt)
    word_count = len(words)
    char_count = sum(len(word) for word in words)
    return 4.71 * (char_count / word_count) + 0.5 * (word_count / sent_count) - 21.43

with open('data.txt', 'r') as f:
    # Join lines with a space so words are not glued together across breaks.
    data = f.read().replace('\n', ' ')

print('Flesch reading ease :', flesch_kincaid(data, ease=True))
print('Gunning fog index   :', gunning_fog(data))
print('SMOG index          :', smog_index(data))
print('Coleman-Liau index  :', coleman_liau(data))
print('ARI                 :', ari(data))
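
# A quick check on an inline sample (a minimal sketch; the sentences below are
# illustrative and are not taken from data.txt):
sample = ("The cat sat on the warm mat. "
          "The old dog ran across the green field and barked at the moon.")
print('sample reading ease :', flesch_kincaid(sample))
print('sample grade level  :', flesch_kincaid(sample, ease=False))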