Skip to content

Instantly share code, notes, and snippets.

@lievcin
Created February 12, 2018 09:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lievcin/f28131022075a2eb47e479d560ce8a9c to your computer and use it in GitHub Desktop.
Save lievcin/f28131022075a2eb47e479d560ce8a9c to your computer and use it in GitHub Desktop.
# Looking for the Flesch-Kincaid readability measure across our population
from nltk.tokenize import sent_tokenize, word_tokenize
import pyphen
# Hyphenation dictionary; the number of hyphenation pieces of a word is used
# as an approximation of its syllable count.
dic = pyphen.Pyphen(lang='en')

# "-" is placed last in each character class so it is a literal hyphen. The
# earlier form "!-?" was a range (0x21-0x3F) that also matched digits, "/",
# "<", ">" and friends, which split tokens such as "abc123" apart.
_BEFORE_CLOSING_PUNCT = re.compile(r"(\w)([.,;:!?'\"”\)-])")
_AFTER_OPENING_PUNCT = re.compile(r"([.,;:!?'\"“\(-])(\w)")
_HTML_TAG = re.compile(r"<[^>]*>")


def _readability_counts(review_text):
    """Return ``(word_count, sent_count, syll_count)`` for one review string.

    Markup tags are removed first (replaced by a space so the words on either
    side of a tag stay separate), punctuation is detached from adjacent word
    characters, and only tokens containing at least one alphanumeric character
    are counted as words. Sentences are counted on the tag-free text.
    """
    without_tags = _HTML_TAG.sub(" ", review_text)
    spaced = _BEFORE_CLOSING_PUNCT.sub(r"\1 \2", without_tags)
    spaced = _AFTER_OPENING_PUNCT.sub(r"\1 \2", spaced)
    # Pure-punctuation tokens are neither words nor syllables; counting them
    # would inflate both terms of the readability score.
    words = [
        token for token in word_tokenize(spaced)
        if any(char.isalnum() for char in token)
    ]
    syllables = sum(len(dic.inserted(word).split('-')) for word in words)
    return len(words), len(sent_tokenize(without_tags)), syllables


def _readability_score(words, sentences, syllables):
    """Return the readability score, or NaN when it is undefined.

    The constants are those of the Flesch Reading Ease formula (higher means
    easier to read). A review with no words or no sentences has no defined
    score, so NaN is returned instead of dividing by zero.
    """
    if not words or not sentences:
        return float('nan')
    return 206.835 - 1.015 * words / sentences - 84.6 * syllables / words


# Tokenise each review once and derive all three counts from that single pass.
_counts = [_readability_counts(text) for text in data['review_text']]
data['word_count'] = [words for words, _, _ in _counts]
data['sent_count'] = [sentences for _, sentences, _ in _counts]
data['syll_count'] = [syllables for _, _, syllables in _counts]
# The column keeps the name 'flesch_kincaid' because later code reads it.
data['flesch_kincaid'] = [_readability_score(*counts) for counts in _counts]
# Split the scored reviews by label so the two distributions can be compared
# side by side.
real_data = data[data['label'] == 'real']
fake_data = data[data['label'] == 'fake']

fig, axes = plt.subplots(1, 2)
fig.set_size_inches(20, 8)
print('REAL vs FAKE F-K Score')

# Titles are set on each axes explicitly and only after plotting:
# plt.title() acts on the current axes rather than the one being drawn into,
# and DataFrame.hist() titles its axes with the column name, which would
# overwrite a title set beforehand.
real_data.hist('flesch_kincaid', bins=100, ax=axes[0])
axes[0].set_title('Real Reviews')
fake_data.hist('flesch_kincaid', bins=100, ax=axes[1])
axes[1].set_title('Fake Reviews')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment