Skip to content

Instantly share code, notes, and snippets.

@rohitsuratekar
Last active September 15, 2018 12:39
Show Gist options
  • Save rohitsuratekar/a76ec4c4abb9e041d5d6e8a4fc4683d1 to your computer and use it in GitHub Desktop.
Save rohitsuratekar/a76ec4c4abb9e041d5d6e8a4fc4683d1 to your computer and use it in GitHub Desktop.
Code used in the blog post "Grant Proposal, Science Book and Fiction"
"""
Rohit Suratekar
September 2018
We will use universal tagger for this analysis
http://www.nltk.org/book/ch05.html#tab-universal-tagset
I have not included text content of books and proposals to avoid any
copyright issues. You can download them from respective websites mentioned
below.
"""
from collections import Counter, defaultdict
import matplotlib
import matplotlib.pylab as plt
import nltk
from SecretColors.palette import Palette
from openpyxl import load_workbook
from wordcloud import WordCloud
# Uncomment following if you get error in running the script
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('universal_tagset')
# Download from
# https://wellcome.ac.uk/funding/open-research-fund-applications-submitted
# Spreadsheet of Wellcome Open Research Fund applications (sheet 'Data').
FILE_NAME = "open-research-fund-proposals-received-2018.xlsx"
# You might want to download this from
# https://archive.org/stream/ToKillAMockingbird_201604/To%20Kill%20A%20Mockingbird_djvu.txt
# I removed first header part and started directly from "Chapter 1"
MOCKING_BIRD = "mockingbird.txt"
# You might want to download this from
# https://archive.org/stream/originofspecies00darwuoft/originofspecies00darwuoft_djvu.txt
# I removed first header part and started directly from "Introduction"
ORIGIN_SPECIES = "origin.txt"
# Shared IBM colour palette used by the plotting functions below.
ibm = Palette()
def correct(text):
    """
    nltk does not work nicely with quotation marks, hence remove them,
    plus a few other punctuations which it won't capture.
    This might lead to missing out few words but for now, I couldn't find
    any better way.

    :param text: raw text to clean
    :return: text with curly quotes removed and dashes/opening quotes
        replaced by spaces
    """
    # One C-level pass instead of five chained str.replace() calls.
    # None deletes the character; " " replaces it with a space.
    return text.translate(_CORRECTION_TABLE)


# Built once at import time; maps each troublesome character.
_CORRECTION_TABLE = str.maketrans({"”": None, "“": None, "’": None,
                                   "—": " ", "‘": " "})
def get_data():
    """
    Tokenize the proposal data from the spreadsheet.

    :return: dictionary mapping universal POS tags to Counters of words
    """
    workbook = load_workbook(FILE_NAME)
    sheet = workbook['Data']
    counts = defaultdict(Counter)
    for index, row in enumerate(sheet.rows):
        # First spreadsheet row holds the column headers.
        if index == 0:
            continue
        proposal_text = row[2].value
        tokens = nltk.word_tokenize(correct(proposal_text))
        # For simplicity we will use only 'universal' tags
        for word, tag in nltk.pos_tag(tokens, tagset='universal'):
            # Remove punctuations
            if tag != ".":
                counts[tag].update({word.lower()})
    return counts
def general_statistics():
    """
    Print general statistics about proposals: count, average title
    length, average word count and average/total character counts.
    """
    workbook = load_workbook(FILE_NAME)
    sheet = workbook['Data']
    char_counts = []
    title_lengths = []
    word_counts = []
    for index, row in enumerate(sheet.rows):
        # Skip the header row at the top of the sheet.
        if index == 0:
            continue
        title_lengths.append(len(row[1].value))
        cleaned = correct(row[2].value)
        word_counts.append(len(cleaned.split(" ")))
        char_counts.append(len(cleaned))
    print("Number of proposals: %d" % len(char_counts))
    print("Average title size: %.2f"
          % (sum(title_lengths) / len(title_lengths)))
    print("Average words per proposal: %.2f"
          % (sum(word_counts) / len(word_counts)))
    print("Average proposal size: %.2f"
          % (sum(char_counts) / len(char_counts)))
    print("Total characters from all proposals: %d" % sum(char_counts))
def analyze_books(filename, total_chars=476381):
    """
    General function to analyse a book in plain-text format.

    Only the first ``total_chars`` characters are analysed so that a book
    sample is comparable in size to the combined proposals.

    :param filename: path to the UTF-8 text file of the book
    :param total_chars: number of leading characters to analyse; the
        default is the total character count of all proposals combined
    :return: dictionary mapping universal POS tags to Counters of words
    """
    # Collect pieces and join once — repeated ``+=`` on a string is
    # quadratic for a whole book.
    pieces = []
    with open(filename, encoding='utf-8') as f:
        for line in f:
            pieces.append(" " + correct(line.strip()))
    required_text = "".join(pieces)[:total_chars]
    word_count = defaultdict(Counter)
    tokens = nltk.word_tokenize(required_text)
    # For simplicity we will use only 'universal' tags
    tagged = nltk.pos_tag(tokens, tagset='universal')
    for word, tag in tagged:
        # Remove punctuations
        if tag != ".":
            word_count[tag].update({word.lower()})
    return word_count
def plot_common(list_of_counters: list, label_list: list, tag: str):
    """
    Draw a horizontal bar chart of the six most common words for the
    given universal POS tag, one colour per counter dictionary.

    :param list_of_counters: list of dicts mapping tag -> Counter
    :param label_list: legend label for each entry of list_of_counters
    :param tag: universal POS tag to plot (e.g. 'NOUN')
    """
    # Three shades of one palette colour, light to dark, one per dataset.
    # (Other palette colours — blue, green, red, teal, lime — work too.)
    colors = [ibm.peach(grade=20), ibm.peach(grade=50), ibm.peach(grade=80)]
    fig, ax = plt.subplots()
    tick_names = []
    position = 0
    for dataset_index, counters in enumerate(list_of_counters):
        counter = counters[tag]  # type: Counter
        bar_color = colors[dataset_index]
        # Reverse so the most frequent word is drawn last (on top).
        top_words = list(reversed(counter.most_common(6)))
        for word_index, (word, frequency) in enumerate(top_words):
            if word_index == 0:
                # Attach the legend label to the first bar only.
                plt.barh(position, frequency, color=bar_color,
                         label=label_list[dataset_index])
            else:
                plt.barh(position, frequency, color=bar_color)
            tick_names.append(word)
            position += 1
    plt.yticks(range(len(tick_names)), tick_names)
    # Reverse the order of legends for better visualization
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[::-1], labels[::-1])
    plt.title(tag)
    plt.xlabel("Frequency")
    plt.show()
def most_common():
    """
    Compare the most common nouns in the proposals against the two
    books. (Other universal tags — ADJ, ADP, ADV, VERB — can be plotted
    the same way by changing the final argument.)
    """
    proposals = get_data()
    mockingbird = analyze_books(MOCKING_BIRD)
    origin = analyze_books(ORIGIN_SPECIES)
    labels = ["To Kill a Mockingbird", "On the Origin of Species",
              "ORF Applications"]
    plot_common([mockingbird, origin, proposals], labels, 'NOUN')
def only_orf():
    """Plot the most common nouns from the ORF applications alone."""
    plot_common([get_data()], ["ORF Applications"], 'NOUN')
def generate_word_cloud():
    """
    Generate a word cloud of the nouns used in the proposals, save it
    as 'word_cloud.png' (300 dpi) and show it on screen.
    """
    nouns = get_data()["NOUN"]
    # A Counter already maps word -> frequency; dict(most_common())
    # keeps the frequency ordering without rebuilding it in a loop.
    frequencies = dict(nouns.most_common())
    cmap = ibm.cmap_of(matplotlib, ibm.violet(), start_grade=30)
    wordcloud = WordCloud(background_color="white",
                          collocations=False, colormap=cmap,
                          max_words=100).fit_words(frequencies)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    matplotlib.rc("savefig", dpi=300)
    plt.savefig('word_cloud.png', format='png', dpi=300)
    plt.show()
if __name__ == "__main__":
    # Entry point — swap in most_common(), only_orf() or
    # general_statistics() to run the other analyses.
    generate_word_cloud()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment