Code used in the blog post "Grant Proposal, Science Book and Fiction"
""" | |
Rohit Suratekar | |
September 2018 | |
We will use universal tagger for this analysis | |
http://www.nltk.org/book/ch05.html#tab-universal-tagset | |
I have not included text content of books and proposals to avoid any | |
copyright issues. You can download them from respective websites mentioned | |
below. | |
""" | |
from collections import Counter, defaultdict

import matplotlib
import matplotlib.pyplot as plt
import nltk
from SecretColors.palette import Palette
from openpyxl import load_workbook
from wordcloud import WordCloud

# Uncomment the following if you get errors while running the script
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('universal_tagset')
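
# For reference, the universal tagset collapses the detailed Penn Treebank
# tags into a dozen coarse categories (NOUN, VERB, ADJ, ADP, ADV, '.', etc.).
# An illustrative sanity check (output may vary slightly with the tagger
# model):
# >>> nltk.pos_tag(nltk.word_tokenize("She quickly read the long proposal"),
# ...              tagset='universal')
# [('She', 'PRON'), ('quickly', 'ADV'), ('read', 'VERB'), ('the', 'DET'),
#  ('long', 'ADJ'), ('proposal', 'NOUN')]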

# Download from
# https://wellcome.ac.uk/funding/open-research-fund-applications-submitted
FILE_NAME = "open-research-fund-proposals-received-2018.xlsx"

# You might want to download this from
# https://archive.org/stream/ToKillAMockingbird_201604/To%20Kill%20A%20Mockingbird_djvu.txt
# I removed the header section and started directly from "Chapter 1"
MOCKING_BIRD = "mockingbird.txt"

# You might want to download this from
# https://archive.org/stream/originofspecies00darwuoft/originofspecies00darwuoft_djvu.txt
# I removed the header section and started directly from "Introduction"
ORIGIN_SPECIES = "origin.txt"

ibm = Palette()  # SecretColors Palette defaults to the IBM color palette


def correct(text):
    """
    nltk does not handle curly quotation marks well, so we remove them,
    along with a few other punctuation marks that it won't capture.
    This may cause a few words to be missed, but for now I couldn't find
    a better way.
    """
    return text.replace("”", "") \
        .replace("“", "").replace("’", "").replace("—", " ").replace("‘", " ")


def get_data():
    """
    Tokenize and tag the proposal data
    :return: dictionary of Counters, one per universal POS tag
    """
    wb = load_workbook(FILE_NAME)
    p = wb['Data']
    word_count = defaultdict(Counter)
    skip_header = True
    for row in p.rows:
        if not skip_header:
            details = row[2].value
            tokens = nltk.word_tokenize(correct(details))
            # For simplicity we will use only 'universal' tags
            tagged = nltk.pos_tag(tokens, tagset='universal')
            for t in tagged:
                # Skip punctuation, which the universal tagset tags as '.'
                if t[1] != ".":
                    word_count[t[1]].update([t[0].lower()])
        else:
            skip_header = False
    return word_count
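
# Usage sketch (assumes FILE_NAME is present in the working directory):
# counts = get_data()
# counts['NOUN'].most_common(10)  # ten most frequent nouns in the proposals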


def general_statistics():
    """
    General statistics about the proposals
    """
    wb = load_workbook(FILE_NAME)
    p = wb['Data']
    proposal_size = []
    title_size = []
    words_size = []
    skip_header = True
    for row in p.rows:
        if not skip_header:
            title = row[1].value
            details = row[2].value
            title_size.append(len(title))
            words_size.append(len(correct(details).split(" ")))
            proposal_size.append(len(correct(details)))
        else:
            skip_header = False
    print("Number of proposals: %d" % len(proposal_size))
    print("Average title size (characters): %.2f"
          % (sum(title_size) / len(title_size)))
    print("Average words per proposal: %.2f"
          % (sum(words_size) / len(words_size)))
    print("Average proposal size (characters): %.2f"
          % (sum(proposal_size) / len(proposal_size)))
    print("Total characters from all proposals: %d" % sum(proposal_size))


def analyze_books(filename):
    """
    General function to analyze a book in plain-text format
    :param filename: path to the plain-text file of the book
    :return: dictionary of Counters, one per universal POS tag
    """
    # Total number of characters across all proposals (the value printed by
    # general_statistics()). The book is truncated to the same length so
    # that the word counts are comparable.
    total_char_in_proposal = 476381
    full_text = ""
    with open(filename, encoding='utf-8') as f:
        for line in f:
            full_text += " " + correct(line.strip())
    required_text = full_text[:total_char_in_proposal]
    word_count = defaultdict(Counter)
    tokens = nltk.word_tokenize(required_text)
    # For simplicity we will use only 'universal' tags
    tagged = nltk.pos_tag(tokens, tagset='universal')
    for t in tagged:
        # Skip punctuation, which the universal tagset tags as '.'
        if t[1] != ".":
            word_count[t[1]].update([t[0].lower()])
    return word_count
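
# e.g. analyze_books(MOCKING_BIRD)['ADJ'].most_common(10) gives the ten
# most frequent adjectives in the (truncated) book text.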


def plot_common(list_of_counters: list, label_list: list, tag: str):
    """
    Horizontal bar chart of the most common words carrying the given
    universal POS tag, one color per counter
    """
    names = []
    # colors = [ibm.blue(grade=20), ibm.blue(grade=50), ibm.blue(grade=80)]
    # colors = [ibm.green(grade=20), ibm.green(grade=50), ibm.green(grade=80)]
    # colors = [ibm.red(grade=20), ibm.red(grade=50), ibm.red(grade=80)]
    # colors = [ibm.teal(grade=20), ibm.teal(grade=50), ibm.teal(grade=80)]
    # colors = [ibm.lime(grade=20), ibm.lime(grade=50), ibm.lime(grade=80)]
    colors = [ibm.peach(grade=20), ibm.peach(grade=50), ibm.peach(grade=80)]
    fig, ax = plt.subplots()
    loc = 0
    col_index = 0
    for items in list_of_counters:
        item = items[tag]  # type: Counter
        current_col = colors[col_index]
        label_added = False
        # current_list = item.most_common(20)
        current_list = item.most_common(6)
        current_list.reverse()
        for word in current_list:
            # Attach the legend label only to the first bar of each group
            if not label_added:
                plt.barh(loc, word[1], color=current_col,
                         label=label_list[col_index])
                label_added = True
            else:
                plt.barh(loc, word[1], color=current_col)
            names.append(word[0])
            loc += 1
        col_index += 1
    plt.yticks(range(len(names)), names)
    # Reverse the legend order for better visualization
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[::-1], labels[::-1])
    plt.title(tag)
    plt.xlabel("Frequency")
    plt.show()


def most_common():
    proposal = get_data()
    mocking = analyze_books(MOCKING_BIRD)
    origin = analyze_books(ORIGIN_SPECIES)
    labels = ["To Kill a Mockingbird", "On the Origin of Species",
              "ORF Applications"]
    # plot_common([mocking, origin, proposal], labels, 'ADJ')
    # plot_common([mocking, origin, proposal], labels, 'ADP')
    # plot_common([mocking, origin, proposal], labels, 'ADV')
    # plot_common([mocking, origin, proposal], labels, 'VERB')
    plot_common([mocking, origin, proposal], labels, 'NOUN')


def only_orf():
    proposal = get_data()
    labels = ["ORF Applications"]
    plot_common([proposal], labels, 'NOUN')


def generate_word_cloud():
    proposal = get_data()["NOUN"]
    # proposal = analyze_books(ORIGIN_SPECIES)["NOUN"]
    # proposal = analyze_books(MOCKING_BIRD)["NOUN"]
    # Convert the Counter into the plain {word: frequency} dict expected
    # by WordCloud.fit_words()
    text = {}
    for item in proposal.most_common():
        text[item[0]] = item[1]
    cmap = ibm.cmap_of(matplotlib, ibm.violet(), start_grade=30)
    wordcloud = WordCloud(background_color="white",
                          collocations=False, colormap=cmap,
                          max_words=100).fit_words(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    matplotlib.rc("savefig", dpi=300)
    plt.savefig('word_cloud.png', format='png', dpi=300)
    plt.show()
if __name__ == "__main__": | |
generate_word_cloud() |
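    # Swap in any of the other analyses defined above:
    # general_statistics()
    # most_common()
    # only_orf()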