Code used in the blog post "Grant Proposal, Science Book and Fiction"
""" | |
Rohit Suratekar | |
September 2018 | |
We will use universal tagger for this analysis | |
http://www.nltk.org/book/ch05.html#tab-universal-tagset | |
I have not included text content of books and proposals to avoid any | |
copyright issues. You can download them from respective websites mentioned | |
below. | |
""" | |
from collections import Counter, defaultdict

import matplotlib
import matplotlib.pyplot as plt
import nltk
from SecretColors.palette import Palette
from openpyxl import load_workbook
from wordcloud import WordCloud

# Uncomment the following if you get errors while running the script
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('universal_tagset')
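
# For reference, the universal tagset collapses the detailed Penn Treebank
# tags into a dozen coarse categories (NOUN, VERB, ADJ, ADP, ADV, '.', etc.).
# An illustrative sanity check (output may vary slightly with the tagger
# model):
# >>> nltk.pos_tag(nltk.word_tokenize("She quickly read the long proposal"),
# ...              tagset='universal')
# [('She', 'PRON'), ('quickly', 'ADV'), ('read', 'VERB'), ('the', 'DET'),
#  ('long', 'ADJ'), ('proposal', 'NOUN')]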

# Download from
# https://wellcome.ac.uk/funding/open-research-fund-applications-submitted
FILE_NAME = "open-research-fund-proposals-received-2018.xlsx"

# You might want to download this from
# https://archive.org/stream/ToKillAMockingbird_201604/To%20Kill%20A%20Mockingbird_djvu.txt
# I removed the header section and started directly from "Chapter 1"
MOCKING_BIRD = "mockingbird.txt"

# You might want to download this from
# https://archive.org/stream/originofspecies00darwuoft/originofspecies00darwuoft_djvu.txt
# I removed the header section and started directly from "Introduction"
ORIGIN_SPECIES = "origin.txt"

ibm = Palette()  # SecretColors Palette defaults to the IBM color palette


def correct(text):
    """
    nltk does not handle curly quotation marks well, so we remove them,
    along with a few other punctuation marks that it won't capture.
    This may cause a few words to be missed, but for now I couldn't find
    a better way.
    """
    return text.replace("”", "") \
        .replace("“", "").replace("’", "").replace("—", " ").replace("‘", " ")


def get_data():
    """
    Tokenize and tag the proposal data
    :return: dictionary of Counters, one per universal POS tag
    """
    wb = load_workbook(FILE_NAME)
    p = wb['Data']
    word_count = defaultdict(Counter)
    skip_header = True
    for row in p.rows:
        if not skip_header:
            details = row[2].value
            tokens = nltk.word_tokenize(correct(details))
            # For simplicity we will use only 'universal' tags
            tagged = nltk.pos_tag(tokens, tagset='universal')
            for t in tagged:
                # Skip punctuation, which the universal tagset tags as '.'
                if t[1] != ".":
                    word_count[t[1]].update([t[0].lower()])
        else:
            skip_header = False
    return word_count
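
# Usage sketch (assumes FILE_NAME is present in the working directory):
# counts = get_data()
# counts['NOUN'].most_common(10)  # ten most frequent nouns in the proposals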


def general_statistics():
    """
    General statistics about the proposals
    """
    wb = load_workbook(FILE_NAME)
    p = wb['Data']
    proposal_size = []
    title_size = []
    words_size = []
    skip_header = True
    for row in p.rows:
        if not skip_header:
            title = row[1].value
            details = row[2].value
            title_size.append(len(title))
            words_size.append(len(correct(details).split(" ")))
            proposal_size.append(len(correct(details)))
        else:
            skip_header = False
    print("Number of proposals: %d" % len(proposal_size))
    print("Average title size (characters): %.2f"
          % (sum(title_size) / len(title_size)))
    print("Average words per proposal: %.2f"
          % (sum(words_size) / len(words_size)))
    print("Average proposal size (characters): %.2f"
          % (sum(proposal_size) / len(proposal_size)))
    print("Total characters from all proposals: %d" % sum(proposal_size))


def analyze_books(filename):
    """
    General function to analyze a book in plain-text format
    :param filename: path to the plain-text file of the book
    :return: dictionary of Counters, one per universal POS tag
    """
    # Total number of characters across all proposals (the value printed by
    # general_statistics()). The book is truncated to the same length so
    # that the word counts are comparable.
    total_char_in_proposal = 476381
    full_text = ""
    with open(filename, encoding='utf-8') as f:
        for line in f:
            full_text += " " + correct(line.strip())
    required_text = full_text[:total_char_in_proposal]
    word_count = defaultdict(Counter)
    tokens = nltk.word_tokenize(required_text)
    # For simplicity we will use only 'universal' tags
    tagged = nltk.pos_tag(tokens, tagset='universal')
    for t in tagged:
        # Skip punctuation, which the universal tagset tags as '.'
        if t[1] != ".":
            word_count[t[1]].update([t[0].lower()])
    return word_count
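
# e.g. analyze_books(MOCKING_BIRD)['ADJ'].most_common(10) gives the ten
# most frequent adjectives in the (truncated) book text.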


def plot_common(list_of_counters: list, label_list: list, tag: str):
    """
    Horizontal bar chart of the most common words carrying the given
    universal POS tag, one color per counter
    """
    names = []
    # colors = [ibm.blue(grade=20), ibm.blue(grade=50), ibm.blue(grade=80)]
    # colors = [ibm.green(grade=20), ibm.green(grade=50), ibm.green(grade=80)]
    # colors = [ibm.red(grade=20), ibm.red(grade=50), ibm.red(grade=80)]
    # colors = [ibm.teal(grade=20), ibm.teal(grade=50), ibm.teal(grade=80)]
    # colors = [ibm.lime(grade=20), ibm.lime(grade=50), ibm.lime(grade=80)]
    colors = [ibm.peach(grade=20), ibm.peach(grade=50), ibm.peach(grade=80)]
    fig, ax = plt.subplots()
    loc = 0
    col_index = 0
    for items in list_of_counters:
        item = items[tag]  # type: Counter
        current_col = colors[col_index]
        label_added = False
        # current_list = item.most_common(20)
        current_list = item.most_common(6)
        current_list.reverse()
        for word in current_list:
            # Attach the legend label only to the first bar of each group
            if not label_added:
                plt.barh(loc, word[1], color=current_col,
                         label=label_list[col_index])
                label_added = True
            else:
                plt.barh(loc, word[1], color=current_col)
            names.append(word[0])
            loc += 1
        col_index += 1
    plt.yticks(range(len(names)), names)
    # Reverse the legend order for better visualization
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[::-1], labels[::-1])
    plt.title(tag)
    plt.xlabel("Frequency")
    plt.show()


def most_common():
    proposal = get_data()
    mocking = analyze_books(MOCKING_BIRD)
    origin = analyze_books(ORIGIN_SPECIES)
    labels = ["To Kill a Mockingbird", "On the Origin of Species",
              "ORF Applications"]
    # plot_common([mocking, origin, proposal], labels, 'ADJ')
    # plot_common([mocking, origin, proposal], labels, 'ADP')
    # plot_common([mocking, origin, proposal], labels, 'ADV')
    # plot_common([mocking, origin, proposal], labels, 'VERB')
    plot_common([mocking, origin, proposal], labels, 'NOUN')


def only_orf():
    proposal = get_data()
    labels = ["ORF Applications"]
    plot_common([proposal], labels, 'NOUN')


def generate_word_cloud():
    proposal = get_data()["NOUN"]
    # proposal = analyze_books(ORIGIN_SPECIES)["NOUN"]
    # proposal = analyze_books(MOCKING_BIRD)["NOUN"]
    # Convert the Counter into the plain {word: frequency} dict expected
    # by WordCloud.fit_words()
    text = {}
    for item in proposal.most_common():
        text[item[0]] = item[1]
    cmap = ibm.cmap_of(matplotlib, ibm.violet(), start_grade=30)
    wordcloud = WordCloud(background_color="white",
                          collocations=False, colormap=cmap,
                          max_words=100).fit_words(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    matplotlib.rc("savefig", dpi=300)
    plt.savefig('word_cloud.png', format='png', dpi=300)
    plt.show()
if __name__ == "__main__": | |
generate_word_cloud() |
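    # Swap in any of the other analyses defined above:
    # general_statistics()
    # most_common()
    # only_orf()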