Skip to content

Instantly share code, notes, and snippets.

@jazzlw
Last active October 31, 2016 19:09
Show Gist options
  • Save jazzlw/95052e9275e09a63c990a25e8b63920f to your computer and use it in GitHub Desktop.
Save jazzlw/95052e9275e09a63c990a25e8b63920f to your computer and use it in GitHub Desktop.
onegram play
from one_gram_reader import *
import matplotlib.pyplot as plt
import numpy as np
# Returns total occurrences of the given word by adding up the counts for each year as given by word_data
def total_occurrences(word_data, word):
    """Sum the per-year counts for `word`; 0 if the word is absent."""
    # word_data maps word -> [(year, count), ...]; a missing word yields [].
    yearly_counts = word_data.get(word, [])
    return sum(pair[1] for pair in yearly_counts)
# Returns a list of length 26
# corresponding to the relative frequency of each letter in the dataset given by word_data
def count_letters(word_data):
    """Return relative frequencies of 'a'..'z' over all word occurrences."""
    grand_total = 0          # total letters across all occurrences of all words
    letter_totals = {}       # letter -> weighted occurrence count
    # Weight each word's letters by that word's total occurrence count
    # (the per-word sum inlines what total_occurrences() computes).
    for word, yearly in word_data.items():
        occurrences = sum(count for _year, count in yearly)
        grand_total += occurrences * len(word)
        for ch in word:
            letter_totals[ch] = letter_totals.get(ch, 0) + occurrences
    # Normalize in place; an empty dataset leaves letter_totals empty, so no
    # division by zero can occur here.
    for ch in letter_totals:
        letter_totals[ch] /= float(grand_total)
    # Emit frequencies in alphabetical order, 0 for letters never seen.
    return [letter_totals.get(chr(code), 0)
            for code in range(ord('a'), ord('z') + 1)]
# Plots frequencies of letters in English, calculated from word_data.
def bar_plot_of_letter_frequencies(word_data):
    """Draw a bar chart of the relative letter frequencies in word_data."""
    frequencies = count_letters(word_data)
    labels = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
    positions = np.arange(len(labels))
    # One unit-wide bar per letter, centered on its tick.
    plt.bar(positions, frequencies, 1, align='center')
    plt.xticks(positions, labels)
    plt.xlim([-.5, len(labels)])
    plt.ylabel('Frequency')
    plt.xlabel('Letter')
    plt.show()
# creates a log-log plot of total occurrences of each word vs the rank of that word
# and annotates the occurrences of the words in words.
def plot_aggregate_counts(word_data, words):
    """Log-log rank/frequency (Zipf) plot of word_data, starring `words`."""
    # Keep only annotation words that actually appear in the dataset.
    # Building a new list fixes the old bug of calling words.remove() while
    # iterating over words (which skipped the element after each removal and
    # so could leave missing words in the list), and avoids mutating the
    # caller's list as a side effect.
    words = [w for w in words if w in word_data]
    # Total occurrences of every word, and of each annotated word.
    word_occurrences = [total_occurrences(word_data, w) for w in word_data]
    annotated_words = {w: total_occurrences(word_data, w) for w in words}
    # Sort counts most-to-least frequent and pair them with ranks 1..N.
    word_occurrences.sort(reverse=True)
    ranks = range(1, len(word_occurrences) + 1)
    annotated_counts = []
    annotated_ranks = []
    # Collect rank/count pairs for the annotated words and label each one.
    for word in words:
        count = annotated_words[word]
        index = word_occurrences.index(count)  # first slot holding this count
        annotated_counts.append(count)
        annotated_ranks.append(index + 1)
        plt.annotate(word, xy=(1.3 * index, 1.1 * count))
    # Tight x-limits around the data, then the rank/frequency curve.
    plt.xlim([1, 1.1 * max(ranks)])
    plt.loglog(ranks, word_occurrences)
    # Individual points are only legible for small datasets; show them when
    # there are fewer than 100.
    if len(ranks) < 100:
        plt.loglog(ranks, word_occurrences, 'g.', ms=12)
    # Plot the annotated words as red stars.
    plt.loglog(annotated_ranks, annotated_counts, 'r*', ms=12)
    plt.xlabel("Rank of Word")
    plt.ylabel("Total Occurrences")
    plt.show()
def most_common_words(word_data, n):
    """Return the n most common words as (word, total_count) tuples,
    most frequent first.

    Fixes the old tie bug: looking words up with counts.index(count)
    returned the *first* word with a given count for every word sharing
    that count, duplicating it in the result. Sorting (word, count) pairs
    keeps each count attached to its own word.
    """
    totals = []
    for word, yearly in word_data.items():
        # Total occurrences across all years (inlines total_occurrences).
        totals.append((word, sum(count for _year, count in yearly)))
    # Stable sort by count, descending; ties keep their original order.
    totals.sort(key=lambda pair: pair[1], reverse=True)
    return totals[:n]
def get_occurrences_in_year(word_data, word, year):
    """Return the recorded count for `word` in `year`, or 0 if absent."""
    # Each entry is a (year, count) pair; scan for the matching year.
    for recorded_year, count in word_data.get(word, []):
        if recorded_year == year:
            return count
    return 0
# Returns the average word length in the data for the year given.
def get_average_word_length(word_data, year):
    """Return the occurrence-weighted average word length for `year`,
    or 0 when there is no data for that year."""
    total_words = 0
    total_letters = 0
    # Accumulate how many word occurrences the year had, and how many
    # letters those occurrences contribute.
    for word in word_data:
        occurrences = get_occurrences_in_year(word_data, word, year)
        total_words += occurrences
        total_letters += occurrences * len(word)
    # A year with no data gives total_words == 0; report 0 explicitly instead
    # of the old bare `except:`, which also silently swallowed unrelated errors.
    if total_words == 0:
        return 0
    return float(total_letters) / total_words
# Plots the average word length for each year in year range for which there is data.
def plot_average_word_length(word_data, year_range):
    """Plot average word length per year across year_range (inclusive)."""
    year_list = []
    length_list = []
    # Collect the average length for every year in the range that has data.
    for year in range(year_range[0], year_range[1] + 1):
        length = get_average_word_length(word_data, year)
        # get_average_word_length() returns 0 for years with no data; skip
        # those. (A stray Python-2 debug statement, `print length`, was
        # removed here — it broke Python 3 and spammed stdout.)
        if length:
            length_list.append(length)
            year_list.append(year)
    # Draw the line, then a red dot on each data point.
    plt.plot(year_list, length_list)
    plt.plot(year_list, length_list, 'r.')
    plt.xlabel('Year')
    plt.ylabel('Average Word Length')
    plt.show()
## some test code -- runs at import time: loads the q-words dataset and plots it.
word_data = read_entire_word_file("words_that_start_with_q.csv")
# word_data = read_entire_word_file("all_words.csv")
# word_data = read_entire_word_file("very_short.csv")
# print() works on both Python 2 and 3; the old `print x` statement is 2-only.
print(word_data['question'])  # smoke check: a word known to be in the q-words file
plot_average_word_length(word_data, [1600, 2008])
#
# word_data = read_entire_word_file("words_that_start_with_q.csv")
# #word_data = read_entire_word_file("all_words.csv")
# print(len(word_data))
# plot_aggregate_counts(word_data, ["quest", "questions", "he"])
#
import csv
# Given a word and a date range, returns two lists:
# years lists all the years the word was found within the year range,
# counts contains the number of occurrences for the word during the given year;
# indexes in each list correspond to each other.
# If you're not sure that your input is alphabetical, comment out the elif
# statement and the break. With an alphabetical word_file, as all of the
# Google-sourced ones are, the elif will save some time.
def read_word_file(word, year_range, word_file):
    """Scan a tab-separated word file for `word` within year_range (inclusive).

    Returns (years, counts): parallel lists of years the word appears in and
    the occurrence count for each of those years.
    """
    word_found = False
    years = []
    counts = []
    # The csv module wants text mode with newline="" — the old binary "rb"
    # mode only worked under Python 2. `with` still closes the file for us.
    with open(word_file, "r", newline="") as f:
        csv_reader = csv.reader(f, delimiter='\t')
        for row in csv_reader:
            # row = [word, year, count]
            if row[0] == word and year_range[0] <= int(row[1]) <= year_range[1]:
                word_found = True
                years.append(int(row[1]))
                counts.append(int(row[2]))
            elif word_found:
                # Input is alphabetical: once past the word's rows, stop early.
                break
    return years, counts
# Creates a dictionary indexed on years, where each value is the total
# number of words recorded from that year
def read_total_counts(total_file):
    """Read a comma-separated year,count file into {year: count}."""
    count_dict = {}
    # Text mode with newline="" is what the csv module expects (the old
    # binary "rb" mode only worked under Python 2).
    with open(total_file, "r", newline="") as f:
        csv_reader = csv.reader(f, delimiter=',')
        for row in csv_reader:
            # row = [year, total_count]
            count_dict[int(row[0])] = int(row[1])
    return count_dict
# Reads the file specified in word_file and returns a dictionary of words mapped to
# lists of tuples, where each tuple is a year and count pair for that word.
def read_entire_word_file(word_file):
    """Read a tab-separated word file into {word: [(year, count), ...]}."""
    word_data = {}
    # Text mode with newline="" is what the csv module expects (the old
    # binary "rb" mode only worked under Python 2).
    with open(word_file, "r", newline="") as f:
        for row in csv.reader(f, delimiter='\t'):
            # row = [word, year, count]; append this year's pair in file order.
            word_data.setdefault(row[0], []).append((int(row[1]), int(row[2])))
    return word_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment