Skip to content

Instantly share code, notes, and snippets.

@jazzlw
Last active October 31, 2016 19:09
Show Gist options
  • Save jazzlw/95052e9275e09a63c990a25e8b63920f to your computer and use it in GitHub Desktop.
Save jazzlw/95052e9275e09a63c990a25e8b63920f to your computer and use it in GitHub Desktop.
onegram play
from one_gram_reader import *
import matplotlib.pyplot as plt
import numpy as np
# Returns total occurrences of the given word by adding up the counts for each year as given by word_data
def total_occurrences(word_data, word):
    """Sum the per-year counts for `word`; 0 if the word is absent."""
    # word_data maps word -> [(year, count), ...]; a missing word yields [].
    yearly_counts = word_data.get(word, [])
    return sum(pair[1] for pair in yearly_counts)
# Returns a list of length 26
# corresponding to the relative frequency of each letter in the dataset given by word_data
def count_letters(word_data):
    """Return relative frequencies of 'a'..'z' over all word occurrences."""
    grand_total = 0          # total letters across all occurrences of all words
    letter_totals = {}       # letter -> weighted occurrence count
    # Weight each word's letters by that word's total occurrence count
    # (the per-word sum inlines what total_occurrences() computes).
    for word, yearly in word_data.items():
        occurrences = sum(count for _year, count in yearly)
        grand_total += occurrences * len(word)
        for ch in word:
            letter_totals[ch] = letter_totals.get(ch, 0) + occurrences
    # Normalize in place; an empty dataset leaves letter_totals empty, so no
    # division by zero can occur here.
    for ch in letter_totals:
        letter_totals[ch] /= float(grand_total)
    # Emit frequencies in alphabetical order, 0 for letters never seen.
    return [letter_totals.get(chr(code), 0)
            for code in range(ord('a'), ord('z') + 1)]
# Plots frequencies of letters in English, calculated from word_data.
def bar_plot_of_letter_frequencies(word_data):
    """Draw a bar chart of the relative letter frequencies in word_data."""
    frequencies = count_letters(word_data)
    labels = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
    positions = np.arange(len(labels))
    # One unit-wide bar per letter, centered on its tick.
    plt.bar(positions, frequencies, 1, align='center')
    plt.xticks(positions, labels)
    plt.xlim([-.5, len(labels)])
    plt.ylabel('Frequency')
    plt.xlabel('Letter')
    plt.show()
# creates a log-log plot of total occurrences of each word vs the rank of that word
# and annotates the occurrences of the words in words.
def plot_aggregate_counts(word_data, words):
    """Log-log rank/frequency (Zipf) plot of word_data, starring `words`."""
    # Keep only annotation words that actually appear in the dataset.
    # Building a new list fixes the old bug of calling words.remove() while
    # iterating over words (which skipped the element after each removal and
    # so could leave missing words in the list), and avoids mutating the
    # caller's list as a side effect.
    words = [w for w in words if w in word_data]
    # Total occurrences of every word, and of each annotated word.
    word_occurrences = [total_occurrences(word_data, w) for w in word_data]
    annotated_words = {w: total_occurrences(word_data, w) for w in words}
    # Sort counts most-to-least frequent and pair them with ranks 1..N.
    word_occurrences.sort(reverse=True)
    ranks = range(1, len(word_occurrences) + 1)
    annotated_counts = []
    annotated_ranks = []
    # Collect rank/count pairs for the annotated words and label each one.
    for word in words:
        count = annotated_words[word]
        index = word_occurrences.index(count)  # first slot holding this count
        annotated_counts.append(count)
        annotated_ranks.append(index + 1)
        plt.annotate(word, xy=(1.3 * index, 1.1 * count))
    # Tight x-limits around the data, then the rank/frequency curve.
    plt.xlim([1, 1.1 * max(ranks)])
    plt.loglog(ranks, word_occurrences)
    # Individual points are only legible for small datasets; show them when
    # there are fewer than 100.
    if len(ranks) < 100:
        plt.loglog(ranks, word_occurrences, 'g.', ms=12)
    # Plot the annotated words as red stars.
    plt.loglog(annotated_ranks, annotated_counts, 'r*', ms=12)
    plt.xlabel("Rank of Word")
    plt.ylabel("Total Occurrences")
    plt.show()
def most_common_words(word_data, n):
    """Return the n most common words as (word, total_count) tuples,
    most frequent first.

    Fixes the old tie bug: looking words up with counts.index(count)
    returned the *first* word with a given count for every word sharing
    that count, duplicating it in the result. Sorting (word, count) pairs
    keeps each count attached to its own word.
    """
    totals = []
    for word, yearly in word_data.items():
        # Total occurrences across all years (inlines total_occurrences).
        totals.append((word, sum(count for _year, count in yearly)))
    # Stable sort by count, descending; ties keep their original order.
    totals.sort(key=lambda pair: pair[1], reverse=True)
    return totals[:n]
def get_occurrences_in_year(word_data, word, year):
    """Return the recorded count for `word` in `year`, or 0 if absent."""
    # Each entry is a (year, count) pair; scan for the matching year.
    for recorded_year, count in word_data.get(word, []):
        if recorded_year == year:
            return count
    return 0
# Returns the average word length in the data for the year given.
def get_average_word_length(word_data, year):
    """Return the occurrence-weighted average word length for `year`,
    or 0 when there is no data for that year."""
    total_words = 0
    total_letters = 0
    # Accumulate how many word occurrences the year had, and how many
    # letters those occurrences contribute.
    for word in word_data:
        occurrences = get_occurrences_in_year(word_data, word, year)
        total_words += occurrences
        total_letters += occurrences * len(word)
    # A year with no data gives total_words == 0; report 0 explicitly instead
    # of the old bare `except:`, which also silently swallowed unrelated errors.
    if total_words == 0:
        return 0
    return float(total_letters) / total_words
# Plots the average word length for each year in year range for which there is data.
def plot_average_word_length(word_data, year_range):
    """Plot average word length per year across year_range (inclusive)."""
    year_list = []
    length_list = []
    # Collect the average length for every year in the range that has data.
    for year in range(year_range[0], year_range[1] + 1):
        length = get_average_word_length(word_data, year)
        # get_average_word_length() returns 0 for years with no data; skip
        # those. (A stray Python-2 debug statement, `print length`, was
        # removed here — it broke Python 3 and spammed stdout.)
        if length:
            length_list.append(length)
            year_list.append(year)
    # Draw the line, then a red dot on each data point.
    plt.plot(year_list, length_list)
    plt.plot(year_list, length_list, 'r.')
    plt.xlabel('Year')
    plt.ylabel('Average Word Length')
    plt.show()
## some test code -- runs at import time: loads the q-words dataset and plots it.
word_data = read_entire_word_file("words_that_start_with_q.csv")
# word_data = read_entire_word_file("all_words.csv")
# word_data = read_entire_word_file("very_short.csv")
# print() works on both Python 2 and 3; the old `print x` statement is 2-only.
print(word_data['question'])  # smoke check: a word known to be in the q-words file
plot_average_word_length(word_data, [1600, 2008])
#
# word_data = read_entire_word_file("words_that_start_with_q.csv")
# #word_data = read_entire_word_file("all_words.csv")
# print(len(word_data))
# plot_aggregate_counts(word_data, ["quest", "questions", "he"])
#
import csv
# Given a word and a date range, returns two lists:
# years lists all the years the word was found within the year range,
# counts contains the number of occurrences for the word during the given year;
# indexes in each list correspond to each other.
# If you're not sure that your input is alphabetical, comment out the elif
# statement and the break. With an alphabetical word_file, as all of the
# Google-sourced ones are, the elif will save some time.
def read_word_file(word, year_range, word_file):
    """Scan a tab-separated word file for `word` within year_range (inclusive).

    Returns (years, counts): parallel lists of years the word appears in and
    the occurrence count for each of those years.
    """
    word_found = False
    years = []
    counts = []
    # The csv module wants text mode with newline="" — the old binary "rb"
    # mode only worked under Python 2. `with` still closes the file for us.
    with open(word_file, "r", newline="") as f:
        csv_reader = csv.reader(f, delimiter='\t')
        for row in csv_reader:
            # row = [word, year, count]
            if row[0] == word and year_range[0] <= int(row[1]) <= year_range[1]:
                word_found = True
                years.append(int(row[1]))
                counts.append(int(row[2]))
            elif word_found:
                # Input is alphabetical: once past the word's rows, stop early.
                break
    return years, counts
# Creates a dictionary indexed on years, where each value is the total
# number of words recorded from that year
def read_total_counts(total_file):
    """Read a comma-separated year,count file into {year: count}."""
    count_dict = {}
    # Text mode with newline="" is what the csv module expects (the old
    # binary "rb" mode only worked under Python 2).
    with open(total_file, "r", newline="") as f:
        csv_reader = csv.reader(f, delimiter=',')
        for row in csv_reader:
            # row = [year, total_count]
            count_dict[int(row[0])] = int(row[1])
    return count_dict
# Reads the file specified in word_file and returns a dictionary of words mapped to
# lists of tuples, where each tuple is a year and count pair for that word.
def read_entire_word_file(word_file):
    """Read a tab-separated word file into {word: [(year, count), ...]}."""
    word_data = {}
    # Text mode with newline="" is what the csv module expects (the old
    # binary "rb" mode only worked under Python 2).
    with open(word_file, "r", newline="") as f:
        for row in csv.reader(f, delimiter='\t'):
            # row = [word, year, count]; append this year's pair in file order.
            word_data.setdefault(row[0], []).append((int(row[1]), int(row[2])))
    return word_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment