# coding: utf-8

# # Analysing my Google Searches with Python
# In[60]:

import json
import os


# In[61]:

path = os.path.abspath(os.path.curdir) + "/Searches"
json_files = os.listdir(path)
# In[62]:

def parse_file(file_dict):
    queries = []
    for q in file_dict.get('event'):
        q = q.get('query')
        q_tup = q.get('query_text'), q.get('id')[0].get('timestamp_usec')
        queries.append(q_tup)
    return queries
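
# A quick sanity check also documents the Takeout JSON shape that parse_file
# assumes (this record is hypothetical, reconstructed from the .get() calls
# above, not taken from any Google documentation):
example_file = {
    "event": [
        {"query": {"query_text": "python histogram",
                   "id": [{"timestamp_usec": "1421080243000000"}]}}
    ]
}
assert parse_file(example_file) == [("python histogram", "1421080243000000")]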
# In[63]:

import codecs

searches = []
for file in json_files:
    data = json.load(codecs.open(path + "/" + file, 'r', 'utf-8'))
    queries = parse_file(data)
    searches.extend(queries)
searches[100]
# Okay, we've got the queries from the file list. Let's dig into them.
#
# This is what I want to build:
#
# 1. Overview of the frequency of searches
#     - dot diagram: searches / week
#     - bar diagram: searches / week
#     - distribution: searches / day_of_week (Mon, Tue, Wed, ...)

# ## 1. Search frequency over the last years

# In[5]:
import datetime
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as patches
import matplotlib.path as path

# preparing our timestamps: timestamp_usec is a string of microseconds,
# so dropping the last 6 characters leaves whole seconds
timestamps = [int(q[1][:-6]) for q in searches]
datetime_timestamps = [datetime.datetime.fromtimestamp(ts) for ts in timestamps]
len(datetime_timestamps)
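
# The slice above is shorthand for integer division by one million; a quick
# equivalence check with a made-up microsecond timestamp:
usec = "1421080243123456"
assert int(usec[:-6]) == int(usec) // 1_000_000  # both yield whole seconds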
# In[6]:

fig, ax = plt.subplots()
fig.set_size_inches(14, 10)

timestamps.sort()
n, bins = np.histogram(timestamps, 400)
#datetime_timestamps.sort()
#n, bins = np.histogram(datetime_timestamps, 400)

# get the corners of the rectangles for the histogram
left = np.array(bins[:-1])
right = np.array(bins[1:])
bottom = np.zeros(len(left))
top = bottom + n

# we need a (numrects x numsides x 2) numpy array for the path helper
# function to build a compound path
XY = np.array([[left, left, right, right], [bottom, top, top, bottom]]).T

# get the Path object
barpath = path.Path.make_compound_path_from_polys(XY)

# make a patch out of it
patch = patches.PathPatch(barpath)
ax.add_patch(patch)

# update the view limits
ax.set_xlim(left[0], right[-1])
ax.set_ylim(bottom.min(), top.max())

plt.show()
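
# The compound-path approach above follows the classic matplotlib histogram
# demo; a minimal sketch of the same plot via the plain hist API, assuming no
# custom patch styling is needed:
fig2, ax2 = plt.subplots()
fig2.set_size_inches(14, 10)
ax2.hist(timestamps, bins=400)
plt.show()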
# ## 2. Search frequency per day of week / year
#
# Okay, so: a bar chart grouped by day of week, with several year bars next to each other. How do I get the day of the week, and how do I then match it to a year?

# In[7]:

datetime_timestamps[0].isoweekday()


# In[8]:

year_dow_pairs = [[ts.year, ts.isoweekday()] for ts in datetime_timestamps]
year_dow_pairs[4]
# Let's create a sorted list of years that we can then use as keys for a dictionary, like so:
#
# ```
# dow_in_year_dict
#     year (str) -> [Mon, Tue, ..., Sun] counts
# ```
# In[9]:

# getting the year dict
years = list(set([ts.year for ts in datetime_timestamps]))
years.sort()
dow_in_year_dict = {}
for year in years:
    dow_in_year_dict[str(year)] = [0, 0, 0, 0, 0, 0, 0]

# placing days of week in the arrays
for pair in year_dow_pairs:
    key = str(pair[0])
    dow_in_year_dict[key][pair[1] - 1] += 1
dow_in_year_dict
# **And now to the bar chart**

# In[10]:

# defining a moving average function that we will use later
def moving_avg(old, new, count):
    """count is the number of values already included in the old average"""
    old = old * (count / (count + 1))
    new = new * (1 / (count + 1))
    return old + new, count + 1
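
# A quick sanity check of the incremental mean with made-up values: the
# running average of 2, 4, 6 should land on 4.
avg, cnt = moving_avg(2.0, 4.0, 1)    # -> 3.0, 2
avg, cnt = moving_avg(avg, 6.0, cnt)  # -> 4.0, 3
assert abs(avg - 4.0) < 1e-9 and cnt == 3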
# In[11]:

# preparing the plotting

# data to plot
n_groups = 7

# create plot
fig, ax = plt.subplots()
fig.set_size_inches(14, 10)
index = np.arange(n_groups)
bar_width = 0.1
opacity = 0.8
colors = ['b', 'g', 'r', 'c', 'm', 'k', 'y', 'brown']

# placing the bars in the plt object
rects = []
i = 0
for year in years:
    year = str(year)
    vals = dow_in_year_dict[year]
    # let's turn the yearly counts into weekly averages
    vals = np.array(vals) / 52
    # plot it
    rect = plt.bar(index + i * bar_width, vals, bar_width,
                   alpha=opacity, color=colors[i], label=year)
    rects.append(rect)
    i += 1
# let's add an average bar over all years, starting with 2009
averages = np.array(dow_in_year_dict[str(years[0])])
yr_count = 1
# skip years[0] here: it already seeded the average above
for year in years[1:]:
    new_vals = np.array(dow_in_year_dict[str(year)])
    # moving average calculation
    averages, yr_count = moving_avg(averages, new_vals, yr_count)
averages = averages / 52
rect = plt.bar(index + i * bar_width, averages, bar_width,
               alpha=opacity, color='orange', label='average')
# and finally plotting
plt.xlabel('Days of Week')
plt.ylabel('Count')
plt.title('Searches by day of week')
plt.xticks(index + bar_width, ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'))
plt.legend()
plt.tight_layout()
plt.show()
# Interesting. Let's look at the data. Apparently I started Googling in 2009 (at least that's when I got my account) and steadily increased my search frequency, peaking around 2015.

# ## Time of day
# In[12]:

# getting the year dict
hour_in_year_dict = {}
for year in years:
    hour_in_year_dict[str(year)] = np.zeros(24)

# iterating over the items and counting the hour occurrences
for ts in datetime_timestamps:
    yr = str(ts.year)
    h = ts.hour
    hour_in_year_dict[yr][h] += 1
print(hour_in_year_dict['2014'])
# In[13]:

# data to plot
n_groups = 24

# create plot
fig, ax = plt.subplots()
fig.set_size_inches(14, 10)
index = np.arange(n_groups)
bar_width = 0.1
opacity = 0.8
colors = ['b', 'g', 'r', 'c', 'm', 'k', 'y', 'brown']

rects = []
i = 0
for year in years:
    year = str(year)
    vals = hour_in_year_dict[year]
    # let's average it out to a per-day count
    vals = np.array(vals) / 365
    rect = plt.bar(index + i * bar_width, vals, bar_width,
                   alpha=opacity, color=colors[i], label=year)
    rects.append(rect)
    i += 1

plt.xlabel('Hours of Day')
plt.ylabel('Count')
plt.title('Searches by hour of day')
# ts.hour runs 0-23, so label the ticks 0-23 as well
plt.xticks(index + bar_width, tuple(np.arange(n_groups)))
plt.legend()
plt.tight_layout()
plt.show()
# Very interesting. It seems like in 2016 I really spent a lot of time hacking around at night. Or could it be that Google stamps the search queries with my home country's time? I spent 3 months in Australia in 2016, around 10 hours ahead of my usual UTC+1 timezone.
#
# Still, it's impressive. In my strongest times around 2015, I sent about 4 queries PER HOUR to Google during the afternoon. That is a LOT of Googling.
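
# A minimal sketch of that time-zone hypothesis, using a hypothetical query
# sent at 14:00 local time in Australia (UTC+11) and rendered in my home
# timezone (UTC+1), the way fromtimestamp() would show it:
import datetime as dt  # same stdlib module as imported above, aliased

aussie_tz = dt.timezone(dt.timedelta(hours=11))
home_tz = dt.timezone(dt.timedelta(hours=1))
query_time = dt.datetime(2016, 2, 1, 14, 0, tzinfo=aussie_tz)  # afternoon down under
print(query_time.astimezone(home_tz).hour)  # -> 4, i.e. it lands in the night bins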
# # Part 2: Looking at the TERMS
# In[14]:

from collections import Counter

terms = [t[0] for t in searches]
term_counts = Counter(terms)
term_counts = list(term_counts.items())
term_counts.sort(key=lambda tup: tup[1], reverse=True)
for term in term_counts[0:30]:
    print(term)
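
# The same top-30 ranking falls out of Counter's built-in helper; a minimal
# equivalent using the standard API:
for term, count in Counter(terms).most_common(30):
    print((term, count))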
# Okay, but that is looking at every unique query string. Let's clean that up a bit: split the queries into individual words and drop any special characters.
# In[15]:

import re

def cleanup_term(term):
    words = re.findall(r'\w+', term)
    return [w.lower() for w in words]

clean_terms = [cleanup_term(t) for t in terms]
print(len(clean_terms))
terms_singles = []
list(map(terms_singles.extend, clean_terms))
print(len(terms_singles))
singles_counted = list(Counter(terms_singles).items())
singles_counted.sort(key=lambda tup: tup[1], reverse=True)
singles_counted[0:40]
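
# The top of that list is likely dominated by filler words ("the", "to", ...).
# A minimal sketch that drops them before counting, reusing the stopword set
# that ships with the wordcloud package used below:
from wordcloud import STOPWORDS

content_words = [w for w in terms_singles if w not in STOPWORDS]
Counter(content_words).most_common(40)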
# In[16]:

from wordcloud import WordCloud

wordcloud = WordCloud().generate(" ".join(terms_singles))


# In[51]:

fig = plt.figure()
fig.set_size_inches(21, 9)

# lower max_font_size
wordcloud = WordCloud(max_font_size=80, width=2100, height=900,
                      colormap='prism').generate(" ".join(terms_singles))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
# Okay, there is "Copenhagen"; that's curious. I wonder why!
# In[40]:

search_term = 'pascal'.lower()
count = 0
for term in terms:
    if search_term in term.lower():
        count += 1
        print(term)
print("TOTAL: " + str(count))