# coding: utf-8
# # Analysing my Google Searches with Python
# In[60]:
import json
import os
# In[61]:
path = os.path.abspath(os.path.curdir) + "/Searches"
json_files = os.listdir(path)
# In[62]:
def parse_file(file_dict):
    """Extract (query_text, timestamp_usec) tuples from one parsed Takeout search file."""
    queries = []
    for q in file_dict.get('event'):
        q = q.get('query')
        q_tup = q.get('query_text'), q.get('id')[0].get('timestamp_usec')
        queries.append(q_tup)
    return queries
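# `parse_file` assumes each Takeout file has roughly this shape (a sketch reconstructed from the fields accessed above, not an official schema):
#
# ```json
# {
#   "event": [
#     {
#       "query": {
#         "query_text": "python json",
#         "id": [{"timestamp_usec": "1516380000000000"}]
#       }
#     }
#   ]
# }
# ```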
# In[63]:
import codecs
searches = []
for file in json_files:
    data = json.load(codecs.open(path + "/" + file, 'r', 'utf-8'))
    queries = parse_file(data)
    searches.extend(queries)
searches[100]
# Okay, we've got the queries from the file list. Let's dig into them.
#
# This is what I want to build:
#
# 1. Overview of the frequency of searches
# - dot diagram of searches / week
# - bar diagram of searches / week (a quick sketch of the per-week bucketing follows this list)
# - distribution of searches / day_of_week (Mon, Tue, Wed, ...)
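#
# As a starting point, the per-week buckets can be built with just the standard library. This is a minimal sketch; it assumes `searches` as parsed above (tuples of query text and `timestamp_usec`) and keys the buckets by ISO year and week:
#
# ```python
# import datetime
# from collections import Counter
#
# week_counts = Counter()
# for _text, ts_usec in searches:
#     dt = datetime.datetime.fromtimestamp(int(ts_usec[:-6]))
#     iso = dt.isocalendar()                  # (ISO year, ISO week, ISO weekday)
#     week_counts[(iso[0], iso[1])] += 1
#
# # e.g. week_counts[(2016, 7)] -> number of searches in ISO week 7 of 2016
# ```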
# ## 1. Search frequency over the last years
# In[5]:
import datetime
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as patches
import matplotlib.path as path
# preparing our timestamps: timestamp_usec is in microseconds, so drop the last 6 digits to get seconds
timestamps = [int(q[1][:-6]) for q in searches]
datetime_timestamps = [datetime.datetime.fromtimestamp(ts) for ts in timestamps]
len(datetime_timestamps)
# In[6]:
fig, ax = plt.subplots()
fig.set_size_inches(14,10)
timestamps.sort()
n, bins = np.histogram(timestamps, 400)
#datetime_timestamps.sort()
#n, bins = np.histogram(datetime_timestamps, 400)
# get the corners of the rectangles for the histogram
left = np.array(bins[:-1])
right = np.array(bins[1:])
bottom = np.zeros(len(left))
top = bottom + n
# we need a (numrects x numsides x 2) numpy array for the path helper
# function to build a compound path
XY = np.array([[left, left, right, right], [bottom, top, top, bottom]]).T
# get the Path object
barpath = path.Path.make_compound_path_from_polys(XY)
# make a patch out of it
patch = patches.PathPatch(barpath)
ax.add_patch(patch)
# update the view limits
ax.set_xlim(left[0], right[-1])
ax.set_ylim(bottom.min(), top.max())
plt.show()
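# The compound-path construction above is the long way around; for a quick look, the same picture can be drawn directly with matplotlib's built-in histogram (a sketch using `Axes.hist`):
#
# ```python
# fig, ax = plt.subplots(figsize=(14, 10))
# ax.hist(timestamps, bins=400)
# plt.show()
# ```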
# ## 2. Search frequency per day of week / year
#
# Okay, so a bar chart grouped by day of week, with one bar per year inside each group. How do I get the day of the week, and how do I match it to a year?
#
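# `datetime.isoweekday()` returns 1 for Monday through 7 for Sunday, so it maps straight onto a 7-slot list. For example:
#
# ```python
# import datetime
# datetime.datetime(2018, 1, 19).isoweekday()   # -> 5, a Friday
# ```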
# In[7]:
datetime_timestamps[0].isoweekday()
# In[8]:
year_dow_pairs = [[ts.year, ts.isoweekday()] for ts in datetime_timestamps]
year_dow_pairs[4]
# Let's create a list of years that we can then use as keys for a dictionary, like so:
#
# ```
# dow_in_year_dict = {
#     "<year>": [counts for Mon, Tue, ..., Sun]
# }
# ```
# In[9]:
#getting year dict
years = list(set([ts.year for ts in datetime_timestamps]))
years.sort()
dow_in_year_dict = {}
for year in years:
    dow_in_year_dict[str(year)] = [0, 0, 0, 0, 0, 0, 0]
#placing days of week in arrays
for ts in year_dow_pairs:
    key = str(ts[0])
    dow_in_year_dict[key][ts[1] - 1] += 1
dow_in_year_dict
# **And now to the bar chart**
# In[10]:
#defining a moving average function that we will use later
def moving_avg(old, new, count):
    """count is the number of values already included in the old average."""
    old = old * (count / (count + 1))
    new = new * (1 / (count + 1))
    return old + new, count + 1
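# As a sanity check, feeding in [2, 4, 6] one value at a time should reproduce the plain mean:
#
# ```python
# avg, count = 2.0, 1                      # the first value counts as already included
# avg, count = moving_avg(avg, 4, count)   # -> (3.0, 2)
# avg, count = moving_avg(avg, 6, count)   # -> (4.0, 3), same as np.mean([2, 4, 6])
# ```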
# In[11]:
#preparing the plotting
# data to plot
n_groups = 7
# create plot
fig, ax = plt.subplots()
fig.set_size_inches(14,10)
index = np.arange(n_groups)
bar_width = 0.1
opacity = 0.8
colors = ['b', 'g','r','c','m','k', 'y', 'brown']
#placing the bars in the plt object
rects = []
i = 0
for year in years:
    year = str(year)
    vals = dow_in_year_dict[year]
    #Let's get the values to be weekly averages
    vals = np.array(vals) / 52
    #plot it (cycle through the colour list in case there are more years than colours)
    rect = plt.bar(index + i * bar_width, vals, bar_width, alpha=opacity,
                   color=colors[i % len(colors)], label=year)
    rects.append(rect)
    i += 1
#Let's add an average bar over all years
#start from zero so the first year is not counted twice by the loop below
averages = np.zeros(7)
yr_count = 0
for year in years:
    new_vals = np.array(dow_in_year_dict[str(year)])
    #moving average calculation
    averages, yr_count = moving_avg(averages, new_vals, yr_count)
averages = averages / 52
rect = plt.bar(index+i*bar_width, averages, bar_width,alpha=opacity,color='orange',label='average')
#And finally plotting
plt.xlabel('Days of Week')
plt.ylabel('Count')
plt.title('Searches by day of week')
plt.xticks(index + bar_width, ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'))
plt.legend()
plt.tight_layout()
plt.show()
# Interesting. Let's look at the data. Apparently I started Googling in 2009 (at least that's when I got my account) and steadily increased my search frequency, up to
#
#
# ## Time of day
# In[12]:
#getting year dict
hour_in_year_dict = {}
for year in years:
    hour_in_year_dict[str(year)] = np.zeros(24)
#iterating over the items and counting the hour-of-day occurrences
for ts in datetime_timestamps:
    yr = str(ts.year)
    h = ts.hour
    hour_in_year_dict[yr][h] += 1
print(hour_in_year_dict['2014'])
# In[13]:
# data to plot
n_groups = 24
# create plot
fig, ax = plt.subplots()
fig.set_size_inches(14,10)
index = np.arange(n_groups)
bar_width = 0.1
opacity = 0.8
colors = ['b', 'g','r','c','m','k', 'y', 'brown']
rects = []
i = 0
for year in years:
    year = str(year)
    vals = hour_in_year_dict[year]
    #let's average it out to a per-day figure
    vals = np.array(vals) / 365
    rect = plt.bar(index + i * bar_width, vals, bar_width, alpha=opacity,
                   color=colors[i % len(colors)], label=year)
    rects.append(rect)
    i += 1
plt.xlabel('Hour of Day')
plt.ylabel('Count')
plt.title('Searches by hour of day')
plt.xticks(index + bar_width, tuple(np.arange(n_groups)))  # hours 0-23
plt.legend()
plt.tight_layout()
plt.show()
# Very interesting. It seems like in 2016 I really spent a lot of time hacking around at night. Or could it be that Google records the search timestamps relative to my home time zone? I spent 3 months in Australia in 2016, roughly 10 hours ahead of my usual UTC+1 time zone.
#
# Still, it's impressive. At my peak around 2015, I sent about 4 queries PER HOUR to Google during the afternoon. That is a LOT of Googling.
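# One way to probe that hypothesis (a sketch): `timestamp_usec` is an epoch timestamp and therefore time-zone independent, while `datetime.fromtimestamp()` above rendered everything in my local zone. Re-rendering a timestamp with an Australian offset shows where the night-time spike would land:
#
# ```python
# from datetime import datetime, timezone, timedelta
#
# ts = timestamps[0]                        # seconds since the epoch, from above
# print(datetime.fromtimestamp(ts).hour)    # hour in my local (UTC+1) zone
# print(datetime.fromtimestamp(ts, timezone(timedelta(hours=10))).hour)  # roughly AEST
# ```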
# # Part 2: Looking at the TERMS
# In[14]:
from collections import Counter
terms = [t[0] for t in searches]
term_counts = Counter(terms)
term_counts = list(term_counts.items())
term_counts.sort(key=lambda tup: tup[1],reverse=True)
for term in term_counts[0:30]:
    print(term)
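# `Counter` also has a built-in helper that does the sorting for us; the loop above is roughly equivalent to:
#
# ```python
# for term, n in Counter(terms).most_common(30):
#     print(term, n)
# ```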
# Okay, but that is counting every unique query string. Let's clean that up a bit: split the queries into words and remove any special characters.
# In[15]:
import re
def cleanup_term(term):
    words = re.findall(r'\w+', term)
    return [w.lower() for w in words]
clean_terms = [cleanup_term(t) for t in terms]
print(len(clean_terms))
terms_singles = []
list(map(terms_singles.extend, clean_terms))
print(len(terms_singles))
singles_counted = list(Counter(terms_singles).items())
singles_counted.sort(key=lambda tup: tup[1], reverse=True)
singles_counted[0:40]
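# For instance, `cleanup_term` turns a raw query into lowercase word tokens:
#
# ```python
# cleanup_term("Pascal's GitHub gists!")   # -> ['pascal', 's', 'github', 'gists']
# ```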
# In[16]:
from wordcloud import WordCloud
wordcloud = WordCloud().generate(" ".join(terms_singles))
# In[51]:
plt.axis("off")
fig = plt.figure()
fig.set_size_inches(21,9)
# lower max_font_size
wordcloud = WordCloud(max_font_size=80, width=2100, height=900, colormap='prism').generate(" ".join(terms_singles))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
# Okay, there is "Copenhagen"; that is curious. I wonder why!
# In[40]:
search_term = 'pascal'.lower()
count = 0
for term in terms:
    if search_term in term.lower():
        count += 1
        print(term)
print("TOTAL: " + str(count))