Skip to content

Instantly share code, notes, and snippets.

@controversial
Last active January 26, 2016 23:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save controversial/9de0cddd41613ce03dc5 to your computer and use it in GitHub Desktop.
Save controversial/9de0cddd41613ce03dc5 to your computer and use it in GitHub Desktop.
Plot the 25 most common domains found in Safari's history cache using matplotlib
import os
import urllib2
from collections import Counter
#FETCH DATA
# Directory scanned for history entries and the extension those entries are
# expected to carry (per the original script: "<encoded URL>.webhistory").
HISTORY_DIR = os.path.expanduser("~/Library/Caches/Metadata/Safari/History")
HISTORY_EXT = ".webhistory"

files = os.listdir(HISTORY_DIR)
# Keep only entries whose name begins with a URL scheme; "https" also starts
# with "http", so one prefix test covers both.
webpages = [f for f in files if f.startswith("http")]
# Single-argument parenthesised form prints the same text under Python 2.
print("Found {} pieces of data".format(len(webpages)))
# Strip the extension only where it is actually present, instead of blindly
# chopping a fixed 11 characters off every name.
webpages = [
    f[:-len(HISTORY_EXT)] if f.endswith(HISTORY_EXT) else f
    for f in webpages
]
# The file names are percent-encoded URLs; decode them back to plain URLs.
visited = [urllib2.unquote(w) for w in webpages]
# Now we have a list of all the webpages stored in history.
# Reduce each URL to its network location (the domain it came from).
domains = [urllib2.urlparse.urlparse(page).netloc for page in visited]
count = Counter(domains)
# Top 25 visited domains as (domain, number_of_entries) pairs, most common first.
tops = count.most_common(25)
#HISTOGRAM
from matplotlib import pyplot as plt

# zip(*tops) cannot be unpacked into two names when tops is empty, so stop
# with a clear message instead of an obscure unpacking ValueError.
if not tops:
    raise SystemExit("No history entries found; nothing to plot.")

items, counts = zip(*tops)  # Domain names, and how many times each appears
indices = range(len(tops))  # Left edge of each bar on the x axis

# Graph the graph.
# align="edge" makes bar i span [i, i+1] regardless of the matplotlib
# version's default alignment, which is what the i+0.5 tick positions and
# the label offsets below assume.
graph = plt.bar(indices, counts, 1, align="edge",
                edgecolor="#FF3300", facecolor="#FF7700")

# Label stuff
xvals = [i + 0.5 for i in indices]  # Horizontal middle of each unit-wide bar
plt.xticks(xvals, items, rotation="vertical")  # Label the bars
plt.title("Top 25 Domains")  # Title the graph

# Label the tops of the bars with their counts. The text comes straight from
# the data rather than being read back from the bar geometry.
for rect, visits in zip(graph, counts):
    x = rect.get_x() + 0.6
    y = rect.get_height() + 50  # Small gap between the bar top and its label
    label = str(visits)
    plt.text(x, y, label, ha="center", va="bottom", rotation="vertical")

# Increase spacing: keep the original 4000 ceiling for typical data, but grow
# it when the tallest bar (plus its label) would otherwise be clipped.
plt.ylim((0.0, max(4000.0, max(counts) * 1.15)))

# Save an image
plt.savefig("topsites.png", bbox_inches="tight")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment