@six5532one
Created August 9, 2013 20:50
Zipf's Law of the Internet
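"""Explore Zipf's Law in referral traffic to the Parse.ly content network.

For a chosen date, pull per-domain referral counts from MongoDB, plot the
rank-frequency relationship on a log-log scale, draw circles whose areas are
proportional to each domain's referrals (grouped by traffic percentile), and
chart how many referring domains account for the top slices of network
pageviews.
"""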
import matplotlib.pyplot as plt
import pylab
import numpy as np
import math
import csv
import random
import datetime
import pymongo
from random import randint, uniform
from scipy.stats import pearsonr
from collections import OrderedDict
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.patches import Ellipse
from matplotlib import font_manager
def get_frequencies(date):
    """Get referral frequencies and rankings of referring domains."""
    frequencies = [doc["hourly_pvs"]["total"] for doc in collection.find({"date": date, "referring_domain": {"$ne": "__alldomains__"}})]
    frequencies.sort(reverse=True)
    return np.array(frequencies)
def get_ppf_list(percentile_list, values):
    """Find distribution of network referrals from domains."""
    return OrderedDict([(percentile, get_ppf(percentile, values)) for percentile in percentile_list])
def get_ppf(percentile, values):
    # scipy.stats.zipf.ppf(percentile, b=1.1) stalled, so compute the percent
    # point function directly, since referrals is a discrete r.v.
    mass = percentile * sum(values)
    n = 1
    while n <= values.size:
        if sum(values[:n]) > mass:
            return n - 1
        n += 1
    return None
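# Toy check of get_ppf's logic (illustrative values only): the vector below
# sums to 100, so the 0.75 mass threshold is 75; the running sums are
# 50, 80, 90, ..., the shortest prefix exceeding 75 has length 2, and
# get_ppf returns 2 - 1 = 1, the 0-based rank that crosses the threshold.
_toy_frequencies = np.array([50, 30, 10, 5, 5])
assert get_ppf(0.75, _toy_frequencies) == 1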
def get_rand_date(start, end):
    """Return a random date between start and end."""
    days_in_window = (end - start).days
    random_date = start + datetime.timedelta(randint(0, days_in_window - 1))
    return random_date
def get_top_referrers(n, frequencies, date):
    """Return top n referring domains on a given date."""
    return [collection.find_one({"date": date, "hourly_pvs.total": int(frequencies[rank])})["referring_domain"] for rank in range(n)]
def get_radius(area):
    """Return radius of a circle with the given area."""
    radius = math.sqrt(area / math.pi)
    assert abs((math.pi * radius**2) - area) < 1, "Returned incorrect radius for ellipse"
    return radius
def get_ellipses(data):
    """Return a list of Ellipse patches with specified areas."""
    # axes_lim is set at module level before this function is called
    ellipses = []
    for area in data:
        radius = get_radius(area)
        alpha = float(random.randint(5, 9)) / 10
        facecolor = pylab.rand(3)
        ellipse = Ellipse(xy=pylab.rand(2) * axes_lim, width=radius, height=radius)
        ellipse.set_alpha(alpha)
        ellipse.set_facecolor(facecolor)
        ellipses.append(ellipse)
    return ellipses
def add_patches(axis, patches):
    """Add each patch to the axis and clip it to the axis bounding box."""
    for patch in patches:
        axis.add_artist(patch)
        patch.set_clip_box(axis.bbox)
connection = pymongo.MongoClient()
db = connection.parsely_insights
collection = db.refdomain_destinations
# get a random date from 2013-01-01 to 2013-07-23
today = datetime.datetime.today()
start = datetime.datetime.strptime("2013-01-01", "%Y-%m-%d")
random_date = get_rand_date(start, today)
# plot frequencies of referrals to network, rankings on log-log scale
frequencies = get_frequencies(random_date)
plt.plot(np.log((np.arange(frequencies.size) + 1)), np.log(frequencies), color="k", linewidth=2.0)
plt.title("Distribution of pageviews referred to the Parse.ly content network \nfrom other websites on {} \n(note use of log-log scale)".format(random_date.strftime("%Y-%m-%d")), fontsize="x-large", weight="medium", family="Arial")
plt.xlabel("Referring Domain (Rank-Ordered)", family="Arial", fontsize="large")
plt.ylabel("Number of Pageviews Referred", family="Arial", fontsize="large")
# Report the Pearson correlation of log-rank vs. log-frequency as a check on
# how linear the log-log relationship is (Zipf's Law: log R = a - b log n)
r, p_value = pearsonr(np.log(np.arange(frequencies.size) + 1), np.log(frequencies))
print "\nPearson correlation coefficient: {} (two-tailed p-value: {})".format(r, p_value)
plt.show()
# visualize distribution of pageviews referred by each referring domain
fig = pylab.figure(figsize=(22, 16))
ax = fig.add_subplot(111, aspect="equal")
scaling_factor = 0.75
axes_lim = math.ceil(scaling_factor * math.sqrt(sum(frequencies)))
ax.set_xlim(0, axes_lim)
ax.set_ylim(0, axes_lim)
ax.set_xticks([])
ax.set_yticks([])
n = 10  # the number of referring domains to display on the legend
top_referrers = get_top_referrers(n, frequencies, random_date)
percentiles_to_visualize = [0.75, 0.95, 0.99]
ppf_values = get_ppf_list(percentiles_to_visualize, frequencies)
reference_points = get_ellipses(frequencies)
# divide the patches into percentile groups and display them cumulatively,
# saving one image per group (e.g. if the thresholds fall at ranks r75, r95
# and r99, the groups are reference_points[0:r75], [r75:r95] and [r95:r99])
boundaries = [0] + [ppf_values[percentile] for percentile in percentiles_to_visualize]
for index, (percentile, value) in enumerate(zip(percentiles_to_visualize, boundaries[:-1])):
    add_patches(ax, reference_points[value:boundaries[index + 1]])
    # display top referrers in legend
    plt.legend(reference_points[:n], top_referrers, bbox_to_anchor=(1.3, 0.75))
    font = font_manager.FontProperties(family="Arial", size=24)
    map(lambda x: x.set_font_properties(font), ax.get_legend().get_texts())
    plt.title("Domains referring top {}% of network traffic\n(area of circle proportional to number\nof network referrals from domain)".format(int(100 * percentile)), family="Arial", size=32)
    plt.savefig("domaincircles_{}".format(index))
# display network traffic distribution for top quartile
fig = plt.figure()
ax = fig.add_subplot(111)
percentiles = [0.75, 0.8, 0.85, 0.9, 0.925, 0.95, 0.975, 0.99]
domain_ppfs = get_ppf_list(percentiles, frequencies)
x = np.arange(len(percentiles))
y = np.array(domain_ppfs.values())
ax.set_xticks(x + 0.5)
ax.set_xticklabels([str(int(100 * percentile)) for percentile in percentiles], family="Arial")
plt.bar(x, y, color="k", alpha=0.25)
ax.set_xlabel("Top n percent of total network pageviews", fontsize="large", family="Arial")
ax.set_ylabel("Number of referring domains", fontsize="large", family="Arial")
plt.show()