Created
August 9, 2013 20:50
-
-
Save six5532one/6197080 to your computer and use it in GitHub Desktop.
Zipf's Law of the Internet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import pylab | |
import numpy as np | |
import math | |
import csv | |
import random | |
import datetime | |
import pymongo | |
from random import randint, uniform | |
from scipy.stats import pearsonr | |
from collections import OrderedDict | |
from mpl_toolkits.mplot3d import Axes3D | |
from matplotlib.patches import Ellipse | |
from matplotlib import font_manager | |
def get_frequencies(date):
    """Return per-domain referral totals for ``date``, largest first.

    Queries the module-level ``collection`` for every document on ``date``
    whose ``referring_domain`` is not ``"__alldomains__"`` (presumably an
    aggregate row -- confirm against the data), reads ``hourly_pvs.total``
    from each one and returns the totals as a NumPy array sorted in
    descending order, so position 0 holds the top-ranked referrer's total.
    """
    query = {"date": date, "referring_domain": {"$ne": "__alldomains__"}}
    totals = sorted(
        (doc["hourly_pvs"]["total"] for doc in collection.find(query)),
        reverse=True,
    )
    return np.array(totals)
def get_ppf_list(percentile_list, values):
    """Map each percentile to its ``get_ppf`` result for ``values``.

    The returned ``OrderedDict`` keeps the percentiles in the order they
    appear in ``percentile_list``.
    """
    ppf_by_percentile = OrderedDict()
    for percentile in percentile_list:
        ppf_by_percentile[percentile] = get_ppf(percentile, values)
    return ppf_by_percentile
def get_ppf(percentile, values):
    """Return the rank index at which cumulative ``values`` pass a percentile.

    ``values`` is expected to be ordered by rank (largest first).  The
    threshold is ``percentile * total``; the result is the zero-based index
    of the first entry whose running total strictly exceeds that threshold,
    which is also the number of leading entries whose running total is
    still at or below it.

    Args:
        percentile: Fraction of the total mass to cover (e.g. ``0.95``).
        values: Array-like of per-rank counts.

    Returns:
        The index as an ``int``, or ``None`` when no running total exceeds
        the threshold (e.g. ``percentile >= 1`` or empty ``values``).
    """
    # scipy.stats.zipf.ppf(percentile, b=1.1) stalled, so the point is read
    # off the observed data instead, since referrals is a discrete r.v.
    values = np.asarray(values)
    if values.size == 0:
        return None
    # One cumulative pass replaces re-summing values[:n] for every n.
    running_totals = np.cumsum(values)
    mass = percentile * running_totals[-1]
    past_mass = np.flatnonzero(running_totals > mass)
    if past_mass.size == 0:
        return None
    return int(past_mass[0])
def get_rand_date(start, end):
    """Return a random date in the half-open window ``[start, end)``.

    A whole number of days, drawn uniformly from ``0`` to
    ``(end - start).days - 1``, is added to ``start``; any time-of-day
    component of ``start`` is therefore carried over to the result.

    Args:
        start: ``datetime`` marking the first selectable day.
        end: ``datetime`` marking the (excluded) end of the window.

    Returns:
        ``start`` shifted forward by the randomly chosen number of days.

    Raises:
        ValueError: If the window spans less than one full day.
    """
    days_in_window = (end - start).days
    if days_in_window < 1:
        raise ValueError(
            "end must be at least one day after start (got {} day(s))".format(days_in_window)
        )
    # Previously the drawn date was discarded in favour of a hard-coded
    # 2013-03-10; return the value that was actually sampled.
    return start + datetime.timedelta(days=randint(0, days_in_window - 1))
def get_top_referrers(n, frequencies, date):
    """Return the top ``n`` referring domains on ``date``, best-ranked first.

    Each of the first ``n`` entries of ``frequencies`` is looked up in the
    module-level ``collection`` by its ``hourly_pvs.total`` on ``date``.

    Args:
        n: Maximum number of domains to return.
        frequencies: Rank-ordered totals (largest first), as produced by
            ``get_frequencies``.
        date: The date whose documents are searched.

    Returns:
        A list of ``referring_domain`` strings; shorter than ``n`` when
        ``frequencies`` has fewer than ``n`` entries.

    Raises:
        LookupError: If no unused document matches one of the totals.
    """
    top_domains = []
    for total in frequencies[:n]:
        match = collection.find_one({
            "date": date,
            "hourly_pvs.total": int(total),
            # Matching on the total alone returns the same document for
            # every rank that shares it, so exclude domains already chosen
            # (and the "__alldomains__" row, which get_frequencies also
            # leaves out of the ranking).
            "referring_domain": {"$nin": ["__alldomains__"] + top_domains},
        })
        if match is None:
            raise LookupError(
                "no referring domain with {} pageviews on {}".format(int(total), date)
            )
        top_domains.append(match["referring_domain"])
    return top_domains
def get_radius(area):
    """Return the radius of a circle with the given area.

    Args:
        area: Non-negative area of the circle.

    Returns:
        ``sqrt(area / pi)`` as a float.

    Raises:
        ValueError: If ``area`` is negative.
    """
    if area < 0:
        raise ValueError("area must be non-negative, got {!r}".format(area))
    radius = math.sqrt(area / math.pi)
    # Round-trip sanity check.  The tolerance is relative and two-sided: a
    # fixed absolute bound of 1 is smaller than floating-point rounding once
    # the area is very large, and a one-sided test never sees an undershoot.
    tolerance = 1e-9 * max(abs(area), 1.0)
    assert abs(math.pi * radius ** 2 - area) <= tolerance, "Returned incorrect radii for ellipse"
    return radius
def get_ellipses(data):
    """Build one randomly placed, randomly coloured circle patch per value.

    For every entry of ``data`` the radius returned by ``get_radius`` is
    used as both ``width`` and ``height`` of an ``Ellipse`` whose centre is
    drawn at random inside the square ``[0, axes_lim)`` x ``[0, axes_lim)``
    (``axes_lim`` is a module-level value).  Opacity is one of 0.5-0.9 and
    the face colour is a random RGB triple.

    NOTE(review): ``width``/``height`` receive the radius rather than the
    diameter, so each drawn area is presumably a constant fraction of the
    input value (still proportional across patches) -- confirm intended.

    Returns:
        A list of ``Ellipse`` patches in the same order as ``data``.
    """
    def _random_circle(area):
        # Keep the order of the random draws: alpha, colour, then position.
        radius = get_radius(area)
        opacity = random.randint(5, 9) / 10.0
        colour = pylab.rand(3)
        circle = Ellipse(xy=pylab.rand(2) * axes_lim, width=radius, height=radius)
        circle.set_alpha(opacity)
        circle.set_facecolor(colour)
        return circle

    return [_random_circle(area) for area in data]
def add_patches(axis, patches):
    """Attach every patch to ``axis`` and clip it to the axis bounding box.

    Each patch is first added with ``axis.add_artist`` and then has its clip
    box set to ``axis.bbox``.  Nothing is returned.
    """
    attach = axis.add_artist
    for shape in patches:
        attach(shape)
        shape.set_clip_box(axis.bbox)
# Open a MongoDB client with default settings and select the collection that
# the query helpers read through the module-level name ``collection``.
connection = pymongo.MongoClient()
db = connection["parsely_insights"]
collection = db["refdomain_destinations"]

# Choose the date to analyse via get_rand_date, bounded by 2013-01-01 and the
# current time.
# NOTE(review): the original comment gave the window as 2013-01-01 to
# 2013-07-23, while the code passes today's date as the end -- confirm which
# end date is intended.
start = datetime.datetime(2013, 1, 1)
today = datetime.datetime.today()
random_date = get_rand_date(start, today)
# Plot referral frequency against rank, both log-transformed, so that a
# Zipf-like distribution shows up as a roughly straight line.
frequencies = get_frequencies(random_date)
log_ranks = np.log(np.arange(frequencies.size) + 1)  # ranks start at 1
log_frequencies = np.log(frequencies)
plt.plot(log_ranks, log_frequencies, color="k", linewidth=2.0)
plt.title("Distribution of pageviews referred to the Parse.ly content network \nfrom other websites on {} \n(note use of log-log scale)".format(random_date.strftime("%Y-%m-%d")), fontsize="x-large", weight="medium", family="Arial")
plt.xlabel("Referring Domain (Rank-Ordered)", family="Arial", fontsize="large")
plt.ylabel("Number of Pageviews Referred", family="Arial", fontsize="large")

# Report how linear the log-log relationship is.  This prints whatever
# scipy.stats.pearsonr returns for the two log series; it is NOT the slope
# -b of Zipf's Law (log R = a - b log n).
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("\nPearson correlation coefficient: {}".format(pearsonr(log_ranks, log_frequencies)))
plt.show()
# Visualize the distribution of pageviews referred by each referring domain
# as one circle per domain on a square canvas with hidden ticks.
fig = pylab.figure(figsize=(22, 16))
ax = fig.add_subplot(111, aspect="equal")
scaling_factor = 0.75
# Side length of the canvas; also read by get_ellipses to place the circles.
axes_lim = math.ceil(scaling_factor * math.sqrt(sum(frequencies)))
ax.set_xlim(0, axes_lim)
ax.set_ylim(0, axes_lim)
ax.set_xticks([])
ax.set_yticks([])

n = 10  # the number of referring domains to display on the legend
top_referrers = get_top_referrers(n, frequencies, random_date)
percentiles_to_visualize = [0.75, 0.95, 0.99]
ppf_values = get_ppf_list(percentiles_to_visualize, frequencies)
reference_points = get_ellipses(frequencies)

# Divide the patches into groups that can be displayed sequentially: each
# pass adds the domains between two consecutive percentile boundaries on top
# of those already drawn, then saves a snapshot of the figure.
boundaries = [0] + [ppf_values[percentile] for percentile in percentiles_to_visualize]
font = font_manager.FontProperties(family="Arial", size=24)
for index, (percentile, value, upper) in enumerate(zip(percentiles_to_visualize, boundaries[:-1], boundaries[1:])):
    add_patches(ax, reference_points[value:upper])
    # display top referrers in legend
    plt.legend(reference_points[:n], top_referrers, bbox_to_anchor=(1.3, 0.75))
    # Explicit loop: map() is lazy on Python 3, so using it for its side
    # effects would silently leave the legend font unchanged.
    for legend_text in ax.get_legend().get_texts():
        legend_text.set_font_properties(font)
    # "fontsize" and "size" are aliases of the same text property; the title
    # used to pass both ("x-large" and 32).  Only the explicit point size is
    # kept.
    plt.title("Domains referring top {}% of network traffic\n(area of circle proportional to number\nof network referrals from domain)".format(int(100 * percentile)), family="Arial", fontsize=32)
    plt.savefig("domaincircles_{}".format(index))
# Display network traffic distribution for the top quartile: for each
# percentile, the get_ppf result (a rank index into the sorted referrers).
fig = plt.figure()
ax = fig.add_subplot(111)
percentiles = [0.75, 0.8, 0.85, 0.9, 0.925, 0.95, 0.975, 0.99]
domain_ppfs = get_ppf_list(percentiles, frequencies)
x = np.arange(len(percentiles))
# list(...) so this also works where dict.values() returns a view.
y = np.array(list(domain_ppfs.values()))
# Left-aligned bars keep the x + 0.5 ticks inside each bar regardless of the
# matplotlib version's default bar alignment.
plt.bar(x, y, color="k", alpha=0.25, align="edge")
# Fix the tick positions first, then label them.
ax.set_xticks(x + 0.5)
# "{:g}" keeps fractional percentiles such as 92.5 and 97.5 intact instead of
# truncating them to 92 and 97.
ax.set_xticklabels(["{:g}".format(100 * percentile) for percentile in percentiles], family="Arial")
ax.set_xlabel("Top n percent of total network pageviews", fontsize="large", family="Arial")
ax.set_ylabel("Number of referring domains", fontsize="large", family="Arial")
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment