#!/usr/local/bin/python
import datetime as datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib.dates as md
import csv
import datetime
from scipy import stats
from pylab import *
import matplotlib.ticker as ticker
import pprint
# plot percentiles 50 and 90 for data for which we have network activity
def main():
    fileReader = csv.reader(open('navigationTimingData26Jan15FebWithConnectStart.txt', 'rb'), delimiter=',')
    # columns:
    # timestamp
    # country
    # requestStart
    # responseEnd
    # domInteractive
    # mediaWikiLoad
    mediaWikiLoad = list()
    tp50 = list()
    tp90 = list()
    tpBucket = list()
    rs50 = list()
    rs90 = list()
    rsBucket = list()
    dates = list()
    latencies = list()
    # responseStart - connectStart
    tcp50 = list()
    tcp90 = list()
    tcpBucket = list()
    dom50 = list()
    dom90 = list()
    domBucket = list()
    timestamp = None
    oceania = set(['AS', 'AU', 'CK', 'FJ', 'FM', 'GU', 'KI', 'MH', 'MP', 'NC', 'NF', 'NR', 'NU', 'NZ', 'PF', 'PG', 'PN', 'PW', 'SB', 'TK', 'TO', 'TV', 'UM', 'VU', 'WF'])
    asia = set(['BD', 'BT', 'HK', 'ID', 'JP', 'KH', 'KP', 'KR', 'MM', 'MN', 'MO', 'MY', 'PH', 'SG', 'TH', 'TW', 'VN'])
    northAmerica = set(['US', 'CA'])
    label_oceania = "Oceania"
    label_northAmerica = "North America"
    label_asia = "SE Asia"
    label = label_asia
    countries = asia
    MIN_BUCKET_SIZE = 1000
    for row in fileReader:
        '''
        input row layout (written by the preprocessing script below):
        result.append(day)
        result.append(country)
        result.append(domInteractive)
        result.append(mediaWikiLoad)
        result.append(dns)
        result.append(event_connectStart)
        result.append(event_connectEnd)
        result.append(requestStart)
        result.append(event_responseStart)
        result.append(responseEnd)
        '''
        recordTimestamp = row[0]
        country = row[1]
        domInteractive = row[2]
        mediaWikiLoadComplete = row[3]
        dns = row[4]
        connectStart = row[5]
        connectEnd = row[6]
        requestStart = row[7]
        responseStart = row[8]
        responseEnd = row[9]
        # use only data for which we have dns activity
        if dns == "NULL" or country not in countries:
            continue
        networkTime = 0
        if responseStart != "NULL" and connectStart != "NULL":
            networkTime = int(responseStart) - int(connectStart)
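        # (In this dataset these fields appear to be millisecond offsets from
        #  navigationStart, so responseStart - connectStart spans TCP connect,
        #  the request and the wait for the first byte, excluding the DNS lookup.)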
        # if responseStart == responseEnd but dnsLookup > 0
        # this indicates a local cache hit
        # dnsLookup is equal to fetchStart when retrieving from local cache.
        # See: https://dvcs.w3.org/hg/webperf/raw-file/tip/specs/NavigationTiming/Overview.html
        responseTime = int(responseStart) - int(responseEnd)
        # some records report times that seem backwards
        if responseTime == 0 or networkTime > int(mediaWikiLoadComplete):
            continue
        if timestamp != recordTimestamp or timestamp == None:
            # calculate percentiles for the day just finished
            if timestamp != None and len(tpBucket) >= MIN_BUCKET_SIZE:
                dates.append(timestamp)
                tp50.append(stats.scoreatpercentile(sorted(tpBucket), 50))
                tp90.append(stats.scoreatpercentile(sorted(tpBucket), 90))
                rs50.append(stats.scoreatpercentile(sorted(rsBucket), 50))
                rs90.append(stats.scoreatpercentile(sorted(rsBucket), 90))
                tcp50.append(stats.scoreatpercentile(sorted(tcpBucket), 50))
                tcp90.append(stats.scoreatpercentile(sorted(tcpBucket), 90))
                dom50.append(stats.scoreatpercentile(sorted(domBucket), 50))
                dom90.append(stats.scoreatpercentile(sorted(domBucket), 90))
                print timestamp
                print "bucket len"
                print len(tpBucket)
            # reset timestamp and start new daily buckets
            timestamp = recordTimestamp
            tpBucket = list()
            rsBucket = list()
            tcpBucket = list()
            domBucket = list()
        if requestStart != "NULL":
            rsBucket.append(int(requestStart))
        if networkTime != 0:
            tcpBucket.append(networkTime)
        tpBucket.append(int(mediaWikiLoadComplete))
        latencies.append(int(mediaWikiLoadComplete))
        domBucket.append(int(domInteractive))
    # last record: flush the final day's bucket
    if len(tpBucket) >= MIN_BUCKET_SIZE:
        dates.append(recordTimestamp)
        tp50.append(stats.scoreatpercentile(sorted(tpBucket), 50))
        tp90.append(stats.scoreatpercentile(sorted(tpBucket), 90))
        rs50.append(stats.scoreatpercentile(sorted(rsBucket), 50))
        rs90.append(stats.scoreatpercentile(sorted(rsBucket), 90))
        tcp50.append(stats.scoreatpercentile(sorted(tcpBucket), 50))
        tcp90.append(stats.scoreatpercentile(sorted(tcpBucket), 90))
        dom50.append(stats.scoreatpercentile(sorted(domBucket), 50))
        dom90.append(stats.scoreatpercentile(sorted(domBucket), 90))
        print timestamp
        print "bucket len"
        print len(tpBucket)
    # Transform the lists into the numpy arrays matplotlib uses
    tp50_np = np.array(tp50)
    tp90_np = np.array(tp90)
    rs50_np = np.array(rs50)
    rs90_np = np.array(rs90)
    tcp50_np = np.array(tcp50)
    tcp90_np = np.array(tcp90)
    dom50_np = np.array(dom50)
    dom90_np = np.array(dom90)
    tpDatesFriendly = []
    tpDates = []
    for d in dates:
        # convert into matplotlib's internal floating point date representation
        _date = md.datestr2num(d)
        tpDates.append(_date)
        tpDatesFriendly.append(datetime.datetime.strptime(d, "%Y%m%d%H%M"))
    dates_np = np.array(tpDates)
    # matplotlib date format object
    hfmt = md.DateFormatter('%m/%d')
    f = figure()
    plt.title('Page Load Latencies ' + label + '.\n Cold Cache')
    plt.plot_date(dates_np, tp50_np, 'b-', label='percentile 50')
    plt.plot_date(dates_np, tp90_np, 'r-', label='percentile 90')
    plt.legend(loc='right')
    plt.ylabel('Time (ms)')
    plt.grid(True)
    # bizarre matplotlib date format
    # 735265. 735266. 735267. 735268. 735269. 735270. 735271. 735272
    # Feb 1      2       3       4       5       6       7       8
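    # (These are matplotlib serial day numbers, essentially datetime.toordinal() values
    #  under matplotlib's classic pre-3.3 date epoch; md.datestr2num('2014-02-05') gives
    #  735269.0, so the vertical line below marks Feb 5, 2014.)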
    ulsfo_time = 735269
    plt.axvline(x=ulsfo_time, linewidth=2, color='m', linestyle='dashed')
    plt.autoscale(True)
    a = gca()
    #a.xaxis.set_major_locator(DayLocator())
    a.xaxis.set_major_formatter(hfmt)
    #f.autofmt_xdate()
    plt.show()

    plt.title('ResponseStart - ConnectStart. ' + label + '.\nNetwork time until first byte minus DNS lookup.')
    plt.plot_date(dates_np, tcp50_np, 'b-', label='percentile 50')
    plt.plot_date(dates_np, tcp90_np, 'r-', label='percentile 90')
    plt.legend(loc='right')
    plt.ylabel('Time (ms)')
    plt.grid(True)
    plt.axvline(x=ulsfo_time, linewidth=2, color='m', linestyle='dashed')
    plt.autoscale(True)
    a = gca()
    a.xaxis.set_major_formatter(hfmt)
    plt.show()

    plt.title('DOMContentInteractive ' + label)
    plt.plot_date(dates_np, dom50_np, 'b-', label='percentile 50')
    plt.plot_date(dates_np, dom90_np, 'r-', label='percentile 90')
    plt.legend(loc='right')
    plt.ylabel('Time (ms)')
    plt.grid(True)
    plt.axvline(x=ulsfo_time, linewidth=2, color='m', linestyle='dashed')
    plt.autoscale(True)
    a = gca()
    a.xaxis.set_major_formatter(hfmt)
    plt.show()


if __name__ == "__main__":
    main()

#!/usr/local/bin/python
# calculates per country weekly percentiles
#
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib.dates as md
import csv
import datetime
from scipy import stats
from pylab import *
import matplotlib.ticker as ticker
import pprint
import datetime;
def calculatePercentile(bucket, percentile=50):
    return stats.scoreatpercentile(sorted(bucket), percentile)
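
# Illustrative example (made-up values, not from the dataset): with scipy's default
# linear interpolation, calculatePercentile([100, 200, 300, 400], 50) == 250.0 and
# calculatePercentile([100, 200, 300, 400], 90) == 370.0.
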
# calculate weekly percentiles for the week before, during and after the ulsfo deployment
def main():
    fileReader = csv.reader(open('navigationTimingData26Jan15FebWithConnectStart.txt', 'rb'), delimiter=',')
    tp50 = {}
    sampleSize = {}
    tpBucket = {}
    # initialized from the first record's timestamp
    nextWeek = None
    # intervals:
    # 26th Jan to 1st Feb
    # 2nd Feb to 8th Feb
    # 9th Feb to 15th Feb
    # week1_time = 201401260000
    # week2_time = 201402010000
    # week3_time = 201402080000
    # week4_time = 201402150000
    oceania = set(['AS', 'AU', 'CK', 'FJ', 'FM', 'GU', 'KI', 'MH', 'MP', 'NC', 'NF', 'NR', 'NU', 'NZ', 'PF', 'PG', 'PN', 'PW', 'SB', 'TK', 'TO', 'TV', 'UM', 'VU', 'WF'])
    asia = set(['BD', 'BT', 'HK', 'ID', 'JP', 'KH', 'KP', 'KR', 'MM', 'MN', 'MO', 'MY', 'PH', 'SG', 'TH', 'TW', 'VN'])
    northAmerica = set(['US', 'CA'])
    ULSFOCountries = oceania.union(asia, northAmerica)
    # this is a small sample size for a week of data: 300+ samples per day
    MIN_BUCKET_SIZE = 1000
    for row in fileReader:
        '''
        input row layout (written by the preprocessing script below):
        result.append(day)
        result.append(country)
        result.append(domInteractive)
        result.append(mediaWikiLoad)
        result.append(dns)
        result.append(event_connectStart)
        result.append(event_connectEnd)
        result.append(requestStart)
        result.append(event_responseStart)
        result.append(responseEnd)
        '''
        recordTimestamp = datetime.datetime.strptime(row[0], "%Y%m%d%H%M")
        country = row[1]
        domInteractive = row[2]
        mediaWikiLoadComplete = row[3]
        dns = row[4]
        connectStart = row[5]
        connectEnd = row[6]
        requestStart = row[7]
        responseStart = row[8]
        responseEnd = row[9]
        # if country in ULSFOCountries:
        #     continue
        # the very 1st timestamp starts our week count
        if nextWeek == None:
            nextWeek = recordTimestamp + datetime.timedelta(days=6)
        if recordTimestamp > nextWeek:
            print "changing weeks at {0}".format(recordTimestamp)
            # week changed: calculate tp50 for the finished week, for all countries
            # (use a separate loop variable so the current row's country is not clobbered)
            for c in tpBucket.keys():
                if tp50.get(c) == None:
                    tp50[c] = []
                    sampleSize[c] = []
                p = calculatePercentile(tpBucket.get(c))
                tp50.get(c).append(p)
                sampleSize[c].append(len(tpBucket.get(c)))
            # reset
            tpBucket = {}
            nextWeek = recordTimestamp + datetime.timedelta(days=6)
        if tpBucket.get(country) == None:
            tpBucket[country] = []
        tpBucket[country].append(int(mediaWikiLoadComplete))
    # last calculation: flush the final (partial) week
    print "changing weeks at {0}".format(recordTimestamp)
    for c in tpBucket.keys():
        if tp50.get(c) == None:
            tp50[c] = []
            sampleSize[c] = []
        p = calculatePercentile(tpBucket.get(c))
        tp50.get(c).append(p)
        sampleSize[c].append(len(tpBucket.get(c)))

    # print a MediaWiki table with the per-country weekly medians
    print '{| class="wikitable"'
    print "|-"
    print "!Country !! 50th pctl week1 01/26 (ms) !! 50th pctl week2 02/02 (ms) !! 50th pctl week3 02/09 (ms) !! Difference week1-week2 !! Difference week1-week3"
    print "|-"
    for country in tp50.keys():
        tp = tp50[country]
        # we need to have been able to calculate three weekly percentiles
        if len(tp) == 3:
            # the weekly sample size per country also has to be at least MIN_BUCKET_SIZE
            numberOfSamples = min(sampleSize.get(country))
            if numberOfSamples >= MIN_BUCKET_SIZE and country not in ULSFOCountries:
                msg = "{0}".format(country)
                for t in tp:
                    msg = msg + " || " + str(t)
                difference1 = tp[0] - tp[1]
                difference2 = tp[0] - tp[2]
                print "| {0} || {1} || {2}".format(msg, difference1, difference2)
                print "|-"
    print "|}"


if __name__ == "__main__":
    main()

#!/usr/local/bin/python
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import csv
import datetime
from scipy import stats
from pylab import *
import matplotlib.ticker as ticker
import pprint
'''
"id",0
"uuid",1
"clientIp",2
"clientValidated",3
"isTruncated",4
"timestamp",5
"userAgent",6
"webHost",7
"wiki",8
"event_action",9
"event_connectEnd",10
"event_connectStart",11
"event_dnsLookup",12
"event_domComplete",13
"event_domInteractive",14
"event_fetchStart",15
"event_isAnon",16
"event_isHttps",17
"event_loadEventEnd",18
"event_loadEventStart",19
"event_mediaWikiLoadComplete",20
"event_mobileMode",21
"event_originCountry",22
"event_pageId",23
"event_redirectCount",24
"event_redirecting",25
"event_requestStart",26
"event_responseEnd",27
"event_responseStart",28
"event_revId",29
'''
# Processes the csv file and consolidates per-second timestamps into daily timestamps
def main():
    timing = []
    fileReader = csv.reader(open('navigationTimingData26Jan15FebWithConnectStart.csv', 'rb'), delimiter=',', quotechar='"')
    for row in fileReader:
        # "timestamp","event_requestStart","event_responseEnd","event_mediaWikiLoadComplete","event_domInteractive","event_originCountry","event_dnsLookup","event_connectStart","event_responseStart","event_connectEnd"
        timestamp = row[0]
        requestStart = row[1]
        responseEnd = row[2]
        mediaWikiLoad = row[3]
        domInteractive = row[4]
        country = row[5]
        dns = row[6]
        event_connectStart = row[7]
        event_responseStart = row[8]
        event_connectEnd = row[9]
        # filter outliers
        if mediaWikiLoad == "NULL" or int(mediaWikiLoad) > 25000 or int(mediaWikiLoad) < 0:
            continue
        result = list()
        day = timestamp[0:8] + '0000'
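        # e.g. any timestamp starting with '20140205' collapses to the daily
        # bucket '201402050000' (illustrative date, not taken from the data)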
        result.append(day)
        result.append(country)
        result.append(domInteractive)
        result.append(mediaWikiLoad)
        result.append(dns)
        result.append(event_connectStart)
        result.append(event_connectEnd)
        result.append(requestStart)
        result.append(event_responseStart)
        result.append(responseEnd)
        print "{0},{1},{2},{3},{4},{5},{6},{7},{8},{9}".format(*result)


if __name__ == "__main__":
    main()

{
"width": 900,
"height": 500,
"viewport":[800,500],
"data": [
{
"name": "world",
"values": [
{"id":392, "rate": 328},
{"id":316, "rate": 0},
{"id":344, "rate": 93},
{"id":608, "rate": 301},
{"id":704, "rate": 218},
{"id":446, "rate": 0},
{"id":496, "rate": 0},
{"id":36, "rate": 329},
{"id":458, "rate": 307},
{"id":554, "rate": 261},
{"id":124, "rate": 0},
{"id":410, "rate": 353},
{"id":702, "rate": 289},
{"id":840, "rate": 41},
{"id":158, "rate": 235},
{"id":764, "rate": 120},
{"id":360, "rate": 385},
{"id":50, "rate": 0},
{"id":4, "rate": 0},
{"id":24, "rate": 0},
{"id":8, "rate": 0},
{"id":784, "rate": 0},
{"id":32, "rate": 0},
{"id":51, "rate": 0},
{"id":10, "rate": 0},
{"id":260, "rate": 0},
{"id":40, "rate": 0},
{"id":31, "rate": 0},
{"id":108, "rate": 0},
{"id":56, "rate": 0},
{"id":204, "rate": 0},
{"id":854, "rate": 0},
{"id":100, "rate": 0},
{"id":44, "rate": 0},
{"id":70, "rate": 0},
{"id":112, "rate": 0},
{"id":84, "rate": 0},
{"id":68, "rate": 0},
{"id":76, "rate": 0},
{"id":96, "rate": 0},
{"id":64, "rate": 0},
{"id":72, "rate": 0},
{"id":140, "rate": 0},
{"id":756, "rate": 0},
{"id":152, "rate": 0},
{"id":156, "rate": 0},
{"id":384, "rate": 0},
{"id":120, "rate": 0},
{"id":180, "rate": 0},
{"id":178, "rate": 0},
{"id":170, "rate": 0},
{"id":188, "rate": 0},
{"id":192, "rate": 0},
{"id":196, "rate": 0},
{"id":203, "rate": 0},
{"id":276, "rate": 0},
{"id":262, "rate": 0},
{"id":208, "rate": 0},
{"id":214, "rate": 0},
{"id":12, "rate": 0},
{"id":218, "rate": 0},
{"id":818, "rate": 0},
{"id":232, "rate": 0},
{"id":724, "rate": 0},
{"id":233, "rate": 0},
{"id":231, "rate": 0},
{"id":246, "rate": 0},
{"id":242, "rate": 0},
{"id":238, "rate": 0},
{"id":250, "rate": 0},
{"id":266, "rate": 0},
{"id":826, "rate": 0},
{"id":268, "rate": 0},
{"id":288, "rate": 0},
{"id":324, "rate": 0},
{"id":270, "rate": 0},
{"id":624, "rate": 0},
{"id":226, "rate": 0},
{"id":300, "rate": 0},
{"id":304, "rate": 0},
{"id":320, "rate": 0},
{"id":328, "rate": 0},
{"id":340, "rate": 0},
{"id":191, "rate": 0},
{"id":332, "rate": 0},
{"id":348, "rate": 0},
{"id":356, "rate": 0},
{"id":372, "rate": 0},
{"id":364, "rate": 0},
{"id":368, "rate": 0},
{"id":352, "rate": 0},
{"id":376, "rate": 0},
{"id":380, "rate": 0},
{"id":388, "rate": 0},
{"id":400, "rate": 0},
{"id":398, "rate": 0},
{"id":404, "rate": 0},
{"id":417, "rate": 0},
{"id":414, "rate": 0},
{"id":418, "rate": 0},
{"id":422, "rate": 0},
{"id":430, "rate": 0},
{"id":434, "rate": 0},
{"id":144, "rate": 0},
{"id":426, "rate": 0},
{"id":440, "rate": 0},
{"id":442, "rate": 0},
{"id":428, "rate": 0},
{"id":504, "rate": 0},
{"id":498, "rate": 0},
{"id":450, "rate": 0},
{"id":484, "rate": 0},
{"id":807, "rate": 0},
{"id":466, "rate": 0},
{"id":104, "rate": 0},
{"id":499, "rate": 0},
{"id":508, "rate": 0},
{"id":478, "rate": 0},
{"id":454, "rate": 0},
{"id":516, "rate": 0},
{"id":540, "rate": 0},
{"id":562, "rate": 0},
{"id":566, "rate": 0},
{"id":558, "rate": 0},
{"id":528, "rate": 0},
{"id":578, "rate": 0},
{"id":524, "rate": 0},
{"id":512, "rate": 0},
{"id":586, "rate": 0},
{"id":591, "rate": 0},
{"id":604, "rate": 0},
{"id":598, "rate": 0},
{"id":616, "rate": 0},
{"id":630, "rate": 0},
{"id":408, "rate": 0},
{"id":620, "rate": 0},
{"id":600, "rate": 0},
{"id":275, "rate": 0},
{"id":634, "rate": 0},
{"id":642, "rate": 0},
{"id":643, "rate": 0},
{"id":646, "rate": 0},
{"id":732, "rate": 0},
{"id":682, "rate": 0},
{"id":729, "rate": 0},
{"id":728, "rate": 0},
{"id":686, "rate": 0},
{"id":90, "rate": 0},
{"id":694, "rate": 0},
{"id":222, "rate": 0},
{"id":706, "rate": 0},
{"id":688, "rate": 0},
{"id":740, "rate": 0},
{"id":703, "rate": 0},
{"id":705, "rate": 0},
{"id":752, "rate": 0},
{"id":748, "rate": 0},
{"id":760, "rate": 0},
{"id":148, "rate": 0},
{"id":768, "rate": 0},
{"id":762, "rate": 0},
{"id":795, "rate": 0},
{"id":626, "rate": 0},
{"id":780, "rate": 0},
{"id":788, "rate": 0},
{"id":792, "rate": 0},
{"id":834, "rate": 0},
{"id":800, "rate": 0},
{"id":804, "rate": 0},
{"id":858, "rate": 0},
{"id":860, "rate": 0},
{"id":862, "rate": 0},
{"id":548, "rate": 0},
{"id":887, "rate": 0},
{"id":710, "rate": 0},
{"id":894, "rate": 0},
{"id":716, "rate": 0}
]
},
{
"name": "countries",
"url": "data/world-110m.json",
"format": {"type": "topojson", "feature": "countries"},
"transform": [
{
"type": "geopath", "value": "data", "projection": "winkel3",
"scale": 200,
"center": [40, 10]
},
{
"type": "zip",
"key": "data.id",
"with": "world",
"withKey": "data.id",
"as": "value",
"default": null
},
{"type":"filter", "test":"d.path!=null && d.value!=null"}
]
}
],
"legends": [
{
"title": "Improvement in Page Load Time (ms)",
"fill": "color",
"orient": "left",
"values": [40, 100, 150, 200, 250, 300, 350],
"offset": 0,
"properties": {
"title": {
"fontSize": {"value": 12}
},
"labels": {
"fontSize": {"value": 14}
},
"symbols": {
"stroke": {"value": "transparent"}
},
"legend": {
"y": {
"value": 380
},
"x": {
"value": 50
},
"stroke": {"value": "#ccc"},
"strokeWidth": {"value": 2}
}
}
}
],
"scales": [
{
"name": "color",
"type": "quantize",
"domain": [-50, 400],
"range": ["#e0e0e0","#c6dbef", "#9ecae1", "#6baed6",
"#4292c6", "#2171b5", "#08519c", "#08306b"]
}
],
"marks": [
{
"type": "path",
"from": {"data": "countries"},
"properties": {
"enter": { "path": {"field": "path"} },
"update": { "stroke": {"value": "#aaa"}, "fill": {"scale":"color", "field":"value.data.rate"} }
}
}
]
}