Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save georgf/372b442487e1752081c75314d276203d to your computer and use it in GitHub Desktop.
Save georgf/372b442487e1752081c75314d276203d to your computer and use it in GitHub Desktop.
Current histogram & scalar payload sizes
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# In[1]:
import datetime as dt
import ujson as json
import pandas as pd
import numpy as np
import copy as cp
import matplotlib.pyplot as plt
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
from moztelemetry.dataset import Dataset
get_ipython().magic(u'matplotlib inline')
# # Load ping data
# Get a sample of main pings submitted in April 2018 from Firefox Nightly (61) and Release (59).
# In[2]:
dataset = Dataset.from_source('telemetry')
# In[3]:
dataset.schema
# In[4]:
# Nightly: main pings from Firefox 61.x submitted in 2018-04, sampled at 10%.
# NOTE(review): `sc` is not defined in this file; presumably the SparkContext
# provided by the notebook environment -- confirm.
records_nightly = (
    Dataset.from_source('telemetry')
    .where(docType='main')
    .where(appUpdateChannel='nightly')
    .where(submissionDate=lambda x: x.startswith('201804'))
    .where(appVersion=lambda x: x.startswith('61.'))
    .records(sc, sample=0.1)
)
records_nightly.count()
# In[5]:
# Release: main pings from Firefox 59.x submitted in 2018-04, sampled at 1%.
records_release = (
    Dataset.from_source('telemetry')
    .where(docType='main')
    .where(appUpdateChannel='release')
    .where(submissionDate=lambda x: x.startswith('201804'))
    .where(appVersion=lambda x: x.startswith('59.'))
    .records(sc, sample=0.01)
)
records_release.count()
# # Helper functions
# Define helpers that extract the histogram and scalar sections from each ping and measure the length of their JSON serialization.
# In[6]:
def get_from_ping(ping, path):
    """Look up a nested value in ``ping`` by a "/"-separated key path.

    Args:
        ping: Nested container (typically dicts) to walk.
        path: Keys joined by "/", e.g. "payload/processes/content/scalars".

    Returns:
        The value found at ``path``, or ``None`` when any step of the path
        is missing or an intermediate value cannot be indexed by the key.
    """
    node = ping
    try:
        for key in path.split("/"):
            node = node[key]
    except (KeyError, IndexError, TypeError):
        # TypeError covers intermediate values that cannot be subscripted
        # with a string key (e.g. None, a number or a list).
        return None
    return node
def extract_fields_size(ping):
    """Measure the JSON-serialized length of each histogram/scalar section.

    Args:
        ping: A ping, as accepted by ``get_from_ping``.

    Returns:
        A list of ``(field_path, json_length)`` tuples, one per section that
        is present and non-empty, in the order of ``field_list``.
    """
    field_list = [
        "payload/histograms",
        "payload/keyedHistograms",
        "payload/processes/content/histograms",
        "payload/processes/content/keyedHistograms",
        "payload/processes/parent/scalars",
        "payload/processes/parent/keyedScalars",
        "payload/processes/content/scalars",
        "payload/processes/content/keyedScalars",
    ]
    # Build a tuple (field_name, json_field_size) for each field.
    # The ping is only read here, never modified, so no defensive copy is
    # made, and each section is looked up exactly once.
    sections = ((name, get_from_ping(ping, name)) for name in field_list)
    return [(name, len(json.dumps(value))) for name, value in sections if value]
def get_payload_size(ping):
    """Return the combined JSON size of all measured sections of ``ping``.

    This is the sum of the per-field sizes reported by
    ``extract_fields_size``; a ping with none of the sections yields 0.
    """
    total = 0
    for _field, size in extract_fields_size(ping):
        total += size
    return total
# In[7]:
# Spot-check the helpers on a single record from each channel.
# Per-field JSON sizes for the first nightly record.
extract_fields_size(records_nightly.first())
# In[8]:
# Total size of the measured sections for the first nightly record.
get_payload_size(records_nightly.first())
# In[9]:
# Per-field JSON sizes for the first release record.
extract_fields_size(records_release.first())
# In[10]:
# Total size of the measured sections for the first release record.
get_payload_size(records_release.first())
# # Get Nightly size data
# In[12]:
# Compute the measured payload size of every sampled nightly ping and collect
# the results into a pandas Series.
sizes_nightly = pd.Series(records_nightly.map(get_payload_size).collect())
# In[13]:
# Summary statistics; sizes are divided by 1024 before summarizing.
(sizes_nightly / 1024).describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999])
# In[26]:
# Start a fresh figure: Series.hist() draws on the current figure, so without
# this the plot could share axes with a figure that is still open when the
# file is run as a plain script rather than cell by cell.
plt.figure()
(sizes_nightly / 1024).hist()
plt.xlabel('Size in kb')
plt.ylabel('Frequency')
plt.title('Nightly histogram & scalar payload size')
# # Get Release size data
# In[15]:
# Compute the measured payload size of every sampled release ping and collect
# the results into a pandas Series.
sizes_release = pd.Series(records_release.map(get_payload_size).collect())
# In[16]:
# Summary statistics; sizes are divided by 1024 before summarizing.
(sizes_release / 1024).describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999])
# In[25]:
# Start a fresh figure: Series.hist() draws on the current figure, so without
# this the release histogram would be drawn onto an earlier plot's axes when
# the file is run as a plain script rather than cell by cell.
plt.figure()
(sizes_release / 1024).hist()
plt.xlabel('Size in kb')
plt.ylabel('Frequency')
plt.title('Release histogram & scalar payload size')
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment