Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save katejim/c7ca9befa55992435741910f0deb3e4b to your computer and use it in GitHub Desktop.
Save katejim/c7ca9befa55992435741910f0deb3e4b to your computer and use it in GitHub Desktop.
Investigation of the high ratio of health ping clients
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# In[1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import pandas as pd
from plotly.graph_objs import *
from moztelemetry import get_pings_properties, get_one_ping_per_client
from moztelemetry.dataset import Dataset
from collections import Counter
import operator
from operator import itemgetter
get_ipython().magic(u'matplotlib inline')
# In[ ]:
# Select the "health" pings submitted on the beta channel from the telemetry
# source. `sc` is the SparkContext supplied by the notebook environment.
pings = (
    Dataset.from_source("telemetry")
    .where(docType='health', appUpdateChannel="beta")
    .records(sc, sample=1)
)
# Keep only the ping fields used by the analysis below and cache the result,
# since every later cell starts from `cachedData`.
cachedData = get_pings_properties(
    pings,
    [
        "creationDate",
        "payload/pingDiscardedForSize",
        "payload/sendFailure",
        "clientId",
        "meta/submissionDate",
        "payload/os",
        "payload/reason",
        "meta/geoCountry",
        "application/version",
    ],
).cache()
# In[ ]:
import matplotlib.dates as mdates
def plotlistofTuples(listOfTuples, title="", inColor='blue'):
    """Draw a bar chart from an iterable of ``(label, value)`` pairs.

    The chart is drawn on matplotlib figure 1, sized 15x7 inches, with one bar
    per pair and the labels written vertically under the bars. The caller is
    responsible for calling ``plt.show()``.

    Args:
        listOfTuples: iterable of 2-item sequences ``(label, value)``.
        title: text placed above the chart.
        inColor: bar colour passed to ``plt.bar``.
    """
    # Materialise once so that one-shot iterables (generators, zip objects)
    # and views can be measured with len() and traversed twice.
    pairs = list(listOfTuples)
    labels = [pair[0] for pair in pairs]
    heights = [pair[1] for pair in pairs]
    positions = range(len(pairs))

    plt.figure(1)
    fig = plt.gcf()
    fig.set_size_inches(15, 7)
    plt.title(title)
    # NOTE(review): the explicit tick labels set by plt.xticks() below look
    # like they supersede this date formatter -- confirm before removing it.
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
    plt.bar(positions, heights, align='center', color=inColor)
    plt.xticks(positions, labels, rotation=90)
# # Correlation between sendFailure count and Health ping reason
# Analyse only sendFailure, as discardedForSize accounts for only a small share of the problems.
# In[11]:
def getKey(dictionary, key):
    """Return ``dictionary[key]``, or ``None`` when the key is absent.

    Args:
        dictionary: mapping object exposing ``.get``.
        key: key to look up.
    """
    return dictionary.get(key)
# In[12]:
def distrReasonToFailureCount(data, key):
    """Sum one kind of send failure per health-ping reason.

    Args:
        data: collection of ping property dicts providing ``map``/``filter``
            (presumably a Spark RDD such as ``cachedData`` -- confirm). Each
            ping is read through its 'payload/reason' and
            'payload/sendFailure' entries.
        key: name of the failure counter to read from the sendFailure
            mapping, e.g. 'timeout'.

    Returns:
        A list of ``(reason, total)`` pairs, one per distinct reason that has
        at least one ping carrying the requested counter.
    """
    return (
        data.map(lambda p: (p['payload/reason'], p['payload/sendFailure']))
        # Pings without a sendFailure section contribute nothing.
        .filter(lambda pair: pair[1] is not None)
        .map(lambda pair: (pair[0], getKey(pair[1], key)))
        # Drop pings whose sendFailure section lacks this particular counter.
        .filter(lambda pair: pair[1] is not None)
        # Add the counters up per reason without first gathering every value
        # of a reason into one group.
        .reduceByKey(operator.add)
        .collect()
    )
# In[17]:
# 'eChannelOpen' send failures, summed per health-ping reason.
plotlistofTuples(distrReasonToFailureCount(cachedData, 'eChannelOpen'), title='eChannelOpen Failures distribution per reason')
plt.show()
# Most pings with sendFailure = "eChannelOpen" (can't open channel error) have reason "shutdown", possibly because the network is shut down before the client receives the shutdown signal.
# In[18]:
# 'eUnreachable' send failures, summed per health-ping reason; the title now spells the key the same way as the lookup.
plotlistofTuples(distrReasonToFailureCount(cachedData, 'eUnreachable'), title='eUnreachable Failures distribution per reason')
plt.show()
# In[20]:
# 'timeout' send failures, summed per health-ping reason.
plotlistofTuples(distrReasonToFailureCount(cachedData, 'timeout'), title='timeout Failures distribution per reason')
plt.show()
# # OS distribution Beta
# In[34]:
def osDistr(cachedData):
    """Plot and print how many pings were submitted per operating system.

    Args:
        cachedData: collection of ping property dicts providing
            ``map``/``filter``/``collect`` (presumably the cached Spark RDD
            built earlier in the notebook -- confirm). Each ping is read
            through its 'payload/os' entry, whose 'name' field is counted.
    """
    os_names = (
        cachedData.map(lambda p: p['payload/os'])
        # Skip pings that carry no OS section at all.
        .filter(lambda os_info: os_info is not None)
        .map(lambda os_info: os_info.get('name'))
    )
    # list() keeps the (name, count) pairs a real list on Python 3 as well,
    # so the printed output does not change between interpreter versions.
    result = list(Counter(os_names.collect()).items())
    plotlistofTuples(result)
    # Parenthesised form prints the same thing on Python 2 and Python 3.
    print(result)
# In[35]:
# Plot and print the per-OS ping counts for the cached beta health pings.
osDistr(cachedData)
# Most of the pings are submitted from Windows. If there are networking problems, they are probably connected to the OS in some way.
# # Client distribution by country
# In[4]:
def unique_by_key(elements, key=None):
    """Return the elements of *elements* with duplicates removed.

    Two elements are duplicates when ``key`` maps them to the same value;
    with no ``key`` the elements themselves are compared. When duplicates
    occur, the last one encountered is the one that is kept.

    Args:
        elements: iterable of items; the values used for comparison must be
            hashable.
        key: optional one-argument callable computing the comparison value.

    Returns:
        A list holding one element per distinct comparison value.
    """
    unique = {}
    for el in elements:
        # No key: the whole element must be unique.
        unique[el if key is None else key(el)] = el
    # list() gives callers a real list on Python 3 too, matching what
    # dict.values() already returned on Python 2.
    return list(unique.values())
# In[11]:
# Gather, for every geo country, the client ids of the pings seen from it.
date = (
    cachedData
    .map(lambda ping: (ping["meta/geoCountry"], ping["clientId"]))
    .groupByKey()
)
# Turn each group into (country, number of distinct client ids) and order the
# pairs by that number, smallest first.
result = date.map(lambda group: (group[0], len(set(group[1])))).collect()
result.sort(key=itemgetter(1))
# In[ ]:
# One value per country: the number of distinct clients counted in `result`.
pingsPerDaySeries = pd.Series([y for x, y in result])
# Use the 0.5 quantile (median) as the cut-off, as the variable name and the
# chart title state; the original passed 0.75 here.
quantile05 = pingsPerDaySeries.quantile(0.5)
# Plot only the countries whose client count lies above that cut-off.
plotlistofTuples([(x, y) for x, y in result if y > quantile05])
plt.title("Health ping users per country distribution (Plot represents only countries with users more than 0.5 quantile)")
# In[ ]:
# Cut-off at the 0.75 quantile of the per-country client counts.
quantile75 = pingsPerDaySeries.quantile(0.75)
# Filter the (country, client count) pairs held in `result`. The original
# referenced an undefined name and compared against the 0.5-quantile variable
# instead of the threshold computed on the line above.
plotlistofTuples([(x, y) for x, y in result if y > quantile75])
plt.title("Health ping users per country distribution (Plot represents only countries with users more than 0.75 quantile)")
# In[ ]:
# Most of the clients submitting health pings are from India (not the US),
# but it turns out that Beta currently has more users from India: https://sql.telemetry.mozilla.org/queries/27098/source
# In[ ]:
# Show one record of the selected pings as the cell output, to inspect the raw ping structure.
pings.first()
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment