Created
August 30, 2017 16:47
-
-
Save katejim/c7ca9befa55992435741910f0deb3e4b to your computer and use it in GitHub Desktop.
Investigation of the high ratio of health ping clients
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# In[1]: | |
import ujson as json | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import numpy as np | |
import plotly.plotly as py | |
import pandas as pd | |
from plotly.graph_objs import * | |
from moztelemetry import get_pings_properties, get_one_ping_per_client | |
from moztelemetry.dataset import Dataset | |
from collections import Counter | |
import operator | |
from operator import itemgetter | |
get_ipython().magic(u'matplotlib inline') | |
# In[ ]: | |
# Select pings with docType 'health' from the beta update channel.
# NOTE(review): `sc` is not defined in this file; presumably it is the
# SparkContext supplied by the notebook environment -- confirm.
pings = (
    Dataset.from_source("telemetry")
    .where(docType='health', appUpdateChannel="beta")
    .records(sc, sample=1)
)
# Keep only the listed properties of each ping and cache the result, because
# the cells below query it repeatedly.
cachedData = get_pings_properties(
    pings,
    [
        "creationDate",
        "payload/pingDiscardedForSize",
        "payload/sendFailure",
        "clientId",
        "meta/submissionDate",
        "payload/os",
        "payload/reason",
        "meta/geoCountry",
        "application/version",
    ]
).cache()
# In[ ]: | |
import matplotlib.dates as mdates | |
def plotlistofTuples(listOfTuples, title="", inColor='blue'):
    """Draw a bar chart of ``(label, value)`` pairs on pyplot figure 1.

    Args:
        listOfTuples: sized iterable of pairs; element 0 of each pair is used
            as the x tick label and element 1 as the bar height.
        title: title set on the plot.
        inColor: colour passed to ``plt.bar``.

    The chart is only drawn; the function does not call ``plt.show()``.
    """
    keys = [t[0] for t in listOfTuples]
    values = [t[1] for t in listOfTuples]
    positions = list(range(len(values)))

    fig = plt.figure(1)
    fig.set_size_inches(15, 7)
    plt.title(title)
    plt.bar(positions, values, align='center', color=inColor)
    # The tick labels are given explicitly, so no axis formatter is configured.
    plt.xticks(positions, keys, rotation=90)
# # Correlation between sendFailure count and Health ping reason | |
# Analyse only sendFailure, as discardedForSize accounts for only a small share of the problems
# In[11]: | |
def getKey(dictionary, key, default=None):
    """Return the value stored under ``key`` in ``dictionary``.

    Args:
        dictionary: mapping to look the key up in.
        key: key to look up.
        default: value returned when ``key`` is absent (``None`` by default).
    """
    return dictionary.get(key, default)
# In[12]: | |
def distrReasonToFailureCount(data, key):
    """Sum one sendFailure counter per ping reason.

    Args:
        data: RDD of ping dicts carrying 'payload/reason' and
            'payload/sendFailure' entries.
        key: name of the failure type to read from each ping's sendFailure
            mapping.

    Returns:
        A list of ``(reason, total)`` tuples. Pings whose sendFailure is
        ``None``, or whose sendFailure has no value for ``key``, are ignored.
    """
    return (
        data.map(lambda p: (p['payload/reason'], p['payload/sendFailure']))
        .filter(lambda pair: pair[1] is not None)
        .map(lambda pair: (pair[0], getKey(pair[1], key)))
        .filter(lambda pair: pair[1] is not None)
        # Add the counts up per reason without first collecting every value
        # of a reason into one group.
        .reduceByKey(operator.add)
        .collect()
    )
# In[17]: | |
# Bar chart of the summed 'eChannelOpen' sendFailure counts per ping reason.
plotlistofTuples(
    distrReasonToFailureCount(cachedData, 'eChannelOpen'),
    title='eChannelOpen Failures distribution per reason'
)
plt.show()
# Most pings with sendFailure = "eChannelOpen" (can't open channel error) have reason "shutdown", which could be because the network is shut down before the client side gets the shutdown signal.
# In[18]: | |
# Bar chart of the summed 'eUnreachable' sendFailure counts per ping reason.
# The title names the same failure key that is queried.
plotlistofTuples(
    distrReasonToFailureCount(cachedData, 'eUnreachable'),
    title='eUnreachable Failures distribution per reason'
)
plt.show()
# In[20]: | |
# Bar chart of the summed 'timeout' sendFailure counts per ping reason.
plotlistofTuples(
    distrReasonToFailureCount(cachedData, 'timeout'),
    title='timeout Failures distribution per reason'
)
plt.show()
# # OS distribution Beta | |
# In[34]: | |
def osDistr(cachedData):
    """Plot and print the number of pings per operating-system name.

    Args:
        cachedData: RDD of ping dicts with a 'payload/os' entry; pings whose
            'payload/os' is ``None`` are skipped.

    The OS name is read from the 'name' field of 'payload/os'. The resulting
    ``(name, count)`` pairs are drawn with ``plotlistofTuples`` and printed.
    """
    names = (
        cachedData.map(lambda p: p['payload/os'])
        .filter(lambda p: p is not None)
        .map(lambda p: p.get('name'))
    )
    # Materialize the pairs as a list so they plot and print the same way on
    # Python 2 and Python 3.
    result = list(Counter(names.collect()).items())
    plotlistofTuples(result)
    print(result)
# In[35]: | |
# Plot and print the per-OS ping counts for the cached beta health pings.
osDistr(cachedData)
# Most of the pings are submitted from Windows. If we have networking problems, they are probably somehow connected to the OS.
# # Client distribution by country
# In[4]: | |
def unique_by_key(elements, key=None):
    """Return ``elements`` with duplicates removed, as a list.

    Args:
        elements: iterable of items to de-duplicate.
        key: optional one-argument function computing the (hashable) identity
            of an item. When omitted, the item itself is its identity, so the
            whole element must be unique.

    Returns:
        A list holding one element per distinct identity. When several
        elements share an identity, the last one encountered is kept.
    """
    unique = {}
    for el in elements:
        unique[el if key is None else key(el)] = el
    return list(unique.values())
# In[11]: | |
# Group the client ids of all pings by the ping's country.
date = cachedData.map(lambda p: (p["meta/geoCountry"], p["clientId"])).groupByKey()
# (country, number of distinct client ids) pairs, sorted by ascending count.
result = sorted(
    date.map(lambda p: (p[0], len(set(p[1])))).collect(),
    key=itemgetter(1)
)
# In[ ]: | |
# Distinct-client counts per country, as a Series for the quantile computation.
pingsPerDaySeries = pd.Series([y for x, y in result])
# Median of the per-country counts, matching the variable name and the title.
quantile05 = pingsPerDaySeries.quantile(0.5)
plotlistofTuples([(x, y) for x, y in result if y > quantile05])
# Set after plotting because plotlistofTuples sets its own (empty) title.
plt.title("Health ping users per country distribution (Plot represents only conuntries with uusers more that 0.5 quantile)")
# In[ ]: | |
# Upper quartile of the per-country distinct-client counts.
quantile75 = pingsPerDaySeries.quantile(0.75)
# Plot only the countries whose count exceeds that quartile, taken from the
# per-country `result` list.
plotlistofTuples([(x, y) for x, y in result if y > quantile75])
# Set after plotting because plotlistofTuples sets its own (empty) title.
plt.title("Health ping users per country distribution (Plot represents only conuntries with uusers more that 0.75 quantile)")
# In[ ]: | |
# Most of the clients submitting health pings are from India (not the US),
# but it turns out that currently in Beta we have more users from India. https://sql.telemetry.mozilla.org/queries/27098/source
# In[ ]: | |
# Fetch the first record of `pings` (presumably to inspect a raw ping).
pings.first()
# In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment