Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Dexterp37/f1e7ebc9d214b14bf4c154d0589c290c to your computer and use it in GitHub Desktop.
Save Dexterp37/f1e7ebc9d214b14bf4c154d0589c290c to your computer and use it in GitHub Desktop.
[1274975] missing clientId for main-pings
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# In[3]:
import datetime as dt
import matplotlib.pyplot as plt
import ujson as json
import pandas as pd
import numpy as np
import copy as cp
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history
# ### How many main pings with no client id the week before 1233986 landed?
# In[4]:
# Build-id and submission-date bounds for the week before the change landed.
# NOTE(review): both tuples look like inclusive (start, end) ranges -- confirm
# against the get_pings documentation.
build_ids = ("20160621000000", "20160628999999")
submission_dates = ("20160622", "20160628")
# `sc` is not defined in this script; presumably it is the SparkContext that
# the notebook environment injects.
main_pings = get_pings(
    sc,
    app="Firefox",
    channel="nightly",
    build_id=build_ids,
    submission_date=submission_dates,
    doc_type="main",
    schema="v4",
    fraction=1.0
)
# In[5]:
# Total number of main pings retrieved for this window.
main_pings_count = main_pings.count()
main_pings_count  # bare expression: echoed as the notebook cell's output
# In[6]:
# Keep only the two properties the rest of the analysis reads.
subset = get_pings_properties(
    main_pings,
    ["meta/clientId", "meta/submissionDate"]
)
# In[7]:
# Pings whose client id is absent or otherwise falsy.
main_pings_no_clientid = subset.filter(
    lambda ping: not ping.get("meta/clientId")
)
# In[8]:
no_clientid_count = main_pings_no_clientid.count()
no_clientid_count
# In[9]:
# Count pings per (clientId, submissionDate) pair: emit a 1 for every ping and
# sum the ones, first within each partition and then across partitions.
# NOTE(review): pings without a clientId are not excluded here, so they are all
# grouped under a None client key and feed into the statistics below -- confirm
# this is intended.
pings_per_client = (
    subset
    .map(lambda ping: (
        (ping.get("meta/clientId"), ping.get("meta/submissionDate")),
        1
    ))
    .aggregateByKey(
        0,
        lambda total, one: total + one,
        lambda left, right: left + right
    )
)
# Bring the per-key counts back to the driver and summarise their distribution.
ppc_df = pd.DataFrame(
    pings_per_client.values().collect()
)
ppc_df.describe()
# ### How many main pings with no client id after 1233986 landed?
# In[10]:
# Build-id and submission-date bounds for the period after the change landed.
# NOTE(review): both tuples look like inclusive (start, end) ranges -- confirm
# against the get_pings documentation.
build_ids = ("20160629000000", "20160705030222")
submission_dates = ("20160629", "20160706")
# `sc` is not defined in this script; presumably it is the SparkContext that
# the notebook environment injects.
main_pings = get_pings(
    sc,
    app="Firefox",
    channel="nightly",
    build_id=build_ids,
    submission_date=submission_dates,
    doc_type="main",
    schema="v4",
    fraction=1.0
)
# In[11]:
# Total number of main pings retrieved for this window.
main_pings_count = main_pings.count()
main_pings_count  # bare expression: echoed as the notebook cell's output
# In[12]:
# Keep only the two properties the rest of the analysis reads.
subset = get_pings_properties(
    main_pings,
    ["meta/clientId", "meta/submissionDate"]
)
# In[13]:
# Pings whose client id is absent or otherwise falsy.
main_pings_no_clientid = subset.filter(
    lambda ping: not ping.get("meta/clientId")
)
# In[14]:
no_clientid_count = main_pings_no_clientid.count()
no_clientid_count
# In[15]:
# Count pings per (clientId, submissionDate) pair: emit a 1 for every ping and
# sum the ones, first within each partition and then across partitions.
# NOTE(review): pings without a clientId are not excluded here, so they are all
# grouped under a None client key and feed into the statistics below -- confirm
# this is intended.
pings_per_client = (
    subset
    .map(lambda ping: (
        (ping.get("meta/clientId"), ping.get("meta/submissionDate")),
        1
    ))
    .aggregateByKey(
        0,
        lambda total, one: total + one,
        lambda left, right: left + right
    )
)
# Bring the per-key counts back to the driver and summarise their distribution.
ppc_df = pd.DataFrame(
    pings_per_client.values().collect()
)
ppc_df.describe()
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment