Last active
July 7, 2016 14:31
-
-
Save Dexterp37/f1e7ebc9d214b14bf4c154d0589c290c to your computer and use it in GitHub Desktop.
[1274975] missing clientId for main-pings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# In[3]: | |
import copy as cp
import datetime as dt
from operator import add

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ujson as json
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history
# ### How many main pings with no client id the week before 1233986 landed? | |
# In[4]: | |
# Selection window for the "before 1233986 landed" half of the comparison.
# Both tuples are passed straight to get_pings; presumably it treats each
# tuple as a (start, end) range -- confirm against the moztelemetry docs.
build_ids = ("20160621000000", "20160628999999")
submission_dates = ("20160622", "20160628")
main_pings = get_pings(
    sc,
    app="Firefox",
    channel="nightly",
    build_id=build_ids,
    submission_date=submission_dates,
    doc_type="main",
    schema="v4",
    fraction=1.0,
)

# In[5]:

# Total number of main pings selected above.
main_pings_count = main_pings.count()
main_pings_count

# In[6]:

# Keep only the two fields this analysis reads.
subset = get_pings_properties(
    main_pings,
    ["meta/clientId", "meta/submissionDate"],
)

# In[7]:

# A ping counts as "no client id" when the field is absent, None or empty.
main_pings_no_clientid = subset.filter(lambda p: not p.get("meta/clientId"))

# In[8]:

no_clientid_count = main_pings_no_clientid.count()
no_clientid_count

# In[9]:

# Count pings per (clientId, submissionDate) pair, then summarise the
# distribution of those counts.
# NOTE(review): pings without a clientId all share the key (None, date), so
# they appear as one large group in the statistics -- confirm this is intended.
pings_per_client = subset.map(
    lambda p: ((p.get("meta/clientId"), p.get("meta/submissionDate")), 1)
).aggregateByKey(0, add, add)
ppc_df = pd.DataFrame(pings_per_client.values().collect())
ppc_df.describe()
# ### How many main pings with no client id after 1233986 landed?
# In[10]: | |
# Selection window for the "after 1233986 landed" half of the comparison.
# Both tuples are passed straight to get_pings; presumably it treats each
# tuple as a (start, end) range -- confirm against the moztelemetry docs.
build_ids = ("20160629000000", "20160705030222")
submission_dates = ("20160629", "20160706")
main_pings = get_pings(
    sc,
    app="Firefox",
    channel="nightly",
    build_id=build_ids,
    submission_date=submission_dates,
    doc_type="main",
    schema="v4",
    fraction=1.0,
)

# In[11]:

# Total number of main pings selected above.
main_pings_count = main_pings.count()
main_pings_count

# In[12]:

# Keep only the two fields this analysis reads.
subset = get_pings_properties(
    main_pings,
    ["meta/clientId", "meta/submissionDate"],
)

# In[13]:

# A ping counts as "no client id" when the field is absent, None or empty.
main_pings_no_clientid = subset.filter(lambda p: not p.get("meta/clientId"))

# In[14]:

no_clientid_count = main_pings_no_clientid.count()
no_clientid_count

# Count how many pings per client we're seeing.

# In[15]:

# Pings are grouped per (clientId, submissionDate) pair before summarising.
# NOTE(review): pings without a clientId all share the key (None, date), so
# they appear as one large group in the statistics -- confirm this is intended.
pings_per_client = subset.map(
    lambda p: ((p.get("meta/clientId"), p.get("meta/submissionDate")), 1)
).aggregateByKey(0, add, add)
ppc_df = pd.DataFrame(pings_per_client.values().collect())
ppc_df.describe()
# In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment