chutten/first stacks.ipynb Secret

## first stacks.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              first stacks.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## first stacks.py

# coding: utf-8

# ### When a user crashes for the first time, what is that crash?

# In[1]:

import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import requests

from plotly.graph_objs import *
from moztelemetry import get_pings_properties, get_one_ping_per_client
from moztelemetry.dataset import Dataset

get_ipython().magic(u'matplotlib inline')


# In[2]:

sc.defaultParallelism


# In[3]:

# Select crash pings from the nightly channel submitted between 2017-01-01
# and 2017-01-12 (inclusive; submission dates are compared as YYYYMMDD text).
# The channel filter must be an equality test: `c in ('nightly')` compares
# against a plain string (the parentheses do not make a tuple), which is a
# substring test that would also accept values such as '' or 'night'.
pings = (
    Dataset.from_source("telemetry")
    .where(docType='crash')
    .where(appUpdateChannel=lambda c: c == 'nightly')
    .where(submissionDate=lambda d: "20170101" <= d <= "20170112")
    .records(sc, sample=1)
)


# In[4]:

# Project each ping down to the four properties used below and discard
# pings whose "payload/stackTraces" property is None.
subset = (
    get_pings_properties(
        pings,
        [
            "clientId",
            "submissionDate",
            "payload/stackTraces",
            "environment/profile/creationDate",
        ],
    )
    .filter(lambda ping: ping["payload/stackTraces"] is not None)
)


# In[5]:

# Notebook cell: number of pings that carry stack traces.
subset.count()


# First we need to filter out any profile older than our survey period.

# In[6]:

import datetime

# Start of the survey window expressed as whole days since 1970-01-01.
# The profile creation date is compared against this number directly, so it
# is presumably stored in the same unit -- TODO confirm against the ping schema.
survey_begin = (datetime.datetime(2017, 1, 1) - datetime.datetime(1970, 1, 1)).days

# Keep only profiles created on or after the start of the survey. Pings with
# no creation date are dropped explicitly: comparing None with an int only
# "works" through Python 2's arbitrary ordering and raises TypeError on
# Python 3.
subset = subset.filter(
    lambda p: p["environment/profile/creationDate"] is not None
    and p["environment/profile/creationDate"] >= survey_begin
)


# In[7]:

# Notebook cell: number of pings left after the profile-age filter.
subset.count()


# So now we have a survey subset of pings with stack information. We want to get the first crash for each client reporting a crash, so...

# In[8]:

# Key every ping by its client id, keep for each client the ping with the
# smallest "submissionDate" (on a tie the right-hand ping wins), then drop
# the key again so only the pings remain.
firsts = (
    subset
    .map(lambda ping: (ping["clientId"], ping))
    .reduceByKey(
        lambda left, right:
            left if left["submissionDate"] < right["submissionDate"] else right
    )
    .map(lambda keyed: keyed[1])
)


# In[9]:

# Notebook cell: number of clients with a first crash in the survey.
firsts.count()


# In[10]:

def symbolicate(s, timeout=60):
    """Symbolicate the crashing thread of a stack-trace payload.

    Builds a version-4 request from ``s``, posts it to
    http://symbolapi.mozilla.org/ and returns the service's answer.

    Args:
        s: Stack-trace dict of a crash ping. The keys read here are
            ``modules`` (each entry with ``base_addr``, ``debug_file`` and
            ``debug_id``), ``threads`` (each entry with ``frames`` whose
            items carry ``ip`` and ``module_index``) and
            ``crash_info['crashing_thread']``, the index into ``threads``.
        timeout: Seconds to wait for the service, passed to
            ``requests.post``. Without a timeout a stalled server would
            block the calling task forever.

    Returns:
        The ``symbolicatedStacks`` entry of the JSON response.

    Raises:
        KeyError: An expected key is absent, e.g. a frame that has no
            ``module_index``.
        requests.exceptions.RequestException: Network failure, timeout, or
            a non-success HTTP status.
        ValueError: The response body is not valid JSON.
    """
    modules = s['modules']
    crashing_thread = s['threads'][s['crash_info']['crashing_thread']]

    # Addresses are hexadecimal strings. Each frame is sent as
    # [module index, offset of the instruction pointer from the module base].
    frames = [
        [
            f['module_index'],
            int(f['ip'], 16) - int(modules[f['module_index']]['base_addr'], 16),
        ]
        for f in crashing_thread['frames']
    ]

    # Spaces and parentheses are removed from debug file names. The table is
    # built once rather than once per module.
    strip_chars = dict((ord(char), None) for char in ' ()')
    memory_map = [
        [m['debug_file'].translate(strip_chars), m['debug_id']]
        for m in modules
    ]

    data = json.dumps({
        'stacks': [frames],
        'memoryMap': memory_map,
        'version': 4,
    })
    result = requests.post('http://symbolapi.mozilla.org/', data=data, timeout=timeout)
    # Surface HTTP failures as HTTP errors. Otherwise an error page with a
    # non-JSON body only shows up as an opaque decode error such as
    # "Expecting value: line 1 column 1 (char 0)".
    result.raise_for_status()
    return result.json()['symbolicatedStacks']


# In[11]:

def safe_symbolicate(s):
    """Symbolicate ``s``, returning the failure instead of raising it.

    Args:
        s: Stack-trace dict passed straight through to ``symbolicate``.

    Returns:
        Whatever ``symbolicate`` returns, or the ``Exception`` instance it
        raised. Returning the exception lets one bad ping show up in the
        results rather than aborting the whole job.
    """
    try:
        return symbolicate(s)
    # `except Exception as e` is accepted by Python 2.6+ and Python 3; the
    # comma form `except Exception, e` is a SyntaxError on Python 3.
    except Exception as e:
        return e


# In[12]:

# Pair each first-crash ping's profile creation date with the result of
# safe_symbolicate() on its stack traces (a symbolicated stack, or the
# exception that symbolication raised).
symbolicated_firsts = firsts.map(
    lambda ping: (
        ping["environment/profile/creationDate"],
        safe_symbolicate(ping["payload/stackTraces"]),
    )
)


# In[13]:

# Notebook cell: peek at three results.
symbolicated_firsts.take(3)


# In[18]:

# Count how often each symbolication result occurs, using its str() form
# as the key.
stack_counts = (
    symbolicated_firsts
    .map(lambda item: (str(item[1]), 1))
    .countByKey()
)


# In[19]:

import operator

# (stack text, count) pairs ordered by count, most frequent first.
sorted_stack_counts = list(stack_counts.items())
sorted_stack_counts.sort(key=operator.itemgetter(1), reverse=True)


# In[20]:

# Notebook cell: the ten most common first-crash stacks.
sorted_stack_counts[:10]


# Seeing a lot of shutdown crashes (RunWatchdog) which is exactly what I'd expect. Nice validation there.
#
# Things to do:
# * Figure out where `'Expecting value: line 1 column 1 (char 0)'` is coming from. It comes from a JSON library somewhere, but it is unclear whether it is triggered by the request, by the response, or by the server itself.
# * Determine whether there is anything we can do with frames that are missing a module index (JIT frames, maybe?).

	# coding: utf-8

	# ### When a user crashes for the first time, what is that crash?

	# In[1]:

	import ujson as json
	import matplotlib.pyplot as plt
	import pandas as pd
	import numpy as np
	import plotly.plotly as py
	import requests

	from plotly.graph_objs import *
	from moztelemetry import get_pings_properties, get_one_ping_per_client
	from moztelemetry.dataset import Dataset

	get_ipython().magic(u'matplotlib inline')


	# In[2]:

	sc.defaultParallelism


	# In[3]:

	# Select crash pings from the nightly channel submitted between 2017-01-01
	# and 2017-01-12 (inclusive; submission dates are compared as YYYYMMDD text).
	# The channel filter must be an equality test: `c in ('nightly')` compares
	# against a plain string (the parentheses do not make a tuple), which is a
	# substring test that would also accept values such as '' or 'night'.
	pings = (
		Dataset.from_source("telemetry")
		.where(docType='crash')
		.where(appUpdateChannel=lambda c: c == 'nightly')
		.where(submissionDate=lambda d: "20170101" <= d <= "20170112")
		.records(sc, sample=1)
	)


	# In[4]:

	# Project each ping down to the four properties used below and discard
	# pings whose "payload/stackTraces" property is None.
	subset = (
		get_pings_properties(
			pings,
			[
				"clientId",
				"submissionDate",
				"payload/stackTraces",
				"environment/profile/creationDate",
			],
		)
		.filter(lambda ping: ping["payload/stackTraces"] is not None)
	)


	# In[5]:

	# Notebook cell: number of pings that carry stack traces.
	subset.count()


	# First we need to filter out any profile older than our survey period.

	# In[6]:

	import datetime

	# Start of the survey window expressed as whole days since 1970-01-01.
	# The profile creation date is compared against this number directly, so it
	# is presumably stored in the same unit -- TODO confirm against the ping schema.
	survey_begin = (datetime.datetime(2017, 1, 1) - datetime.datetime(1970, 1, 1)).days

	# Keep only profiles created on or after the start of the survey. Pings with
	# no creation date are dropped explicitly: comparing None with an int only
	# "works" through Python 2's arbitrary ordering and raises TypeError on
	# Python 3.
	subset = subset.filter(
		lambda p: p["environment/profile/creationDate"] is not None
		and p["environment/profile/creationDate"] >= survey_begin
	)


	# In[7]:

	# Notebook cell: number of pings left after the profile-age filter.
	subset.count()


	# So now we have a survey subset of pings with stack information. We want to get the first crash for each client reporting a crash, so...

	# In[8]:

	# Key every ping by its client id, keep for each client the ping with the
	# smallest "submissionDate" (on a tie the right-hand ping wins), then drop
	# the key again so only the pings remain.
	firsts = (
		subset
		.map(lambda ping: (ping["clientId"], ping))
		.reduceByKey(
			lambda left, right:
				left if left["submissionDate"] < right["submissionDate"] else right
		)
		.map(lambda keyed: keyed[1])
	)


	# In[9]:

	# Notebook cell: number of clients with a first crash in the survey.
	firsts.count()


	# In[10]:

	def symbolicate(s, timeout=60):
		"""Symbolicate the crashing thread of a stack-trace payload.

		Builds a version-4 request from ``s``, posts it to
		http://symbolapi.mozilla.org/ and returns the service's answer.

		Args:
			s: Stack-trace dict of a crash ping. The keys read here are
				``modules`` (each entry with ``base_addr``, ``debug_file`` and
				``debug_id``), ``threads`` (each entry with ``frames`` whose
				items carry ``ip`` and ``module_index``) and
				``crash_info['crashing_thread']``, the index into ``threads``.
			timeout: Seconds to wait for the service, passed to
				``requests.post``. Without a timeout a stalled server would
				block the calling task forever.

		Returns:
			The ``symbolicatedStacks`` entry of the JSON response.

		Raises:
			KeyError: An expected key is absent, e.g. a frame that has no
				``module_index``.
			requests.exceptions.RequestException: Network failure, timeout, or
				a non-success HTTP status.
			ValueError: The response body is not valid JSON.
		"""
		modules = s['modules']
		crashing_thread = s['threads'][s['crash_info']['crashing_thread']]

		# Addresses are hexadecimal strings. Each frame is sent as
		# [module index, offset of the instruction pointer from the module base].
		frames = [
			[
				f['module_index'],
				int(f['ip'], 16) - int(modules[f['module_index']]['base_addr'], 16),
			]
			for f in crashing_thread['frames']
		]

		# Spaces and parentheses are removed from debug file names. The table is
		# built once rather than once per module.
		strip_chars = dict((ord(char), None) for char in ' ()')
		memory_map = [
			[m['debug_file'].translate(strip_chars), m['debug_id']]
			for m in modules
		]

		data = json.dumps({
			'stacks': [frames],
			'memoryMap': memory_map,
			'version': 4,
		})
		result = requests.post('http://symbolapi.mozilla.org/', data=data, timeout=timeout)
		# Surface HTTP failures as HTTP errors. Otherwise an error page with a
		# non-JSON body only shows up as an opaque decode error such as
		# "Expecting value: line 1 column 1 (char 0)".
		result.raise_for_status()
		return result.json()['symbolicatedStacks']


	# In[11]:

	def safe_symbolicate(s):
		"""Symbolicate ``s``, returning the failure instead of raising it.

		Args:
			s: Stack-trace dict passed straight through to ``symbolicate``.

		Returns:
			Whatever ``symbolicate`` returns, or the ``Exception`` instance it
			raised. Returning the exception lets one bad ping show up in the
			results rather than aborting the whole job.
		"""
		try:
			return symbolicate(s)
		# `except Exception as e` is accepted by Python 2.6+ and Python 3; the
		# comma form `except Exception, e` is a SyntaxError on Python 3.
		except Exception as e:
			return e


	# In[12]:

	# Pair each first-crash ping's profile creation date with the result of
	# safe_symbolicate() on its stack traces (a symbolicated stack, or the
	# exception that symbolication raised).
	symbolicated_firsts = firsts.map(
		lambda ping: (
			ping["environment/profile/creationDate"],
			safe_symbolicate(ping["payload/stackTraces"]),
		)
	)


	# In[13]:

	# Notebook cell: peek at three results.
	symbolicated_firsts.take(3)


	# In[18]:

	# Count how often each symbolication result occurs, using its str() form
	# as the key.
	stack_counts = (
		symbolicated_firsts
		.map(lambda item: (str(item[1]), 1))
		.countByKey()
	)


	# In[19]:

	import operator

	# (stack text, count) pairs ordered by count, most frequent first.
	sorted_stack_counts = list(stack_counts.items())
	sorted_stack_counts.sort(key=operator.itemgetter(1), reverse=True)


	# In[20]:

	# Notebook cell: the ten most common first-crash stacks.
	sorted_stack_counts[:10]


	# Seeing a lot of shutdown crashes (RunWatchdog) which is exactly what I'd expect. Nice validation there.
	#
	# Things to do:
	# * Figure out where `'Expecting value: line 1 column 1 (char 0)'` is coming from. It comes from a JSON library somewhere, but it is unclear whether it is triggered by the request, by the response, or by the server itself.
	# * Determine whether there is anything we can do with frames that are missing a module index (JIT frames, maybe?).