
Saptarshi Guha saptarshiguha

@saptarshiguha
saptarshiguha / ravro.avsc
Created March 14, 2011 02:17
Testing AVSC File
{
  "name" : "rObject",
  "type" : "record",
  "namespace" : "org.godhuli.ravro",
  "fields" : [
    {
      "name" : "data",
      "type" : [
        "null"
      ]
    }
  ]
}
saptarshiguha / gist:786d67951da6679a1cc5
Created October 21, 2014 18:00
building an R distribution
buildingR <- function(excludeLibs = c(), exclude = NULL, iterate = TRUE, verbose = 1,
                      nameof = "Rfolder-test", destpath = sprintf("/user/%s/", USER)) {
  library(Rhipe)
  rhinit()
  local({
    tfolder <- sprintf("%s/Rdist", tempdir())
    ## delete the folder if it already exists, then recreate it
    unlink(tfolder, recursive = TRUE)
    dir.create(tfolder)
    execu <- if ("package:Rhipe" %in% search()) rhoptions()$RhipeMapReduce else sprintf("/home/%s/software/R_LIBS/Rhipe/bin/RhipeMapReduce", USER)
    ## execu <- if ("package:Rhipe" %in% search()) rhoptions()$Rhipe else sprintf("/home/%s/software/R_LIBS/Rhipe/libs/Rhipe.so", USER)
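The helper above follows a common packaging pattern: stage the pieces of the distribution into a fresh temporary folder, then ship the folder as one archive. A minimal Python sketch of that staging step, with illustrative names (this is not Rhipe's API):

```python
import os
import shutil
import tempfile

def stage_and_archive(files, name="Rfolder-test"):
    """Copy `files` ({archive-relative name: contents}) into a fresh
    Rdist staging folder, then pack it into a single .zip archive."""
    root = tempfile.mkdtemp()
    tfolder = os.path.join(root, "Rdist")
    # delete the staging folder if it exists, then recreate it
    shutil.rmtree(tfolder, ignore_errors=True)
    os.makedirs(tfolder)
    for relname, contents in files.items():
        with open(os.path.join(tfolder, relname), "w") as fh:
            fh.write(contents)
    # make_archive appends the ".zip" suffix and returns the archive path
    return shutil.make_archive(os.path.join(root, name), "zip", tfolder)

archive = stage_and_archive({"DESCRIPTION": "Package: demo\n"})
print(os.path.basename(archive))  # Rfolder-test.zip
```

The R original pushes the result to HDFS instead of leaving it on local disk; the staging logic is the same either way.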
saptarshiguha / Tag Days With Version
Created April 30, 2015 00:18
Tagging Days with Version Information
tagDaysByVersion <- function(d){
  ## Takes d$data$days and returns a modified data$days
  ## with version info attached as a field.
  ## I've not done much error checking here;
  ## if you get errors, notify me.
  if (length(d$data$days) == 0) return(d$data$days)
  days <- d$data$days[order(names(d$data$days))]
  dates <- names(days)
  dversion <- rep(NA, length(dates))
  iversion <- NA
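The gist is truncated, but the setup (days sorted by date, a per-day version vector initialized to `NA`, a running `iversion`) suggests a carry-forward scan: walk the days in date order and tag each one with the most recently seen version. A hedged Python sketch of that idea, with an illustrative data layout:

```python
# Carry-forward version tagging: each day gets the last version observed on
# or before it; days before any version report stay None (the NA case).
# The {"version": ...} layout is an assumption for this sketch.
def tag_days_with_version(days):
    tagged = {}
    current = None
    for date in sorted(days):
        info = days[date].get("version")
        if info is not None:
            current = info      # a day that reports a version updates the tag
        tagged[date] = current  # otherwise the previous version carries forward
    return tagged

print(tag_days_with_version({
    "20150101": {"version": None},
    "20150102": {"version": "37.0"},
    "20150103": {"version": None},
}))  # {'20150101': None, '20150102': '37.0', '20150103': '37.0'}
```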
{"nbformat_minor": 0, "cells": [{"source": "##Hello's Hello World\nA brief attempt to get data for LOOP_SHARING_STATE_CHANGE and LOOP_TWO_WAY_MEDIA_CONN_LENGTH. See this bug] for more details. You need to go to http://telemetry-dash.mozilla.org/, sign in with Persona and create a Spark Cluster. It will take time to start, but the landing page will have details on how to get to this notebook.", "cell_type": "markdown", "metadata": {}}, {"execution_count": 1, "cell_type": "code", "source": "import ujson as json\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport plotly.plotly as py\nfrom moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history\n%pylab inline", "outputs": [{"output_type": "stream", "name": "stdout", "text": "Populating the interactive namespace from numpy and matplotlib\n"}], "metadata": {"collapsed": false, "trusted": true}}, {"source": "## Histograms: LOOP_TWO_WAY_MEDIA_CONN_LENGTH_1 and LOOP_SHARING_STATE_CHANGE_1\nLet's f
{"nbformat_minor": 0, "cells": [{"source": "##Hello's Hello World\nA brief attempt to get data for LOOP_SHARING_STATE_CHANGE and LOOP_TWO_WAY_MEDIA_CONN_LENGTH. See this bug] for more details. You need to go to http://telemetry-dash.mozilla.org/, sign in with Persona and create a Spark Cluster. It will take time to start, but the landing page will have details on how to get to this notebook.", "cell_type": "markdown", "metadata": {}}, {"execution_count": 1, "cell_type": "code", "source": "import ujson as json\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport plotly.plotly as py\nfrom moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history\n%pylab inline", "outputs": [{"output_type": "stream", "name": "stdout", "text": "Populating the interactive namespace from numpy and matplotlib\n"}], "metadata": {"collapsed": false, "trusted": true}}, {"source": "## Histograms: LOOP_TWO_WAY_MEDIA_CONN_LENGTH_1 and LOOP_SHARING_STATE_CHANGE_1\nLet's f
{"nbformat_minor": 0, "cells": [{"execution_count": 3, "cell_type": "code", "source": "import ujson as json\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport plotly.plotly as py\nfrom moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history\n%pylab inline\n## see bug ", "outputs": [{"output_type": "stream", "name": "stdout", "text": "Populating the interactive namespace from numpy and matplotlib\n"}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 4, "cell_type": "code", "source": "import datetime\nff39release = datetime.datetime(2015, 6, 30, 0,0, 1)\ndate_list = [ (ff39release + datetime.timedelta(days=x)).strftime(\"%Y%m%d\") for x in range(0, (datetime.datetime.today()-ff39release).days)]\ndef aggregate_arrays(xs, ys):\n if xs is None:\n return ys\n if ys is None:\n return xs\n return xs + ys", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"source": "I do not know
{"nbformat_minor": 0, "cells": [{"source": "### switching defaults in Windows10", "cell_type": "markdown", "metadata": {}}, {"execution_count": 1, "cell_type": "code", "source": "import ujson as json\nimport boto\nimport binascii\nimport ujson as json\n\nfrom moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history\n\n%pylab inline", "outputs": [{"output_type": "stream", "name": "stdout", "text": "Populating the interactive namespace from numpy and matplotlib\n"}], "metadata": {"scrolled": true, "collapsed": false, "trusted": false}}, {"source": "### Basics", "cell_type": "markdown", "metadata": {}}, {"source": "The goal of this example is to plot the startup distribution for each OS. Let's see how many parallel workers we have at our disposal:", "cell_type": "markdown", "metadata": {}}, {"execution_count": 2, "cell_type": "code", "source": "sc.defaultParallelism", "outputs": [{"execution_count": 2, "output_type": "execute_result", "data": {"text/plain": "240"}, "metad
{"nbformat_minor": 0, "cells": [{"source": "### switching defaults in Windows10", "cell_type": "markdown", "metadata": {}}, {"execution_count": 1, "cell_type": "code", "source": "import ujson as json\nimport boto\nimport binascii\nimport ujson as json\n\nfrom moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history\n\n%pylab inline", "outputs": [{"output_type": "stream", "name": "stdout", "text": "Populating the interactive namespace from numpy and matplotlib\n"}], "metadata": {"scrolled": true, "collapsed": false, "trusted": false}}, {"source": "### Basics", "cell_type": "markdown", "metadata": {}}, {"source": " Let's see how many parallel workers we have at our disposal:", "cell_type": "markdown", "metadata": {}}, {"execution_count": 2, "cell_type": "code", "source": "sc.defaultParallelism", "outputs": [{"execution_count": 2, "output_type": "execute_result", "data": {"text/plain": "240"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": false}}, {"source
{"nbformat_minor": 0, "cells": [{"source": "### switching defaults in Windows10", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "import ujson as json\nimport boto\nimport binascii\nimport ujson as json\n\nfrom moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history\noutBucketName = \"s3://sguhaoutputs/win10a/\"\n%pylab inline", "outputs": [], "metadata": {"scrolled": true, "collapsed": false, "trusted": false}}, {"source": "### Basics", "cell_type": "markdown", "metadata": {}}, {"source": " Let's see how many parallel workers we have at our disposal:", "cell_type": "markdown", "metadata": {}}, {"execution_count": 2, "cell_type": "code", "source": "sc.defaultParallelism\nconn = boto.connect_s3()", "outputs": [], "metadata": {"collapsed": false, "trusted": false}}, {"source": "We will need all the pings because we need to stitch", "cell_type": "markdown", "metadata": {}}, {"execution_count": 8, "cell_type": "code", "s
{"nbformat_minor": 0, "cells": [{"source": "### switching defaults in Windows10", "cell_type": "markdown", "metadata": {}}, {"execution_count": null, "cell_type": "code", "source": "import ujson as json\nimport boto\nimport binascii\nimport ujson as json\n\nfrom moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history\noutBucketName = \"s3://sguhaoutputs/win10a/\"\n%pylab inline", "outputs": [], "metadata": {"scrolled": true, "collapsed": false, "trusted": false}}, {"source": "### Basics", "cell_type": "markdown", "metadata": {}}, {"source": " Let's see how many parallel workers we have at our disposal:", "cell_type": "markdown", "metadata": {}}, {"execution_count": 2, "cell_type": "code", "source": "sc.defaultParallelism\nconn = boto.connect_s3()", "outputs": [], "metadata": {"collapsed": false, "trusted": false}}, {"source": "We will need all the pings because we need to stitch", "cell_type": "markdown", "metadata": {}}, {"execution_count": 8, "cell_type": "code", "s