Randy Zwitch randyzwitch

## python-string-interpolation.py
In [1]: print "Here's a string subtitution for my name: %s" %("Randy")

Out[1]: "Here's a string subtitution for my name: Randy"

## r-json-sprintf.R
#Builds a JSON-formatted string by substituting five values into a sprintf template
#Note: the %s inside "keywords":[...] is not wrapped in quotes, so searchKW2 is
#presumably already formatted as the contents of a JSON array -- TODO confirm against caller
elements_list = sprintf('{"id":"%s",
                          "top": "%s",
                          "startingWith":"%s",
                          "search":{"type":"%s", "keywords":[%s]}
                          }', element, top, startingWith, searchType, searchKW2)

## r-toJSON-inefficient.R
#Convert report_suites to JSON; a single value is wrapped in a list before conversion
report_suites <- if (length(report_suites) > 1) {
  toJSON(report_suites)
} else {
  toJSON(list(report_suites))
}

#API request: the JSON text is pasted into a hand-written rsid_list wrapper
json <- postRequest(
  "ReportSuite.GetTrafficVars",
  paste('{"rsid_list":', report_suites , '}')
)

## r-toJSON-efficient.R
#Efficient method
#Describe the request as a named list and let toJSON produce the whole body,
#instead of assembling JSON text by hand
library(rjson)
report_suites <- list(rsid_list=c("A", "B", "C"))
request.body <- toJSON(report_suites)

#API request
json <- postRequest("ReportSuite.GetTrafficVars", request.body)

## generic-adobe-analytics-api-call.json
{
	"reportDescription":{
		"reportSuiteID":"(string)",
		"date":"(string)",
		"dateFrom":"(string)",
		"dateTo":"(string)",
		"dateGranularity":"(string)",
		"metrics":[
			{
				"id":"(string)"

## udf.py
#Want to try this as Hive UDF
#1. Compile using Jython (never done before)
#2. Re-write using pure Java (don't know Java)
#3. Re-write using Scala (barely done first week of Coursera FP class so far)

#Hoping to return this as a single column first
#Eventually want to build a Hive UDTF, to make one row containing dates into multiple rows, one per date

from datetime import *

## hive-predicate-pushdown.sql
--#### Assume sales Hive table partitioned by day_id ####--

--Full Table Scan
--The day_id filter sits in the WHERE clause of a left join, so it is evaluated
--after the join against the null-supplying table (sales); per Hive's outer-join
--rules such a predicate is not pushed down to prune partitions.
--It also removes employees without any sales row in the range, since a NULL
--day_id fails the BETWEEN test. Kept in this form to illustrate the slow case.
select
employees.id,
sales.sales
from employees
left join sales on (employees.id = sales.employee_id)
where sales.day_id between '2014-03-01' and '2014-05-31';

## percentile-hive-wrong-way.sql
--Hive expects that you want to calculate your percentiles by account_number and sales
--This code will generate an error about a missing GROUP BY statement
--(percentile_approx is an aggregate function, but account_number and sales are selected without a GROUP BY)
select
account_number,
sales,
CASE WHEN sales > percentile_approx(sales, .9) THEN 1 ELSE 0 END as top10pct_sales
from sales;

## hive-histogram.sql
--Approximate histogram of salary, requesting 20 bins
select histogram_numeric(salary, 20) as salary_hist
from sample_08;

--Results
[{"x":23507.68627450983,"y":255.0},{"x":31881.7647058824,"y":340.0},{"x":39824.11498257844,"y":287.0},{"x":47615.58011049725,"y":181.0},{"x":55667.01219512195,"y":164.0},{"x":59952.499999999985,"y":8.0},{"x":66034.67153284674,"y":137.0},{"x":75642.31707317074,"y":82.0},{"x":82496.13636363638,"y":44.0},{"x":91431.66666666667,"y":60.0},{"x":100665.71428571428,"y":21.0},{"x":107326.66666666667,"y":15.0},{"x":121248.74999999999,"y":16.0},{"x":142070.0,"y":2.0},{"x":153896.6666666667,"y":6.0},{"x":162310.0,"y":6.0},{"x":169810.0,"y":2.0},{"x":176740.0,"y":2.0},{"x":193925.0,"y":8.0},{"x":206770.0,"y":2.0}]

## hive-odbc.jl
using ODBC

#Connect to Hadoop cluster via Hive (pre-defined Windows DSN in ODBC Manager)
hiveconn = ODBC.connect("Production hiveserver2"; usr="your-user-name", pwd="your-password-here")

#Clean data, return results directly to file
#Data returned will have origin of flight, flight takeoff, flight landing and elapsed time
hive_query_string =
"select
origin,
	In [1]: print "Here's a string subtitution for my name: %s" %("Randy")

	Out[1]: "Here's a string subtitution for my name: Randy"
	elements_list = sprintf('{"id":"%s",
	"top": "%s",
	"startingWith":"%s",
	"search":{"type":"%s", "keywords":[%s]}
	}', element, top, startingWith, searchType, searchKW2)
	#Converts report_suites to JSON
	if(length(report_suites)>1){
	report_suites <- toJSON(report_suites)
	} else {
	report_suites <- toJSON(list(report_suites))
	}

	#API request
	json <- postRequest("ReportSuite.GetTrafficVars",paste('{"rsid_list":', report_suites , '}'))
	#Efficient method
	library(rjson)
	report_suites <- list(rsid_list=c("A", "B", "C"))
	request.body <- toJSON(report_suites)

	#API request
	json <- postRequest("ReportSuite.GetTrafficVars", request.body)
	{
	"reportDescription":{
	"reportSuiteID":"(string)",
	"date":"(string)",
	"dateFrom":"(string)",
	"dateTo":"(string)",
	"dateGranularity":"(string)",
	"metrics":[
	{
	"id":"(string)"
	#Want to try this as Hive UDF
	#1. Compile using Jython (never done before)
	#2. Re-write using pure Java (don't know Java)
	#3. Re-write using Scala (barely done first week of Coursera FP class so far)

	#Hoping to return this as a single column first
	#Eventually want to build a Hive UDTF, to make one row containing dates into multiple rows, one per date

	from datetime import *
	--#### Assume sales Hive table partitioned by day_id ####--

	--Full Table Scan
	--The day_id filter sits in the WHERE clause of a left join, so it is evaluated
	--after the join against the null-supplying table (sales); per Hive's outer-join
	--rules such a predicate is not pushed down to prune partitions.
	--It also removes employees without any sales row in the range, since a NULL
	--day_id fails the BETWEEN test. Kept in this form to illustrate the slow case.
	select
	employees.id,
	sales.sales
	from employees
	left join sales on (employees.id = sales.employee_id)
	where sales.day_id between '2014-03-01' and '2014-05-31';
	--Hive expects that you want to calculate your percentiles by account_number and sales
	--This code will generate an error about a missing GROUP BY statement
	select
	account_number,
	sales,
	CASE WHEN sales > percentile_approx(sales, .9) THEN 1 ELSE 0 END as top10pct_sales
	from sales;
	select
	histogram_numeric(salary, 20) as salary_hist
	from
	sample_08;

	--Results
	[{"x":23507.68627450983,"y":255.0},{"x":31881.7647058824,"y":340.0},{"x":39824.11498257844,"y":287.0},{"x":47615.58011049725,"y":181.0},{"x":55667.01219512195,"y":164.0},{"x":59952.499999999985,"y":8.0},{"x":66034.67153284674,"y":137.0},{"x":75642.31707317074,"y":82.0},{"x":82496.13636363638,"y":44.0},{"x":91431.66666666667,"y":60.0},{"x":100665.71428571428,"y":21.0},{"x":107326.66666666667,"y":15.0},{"x":121248.74999999999,"y":16.0},{"x":142070.0,"y":2.0},{"x":153896.6666666667,"y":6.0},{"x":162310.0,"y":6.0},{"x":169810.0,"y":2.0},{"x":176740.0,"y":2.0},{"x":193925.0,"y":8.0},{"x":206770.0,"y":2.0}]
	using ODBC

	#Connect to Hadoop cluster via Hive (pre-defined Windows DSN in ODBC Manager)
	hiveconn = ODBC.connect("Production hiveserver2"; usr="your-user-name", pwd="your-password-here")

	#Clean data, return results directly to file
	#Data returned will have origin of flight, flight takeoff, flight landing and elapsed time
	hive_query_string =
	"select
	origin,