Scott Hoover (githoov)
function_parse.sql
/*
create a JavaScript UDF to find regexp matches and throw them into an array
*/
create or replace function regexp_matches(TEXT string, PATTERN string)
returns variant
language javascript
as '
var re = new RegExp(PATTERN, "g");
var res = [];
var m;
// collect every match of PATTERN in TEXT into an array
while ((m = re.exec(TEXT)) !== null) {
  res.push(m[0]);
}
return res;
';
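A quick hedged sanity check of the UDF (the sample string is arbitrary; the expected output is shown as a comment):

select regexp_matches('a1b2c3', '[0-9]') as matches;
-- expected: ["1", "2", "3"]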
@githoov
githoov / looker.md
Last active Nov 13, 2016
A Quick Walkthrough of Looker and Accessing its APIs

Looker Overview

As a reminder, Looker has three core components: first, it provides a physical modeling layer (LookML), which one uses to abstract SQL generation and define complex transforms; second, Looker consumes the LookML model and generates dialect-specific SQL, which it then issues to the database via JDBC; third, it provides a web-based UI that acts as both a LookML IDE and the primary means of exploring data.

The LookML Model

Let's consider a simple schema containing two tables, job and warehouse, which we'll use as an example. In Looker, our representation of these tables would look like this:

job definition

- view: job
  sql_table_name: public.job

warehouse definition

- view: warehouse
  sql_table_name: public.warehouse
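To see the second component in action, exploring the job view might compile to SQL along these lines (a hypothetical sketch; the status column and aliasing conventions are assumptions, not Looker's verbatim output):

select
  job.status as "job.status",
  count(*) as "job.count"
from public.job as job
group by 1
order by 2 desc;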
@githoov
githoov / nlp.md
Last active Apr 20, 2017
NLP Blog

Introduction

I recently had the opportunity to showcase Snowflake at JOIN, Looker's first user conference. I used my time to highlight a few Snowflake features that I find particularly useful, as someone who does analytics. The presentation demonstrated simultaneous workloads that share the same data as well as analytically intensive SQL patterns against large-scale, semi-structured data.

I thought I'd refactor my presentation as a series of blog entries to share some useful insights and interesting patterns with a broader audience. I'm going to step through how to incorporate sentiment analysis as well as tweet similarity into an interactive model using both Looker and Snowflake. (Note: if you haven't read our previous blog on sentiment analysis using Naïve Bayes, I highly recommend you do so.)

Part 1 - Simultaneous Workloads, Shared Data

N/A

Part 2 - Tweet Similarity

Overview

@githoov
githoov / tweet_simiarity.sql
Last active Oct 17, 2016
Tweet Similarity
/*
this is a basic first pass at tweet similarity using a
modified cosine-similarity approach. the modification
weights words by their rarity across the corpus: common
words that aren't quite stop words get a lower weight
than rarer words. this weighting is known as inverse
document frequency.
*/
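As a sketch of that approach (not the gist's actual query), assuming a hypothetical tweet_words(tweet_id, word) table with one row per token:

with doc_freq as (
    -- number of distinct tweets containing each word
    select word, count(distinct tweet_id) as df
    from tweet_words
    group by word
), corpus as (
    select count(distinct tweet_id) as n_docs
    from tweet_words
), weights as (
    -- inverse document frequency: rarer words score higher
    select d.word, ln(c.n_docs / d.df) as idf
    from doc_freq d
    cross join corpus c
), vectors as (
    -- per-tweet term weight: term frequency times idf
    select t.tweet_id, t.word, count(*) * w.idf as weight
    from tweet_words t
    join weights w on w.word = t.word
    group by t.tweet_id, t.word, w.idf
), norms as (
    select tweet_id, sqrt(sum(weight * weight)) as norm
    from vectors
    group by tweet_id
)
select a.tweet_id as tweet_a,
       b.tweet_id as tweet_b,
       sum(a.weight * b.weight) / (na.norm * nb.norm) as similarity
from vectors a
join vectors b on b.word = a.word and b.tweet_id > a.tweet_id
join norms na on na.tweet_id = a.tweet_id
join norms nb on nb.tweet_id = b.tweet_id
group by a.tweet_id, b.tweet_id, na.norm, nb.norm
order by similarity desc;

Computing the norms in their own CTE matters here: taking them over only the words two tweets share would inflate the similarity of pairs with little overlap.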
kinesis.md
  1. make sure an inbound rule is set up for ssh on port 22 from anywhere
  2. sudo yum update
  3. sudo yum install -y aws-kinesis-agent
  4. vi /etc/aws-kinesis/agent.json and enter the following (the delivery stream name and file pattern are placeholders):
    {
      "cloudwatch.emitMetrics": true,
      "kinesis.endpoint": "",
      "firehose.endpoint": "https://firehose.us-east-1.amazonaws.com",
      "flows": [
        { "filePattern": "/tmp/app.log*", "deliveryStream": "your-delivery-stream" }
      ]
    }
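With the config saved, and assuming the agent's standard service name, the remaining step is to start it:

  5. sudo service aws-kinesis-agent start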
@githoov
githoov / redshift_metadata.md
Last active Sep 14, 2018
Redshift Metadata

Redshift Tables Used

pg_table_def, stl_query, stl_querytext, stl_tr_conflict, stl_explain, stl_alert_event_log, stl_ddltext, stl_scan, stl_save, stl_hashjoin, stl_hash, stl_plan_info, stl_return, and information_schema.table_constraints.

Queries to Extract Features

  • execution time
select datediff(ms, starttime, endtime) as execution_time_in_ms
from stl_query
where query = QUERY_ID;
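In the same spirit, a hedged sketch for reassembling a statement's full SQL from stl_querytext, whose text column stores the query in 200-character segments ordered by sequence:

select query,
       listagg(text) within group (order by sequence) as full_sql
from stl_querytext
where query = QUERY_ID
group by query;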
generate_create.rb
#!/usr/bin/env ruby
require 'json'
require 'optparse'
@file_path = ''
@separator = ''
@table_name = ''
# parse command-line flags into the variables above (flag spellings are assumed)
OptionParser.new do |opts|
  opts.on('-f', '--file FILE', 'path to the input file') { |f| @file_path = f }
  opts.on('-s', '--separator SEP', 'field separator') { |s| @separator = s }
  opts.on('-t', '--table NAME', 'target table name') { |t| @table_name = t }
end.parse!
@githoov
githoov / scalikejdbc_adhoc.scala
Last active Jun 2, 2016
Ad Hoc Querying with ScalikeJDBC
/*
in app/models/Query.scala
*/
package models
import scalikejdbc._
import play.api.libs.json._

// ad hoc query helper (a sketch: the method name and JSON shape are assumptions)
object Query {
  // execute an arbitrary SQL string and return each row as a JSON object
  def run(statement: String): List[JsObject] = DB readOnly { implicit session =>
    SQL(statement).map(_.toMap).list.apply().map { row =>
      JsObject(row.map { case (k, v) => k -> JsString(String.valueOf(v)) }.toSeq)
    }
  }
}
chat_sentiment_analysis.py
# preliminaries
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem.porter import PorterStemmer
from nltk import tokenize, pos_tag
import csv
import re
# read in data (Python 2: csv wants binary mode)
comments = csv.reader(open("/Users/scott/Downloads/issue_comments.csv", "rb"))
# score each comment with VADER (assumes the text lives in the first column)
analyzer = SentimentIntensityAnalyzer()
for row in comments:
    print(analyzer.polarity_scores(row[0]))
@githoov
githoov / benchmarks.sql
Last active May 21, 2016
Redshift Benchmark Queries
/*
${TABLE} is a placeholder for the table variants
you will be benchmarking with this script. Leave
${TABLE} in, as it will be used to dynamically substitute
your table names at the command line.
*/
\timing on
\o /dev/null
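A hedged example of the kind of statement that would follow, with ${TABLE} substituted before execution (the aggregate is illustrative, not from the original script):

-- simple full-scan benchmark against the substituted table
select count(*)
from ${TABLE};

At the shell, something like sed 's/${TABLE}/my_table/g' benchmarks.sql | psql performs the substitution for each table variant.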