Skip to content

Instantly share code, notes, and snippets.

View githoov's full-sized avatar

Scott Hoover githoov

View GitHub Profile
@githoov
githoov / scalikejdbc_adhoc.scala
Last active June 2, 2016 07:42
Ad Hoc Querying with ScalikeJDBC
/*
in app/models/Query.scala
*/
package models
import scalikejdbc._
import play.api.libs.json._
import play.api.libs.json.Json
import play.api.libs.json.util._
@githoov
githoov / chat_sentiment_analysis.py
Created May 25, 2016 22:31
Analysis of Chat Text
# preliminaries
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem.porter import PorterStemmer
from nltk import tokenize, pos_tag
import csv
import re
# read in data
comments = csv.reader(open("/Users/scott/Downloads/issue_comments.csv", "rb"))
@githoov
githoov / benchmarks.sql
Last active May 21, 2016 16:54
Redshift Benchmark Queries
/*
${TABLE} is a placeholder for the table variants
you will be benchmarking with this script. Leave
${TABLE} in, as it will be used to dynamically substitute
your table names at the command line.
*/
\timing on
\o /dev/null
@githoov
githoov / create_tables.sql
Created May 21, 2016 16:27
Redshift Table Architectures
/*
this first set of tables explicitly defines
foreign-to-primary-key relationships
in addition to a shared distribution key
*/
drop table if exists public.event_rel;
create table public.event_rel (
created_at timestamp encode raw
, event_type varchar(200) encode text32k
, license_slug varchar(18) encode raw
@githoov
githoov / logParser.scala
Created May 4, 2016 22:07
Spark Streaming job for parsing logs
/*
This is a Spark Streaming job
that takes a raw stream of logs
from Flume, parses the log lines
capturing them in an RDD, then
adds a schema and ultimately writes
to HDFS.
Written by Scott Hoover, 2016.
Send questions to scott@looker.com
@githoov
githoov / pinger_benchmarks_compare.R
Last active April 23, 2016 00:55
Pinger Benchmarks: R Comparison
# preliminaries
library(ggplot2)
# read in file
df <- read.csv(file = "~/bench_summary.txt", header = FALSE)
# generate query numbers and merge with data frame
df <- cbind(df, rep(rep(c(1:5),each = 5), 5))
# rename columns
@githoov
githoov / pinger_benchmarks_queries.sql
Last active April 23, 2016 00:29
Pinger Benchmarks: Queries
\timing on
\o /dev/null
select count(*) from pinger.event_tmp1;
select count(*) from pinger.event_tmp1;
select count(*) from pinger.event_tmp1;
select count(*) from pinger.event_tmp1;
select count(*) from pinger.event_tmp1;
select looker_instance_slug, count(*) from pinger.event_tmp1 group by 1;
@githoov
githoov / pinger_benchmarks_drop.sql
Created April 23, 2016 00:23
Pinger Benchmarks: Drop Tables
drop table if exists pinger.event_tmp1;
drop table if exists pinger.event_tmp2;
drop table if exists pinger.event_tmp3;
drop table if exists pinger.event_tmp4;
drop table if exists pinger.event_tmp5;
@githoov
githoov / pinger_benchmarks_insert.sql
Created April 23, 2016 00:20
Pinger Benchmarks: Insert Statements
insert into pinger.event_tmp1 (
select * from pinger.event
);
insert into pinger.event_tmp2 (
select * from pinger.event
);
insert into pinger.event_tmp3 (
select * from pinger.event
@githoov
githoov / pinger_benchmarks_create.sql
Created April 23, 2016 00:19
Pinger Benchmarks: Create Table Statements
/* simple sort */
create table pinger.event_tmp1 (
event_id bigint encode mostly16
, agent character varying(65535) encode lzo
, referrer character varying(65535) encode lzo
, user_id integer encode mostly8
, version character varying(100) encode bytedict
, created_at timestamp without time zone encode delta
, ip_address character varying(100) encode lzo
, event_type character varying(200) encode bytedict