View kickstarter_sql_style_guide.md
layout title description tags
default
SQL Style Guide
A guide to writing clean, clear, and consistent SQL.
data
process

Purpose

View metaprogramming_dplyr.r
dataframe %>% parse(text = paste(sapply(dimensions, function(dimension) {
paste0("mutate(", paste0(dimension, "_average = mean(", dimension, "))"))
}), collapse = " %>% "))
View data_frame_indexing_bug.r
# This is a reduction which seems to indicate an issue when
# adding a column using an arbitrary set of indexes.
# First, let's create a data-frame with some random values:
s <- data.frame(x = runif(10), y = runif(10))
# Now, two randomly generated lists of numbers that we'll use to try to index
# This could be created thusly:
# wrong <- sample(1:nrow(s), nrow(s) * 0.8), etc.
wrong <- c(3, 6, 7, 5, 1, 2, 9, 8)
View vim-mode-error.sh
`apm install vim-mode`
/Applications/Atom.app/Contents/Resources/app/apm/node_modules/atom-package-manager/node_modules/keytar/node_modules/bindings/bindings.js:83
throw e
^
Error: Module version mismatch, refusing to load.
at Object.Module._extensions..node (module.js:485:11)
at Module.load (module.js:356:32)
at Function.Module._load (module.js:312:12)
at Module.require (module.js:362:17)
View seconds_since_midnight.sql
-- Note that this doesn't work:
-- SELECT DATEDIFF(second, DATE(NOW()), NOW());
-- =>
-- ERROR: function pg_catalog.date_diff("unknown", date, timestamp with time zone) does not exist
-- HINT: No function matches the given name and argument types. You may need to add explicit type casts.
-- This does work:
SELECT DATEDIFF(second, DATE(NOW()), SPLIT_PART(NOW(), '.', 1)::timestamp);
View example_redshift_query.sql
SELECT *
FROM
(SELECT
month,
amount,
pledge_count,
SUM(1) OVER(PARTITION BY month ORDER BY pledge_count DESC ROWS UNBOUNDED PRECEDING) as row
FROM
(SELECT
TO_CHAR(CONVERT_TIMEZONE('UTC', 'America/New_York', backings.pledged_at), 'YYYY-MM-01') as month,
View redshift_credentials.r
# Install the Redshift R library:
# https://github.com/pingles/redshift-r
# install.packages("~/Downloads/redshift-r-master", dependencies = T, repos = NULL, type = "source")
library(redshift)
redshift <- redshift.connect("jdbc:postgresql://REDSHIFT_DB:5439/DB_NAME", "LOGIN", "PASSWORD")
# Example Query:
data <- dbGetQuery(redshift, "SELECT COUNT(*) FROM table")
View citi_bike_share.r
library(ggplot2)
library(rjson)
# Strip out enclosing object so its just an array of stations before importing into R.
# e.g. data should be of the form:
# [
# {"id":72,"stationName":"W 52 St & 11 Av","availableDocks":14,"totalDocks":39,"latitude":40.76727216,"longitude":-73.99392888,"statusValue":"In Service","statusKey":1,"availableBikes":21,"stAddress1":"W 52 St & 11 Av","stAddress2":"","city":"","postalCode":"","location":"","altitude":"","testStation":false,"lastCommunicationTime":null,"landMark":""},
# ...
# ]
#
View miserables.json
{
"nodes":[
{"name":"Myriel","group":1},
{"name":"Napoleon","group":1},
{"name":"Mlle.Baptistine","group":1},
{"name":"Mme.Magloire","group":1},
{"name":"CountessdeLo","group":1},
{"name":"Geborand","group":1},
{"name":"Champtercier","group":1},
{"name":"Cravatte","group":1},
View gist:1619331
#Sopa data munging:
sopa.d <- read.csv("~/Downloads/Kickstarter Backer Report - $15 reward - Jan 15 6pm.csv")
sopa.i <- read.csv("~/Downloads/Kickstarter Backer Report - $30 reward - Jan 15 6pm.csv")
#International survey didn't have the pickup question so fake a column:
sopa.i$Choices.2 <- sopa.i$Choices.1
sopa.i$Choices.1 <- "No"
sopa <- rbind(sopa.d, sopa.i)
# Rename Levels: