Skip to content

Instantly share code, notes, and snippets.

@HarlanH
Created October 6, 2015 02:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save HarlanH/448ef40d528cdc7c70d0 to your computer and use it in GitHub Desktop.
Save HarlanH/448ef40d528cdc7c70d0 to your computer and use it in GitHub Desktop.
library(dplyr)
library(RPostgreSQL)
library(httr)
library(Lahman)
library(ggplot2)
# connect to the db
con <- src_postgres(dbname="harlan", host="localhost", user="harlan")
# upload the Batting db
data(Batting)
dbWriteTable(con$con, "batting", Batting)
# grab Lee's data and upload it
global_dat <- content(GET("http://ldecola.net/projects/global/global.csv"),
type="text/csv")
dbWriteTable(con$con, "global", global_dat)
# create dplyr table objects
global <- tbl(con, "global")
batting <- tbl(con, "batting")
# windowing
batting %>%
group_by(yearID, lgID) %>%
summarise(runs=sum(R)) %>%
group_by(lgID) %>%
arrange(yearID) %>%
mutate(cumul_runs=cumsum(runs))
# local/remote
system.time(all_remote <- batting %>%
filter(teamID != 'BOS') %>%
group_by(yearID) %>%
mutate(cumul_runs=cumsum(R)) %>%
select(yearID, R, cumul_runs) %>%
collect())
system.time(half_local <- batting %>%
filter(teamID != 'BOS') %>%
select(yearID, R) %>%
collect() %>%
group_by(yearID) %>%
mutate(cumul_runs=cumsum(R)))
# function
tot_runs <- function(dat, year_range=c(2002,2012), ignore_redsox=FALSE) {
ret <- (if (ignore_redsox) filter(dat, teamID != 'BOS') else dat) %>%
filter(yearID >= year_range[[1]] & yearID <= year_range[[2]]) %>%
summarize(tot_runs=sum(R)) %>% collect()
ret[[1]][[1]]
}
# inspecting
qry <- batting %>%
group_by(yearID) %>%
mutate(cumul_runs=cumsum(R)) %>%
select(yearID, R, cumul_runs)
# collapse
qry <-filter(select(batting, yearID, R), yearID > 2010)
qry$query
collapse(qry)$query
# explain
qry <- batting %>%
filter(teamID != 'BOS') %>%
group_by(yearID) %>%
tally() %>%
rename(year=yearID, runs=n) %>%
inner_join(global, by='year') %>%
select(year, runs, celsius) %>%
arrange(year)
# sql literals
filter(batting, sql("\"playerID\" like 'finger%'")) %>% arrange(desc(yearID))
# views
dbGetQuery(con$con,
"CREATE VIEW v_rollie AS SELECT * FROM batting WHERE \"playerID\" = 'fingero01'")
tbl(con, "v_rollie")
tbl(con, sql("select * from v_rollie"))
# indexes
dbGetQuery(con$con,
"CREATE INDEX i_team ON batting (\"teamID\")")
explain(batting %>% filter(teamID == 'BOS'))
# temp tables
compute(filter(batting, playerID == 'fingero01'), name="tmp_rollie")
tbl(con, "tmp_rollie")
mydat <- data.frame(yearID=1973:1985, x=rnorm(n=length(1973:1985)))
copy_to(con, mydat, name="tmp_mydat")
left_join(tbl(con, "tmp_rollie"), tbl(con, "tmp_mydat"), by="yearID") %>% select(yearID, R, x)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment