Skip to content

Instantly share code, notes, and snippets.

@pingles
Last active December 17, 2015 09:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save pingles/5590842 to your computer and use it in GitHub Desktop.
Save pingles/5590842 to your computer and use it in GitHub Desktop.
require(redshift)
conn <- redshift.connect("jdbc:postgresql://mycluster.redshift.amazonaws.com:5439/data", "user", "pass")
# we can retrieve a list of tables
tables <- redshift.tables(conn)
# and get some info about the columns in one of those tables
cols <- redshift.columns(conn, "weblog")
# lets run a simple query, a number of requests (by their status code) grouped by day
statuses_by_day <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, status, COUNT(1) as request_count",
"FROM weblog",
"GROUP BY DATE(time_stamp), status",
"ORDER BY dated"))
statuses_by_day$dated <- as.Date(statuses_by_day$dated)
statuses_by_day$status <- as.factor(statuses_by_day$status)
# lets do a little scatter plot to see how this looks
require(ggplot2)
p <- ggplot(statuses_by_day, aes(x=dated, y=request_count))
p + geom_point(aes(color=status))
# lets see if 200 success requests correlate with the number of
# whatchamajig transactions we make by day
success.requests.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as requests",
"FROM weblog",
"WHERE status = 200",
"GROUP BY DATE(time_stamp)",
"ORDER BY dated ASC"))
# next, number of transactions by day too
txns.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as transactions",
"FROM whatchamajig_transactions",
"GROUP BY DATE(time_stamp)",
"ORDER BY dated ASC"))
traffic.txns.byday <- merge(success.requests.byday, txns.byday, by="dated")
traffic.txns.byday$dated <- as.Date(traffic.txns.byday$dated)
# quick plot to see whether it looks realistic?
traffic.txns.p <- ggplot(traffic.txns.byday, aes(x=requests, y=transactions))
traffic.txns.p + geom_point()
# lets use cor.test to test and measure significance
cor.test(traffic.txns.byday$requests, traffic.txns.byday$transactions, alternative="greater")
# interesting, but we may have influenced the data by only including successful
# requests, redirects and errors may also be interesting
# ok, so how about correlation to all requests
requests.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as requests",
"FROM weblog",
"GROUP BY DATE(time_stamp)",
"ORDER BY dated ASC"))
all.traffic.txns <- merge(requests.byday, txns.byday, by="dated")
all.traffic.txns$dated <- as.Date(all.traffic.txns$dated)
all.traffic.txns.p <- ggplot(all.traffic.txns, aes(x=requests, y=transactions))
all.traffic.txns.p + geom_point()
cor.test(all.traffic.txns$requests, all.traffic.txns$transactions, alternative="greater")
# what about traffic to _only_ a specific section... is that stronger?
whatchamajig.requests.byday <- redshift.query(conn, "SELECT DATE(time_stamp) as dated, COUNT(1) as requests",
"FROM weblog",
"WHERE request_path LIKE '%whatchamajig%'",
"GROUP BY DATE(time_stamp)",
"ORDER BY dated ASC"))
whatchamajig.requests.txns <- merge(whatchamajig.requests.byday, txns.byday, by="dated")
whatchamajig.requests.txns$dated <- as.Date(whatchamajig.requests.txns$dated)
# lets look at the plot
whatchamajig.requests.txns.p <- ggplot(whatchamajig.requests.txns, aes(x=requests, y=transactions))
whatchamajig.requests.txns.p + geom_point()
# and one last final correlation test :)
cor.test(whatchamajig.requests.txns$requests, whatchamajig.requests.txns$transactions, alternative="greater")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment