Last active
August 17, 2016 16:27
-
-
Save jdeboer/5831050 to your computer and use it in GitHub Desktop.
The R script used during my presentation at GAUC 2013 to join weather data from BoM with web data from Google Analytics.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Please replace this sample profile ID with that of your own
# Google Analytics profile.
profile_id <- 123456789

# Please ensure you have all of the following libraries
# installed first.
library(ganalytics)
library(httr)
library(XML)
library(lubridate)
library(plyr)
library(ggplot2)
# Download one month of daily weather observations from the Bureau of
# Meteorology (BoM) "dwo" pages and return them as a data.frame.
#
# Args:
#   year:  Year of the month to fetch; zero-padded to four digits in the URL.
#   month: Month number of the month to fetch; zero-padded to two digits.
#   loc:   Numeric suffix appended to "IDCJDW" to form the BoM page ID.
#
# Returns:
#   A data.frame built from the first HTML table on the page, with its columns
#   renamed to the names of `colClasses` below.
#
# Raises:
#   An error if the HTTP request does not succeed, or if the table does not
#   have exactly as many columns as `colClasses` has entries.
GetWeather <- function(year, month, loc) {
  # "YYYYMM" appears twice in the page URL.
  ym <- paste0(
    formatC(year, width = 4, format = "d", flag = "0"),
    formatC(month, width = 2, format = "d", flag = "0")
  )
  url <- paste0(
    "http://www.bom.gov.au/climate/dwo/",
    ym, "/html/IDCJDW", loc, ".", ym, ".shtml"
  )
  # Column names (used to rename the table) and the class each column is
  # converted to while reading the table.
  colClasses <- list(
    date = "factor",
    day = "factor",
    temperature.min = "numeric",
    temperature.max = "numeric",
    rain = "numeric",
    evaporation = "numeric",
    sun = "numeric",
    gust.direction = "factor",
    gust.speed = "numeric",
    gust.time = "character",
    temperature.am = "numeric",
    humidity.am = "numeric",
    cloud.am = "numeric",
    wind.direction.am = "factor",
    wind.speed.am = "numeric",
    pressure.am = "numeric",
    temperature.pm = "numeric",
    humidity.pm = "numeric",
    cloud.pm = "numeric",
    wind.direction.pm = "factor",
    wind.speed.pm = "numeric",
    pressure.pm = "numeric"
  )
  res <- GET(url)
  # Fail with a clear HTTP error instead of trying to parse an error page.
  stop_for_status(res)
  # Hand the parser the page text itself rather than the response object.
  doc <- htmlParse(content(res, as = "text"), asText = TRUE)
  df <- readHTMLTable(
    doc = doc,
    colClasses = colClasses,
    which = 1,
    stringsAsFactors = FALSE
  )
  names(df) <- names(colClasses)
  df
}
# Initialise a Google Analytics query for the profile we wish to get website
# data from.
gaQuery <- GaQuery(profile_id)

# The location IDs for each city we wish to get weather and website data for.
cities <- list(
  Sydney = 2124,
  Melbourne = 3050,
  Brisbane = 4019
)

# Set the end date to last day of the previous month as of today.
endDate <- floor_date(Sys.Date(), unit = "month") - 1

# Set the start date to beginning of the same month in the previous year.
# Floor to the first of the month BEFORE stepping back a year: if endDate is
# 29 February, changing its year directly would name a date that does not
# exist, whereas the first of a month exists in every year.
startDate <- floor_date(endDate, unit = "month")
year(startDate) <- year(startDate) - 1

# Set the Google Analytics query date range using the calculated start and end
# dates.
GaDateRange(gaQuery) <- GaDateRange(startDate, endDate)

# Create a table of year and months for querying the BoM website: one row per
# month from startDate to endDate.
yearMonth <- seq(startDate, endDate, by = "month")
yearMonth <- data.frame(y = year(yearMonth), m = month(yearMonth))
###########################################
###### Get the Google Analytics data ######
###########################################

# Restrict the query to Australian traffic whose city is exactly one of the
# names in `cities` (anchored regular expression, alternatives joined by "|").
GaFilter(gaQuery) <- GaAnd(
  GaExpr("country", "=", "Australia"),
  GaExpr(
    "city",
    "~",
    sprintf("^(%s)$", paste(names(cities), collapse = "|"))
  )
)

# Metric of interest: the average time on site ...
GaMetrics(gaQuery) <- "avgTimeOnSite"

# ... broken down by date and city.
GaDimensions(gaQuery) <- c("date", "city")

# Run the query and keep the result.
gaData <- GetGaData(gaQuery)
###################################
#### Download the weather data ####
###################################

# For each year and month, and for each city, get the weather data.
# ddply splits `yearMonth` into one piece per (y, m) pair; within each piece
# ldply calls GetWeather once per city and stacks the results, recording the
# city name (the list element name) in the ".id" column.
weather <- ddply(
  yearMonth,
  .variables = c("y", "m"),
  .fun = function(piece) {
    y <- piece$y
    m <- piece$m
    ldply(cities, .fun = function(loc) {
      # NOTE(review): presumably silences coercion warnings raised while the
      # table columns are converted to the requested classes -- confirm.
      suppressWarnings(
        GetWeather(y, m, loc)
      )
    })
  },
  .progress = "time"
)

# Clean up the weather data.frame
weather <- rename(weather, replace = c(".id" = "city"))
weather$city <- factor(weather$city)
# Combine year, month and the table's own `date` column into a real date.
weather <- mutate(weather, date = ymd(paste(y, m, date, sep = "-")))
# Keep only the columns needed for the join and the plots; this also drops the
# helper columns y, m and day.
weather <- weather[, c("city", "date", "temperature.max")]
# Attach the weather observations to the Google Analytics rows, matching on
# city and date (left join: every Google Analytics row is kept).
blog_weather <- join(
  gaData,
  weather,
  by = c("city", "date"),
  type = "left"
)

# Keep only rows with a non-zero avgTimeOnSite and a known temperature.max.
blog_weather <- subset(
  blog_weather,
  subset = avgTimeOnSite != 0 & !is.na(temperature.max)
)
# Plot the data

# 1. Average time on site against maximum temperature.
qplot(temperature.max, avgTimeOnSite, data = blog_weather) +
  geom_point(size = 5, alpha = 0.5, position = "jitter")

# 2. Same, with a log-scaled y axis.
qplot(temperature.max, avgTimeOnSite, data = blog_weather, log = "y") +
  geom_point(size = 5, alpha = 0.5, position = "jitter")

# 3. Same, coloured by city.
qplot(
  temperature.max, avgTimeOnSite,
  data = blog_weather, log = "y", colour = city
) +
  geom_point(size = 5, alpha = 0.5, position = "jitter")

# 4. One facet per city with a loess smoother. geom = "blank" stops qplot from
#    drawing its own point layer, so only the jittered points are shown.
qplot(
  temperature.max, avgTimeOnSite,
  data = blog_weather, log = "y", colour = city,
  facets = ~city, geom = "blank"
) +
  geom_point(size = 3, alpha = 0.5, position = "jitter") +
  theme(legend.position = "none") +
  geom_smooth(stat = "smooth", method = "loess")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment