Skip to content

Instantly share code, notes, and snippets.

@jdeboer
Last active August 17, 2016 16:27
Show Gist options
  • Save jdeboer/5831050 to your computer and use it in GitHub Desktop.
Save jdeboer/5831050 to your computer and use it in GitHub Desktop.
The R script used during my presentation at GAUC 2013 to join weather data from BoM with web data from Google Analytics.
# Please replace this sample profile ID with that of your own
# Google Analytics profile.
profile_id <- 123456789
# Please ensure you have all of the following libraries
# installed first.
library(ganalytics) # GaQuery() and the other Ga*() query helpers
library(httr)       # GET() for downloading the weather pages
library(XML)        # htmlParse() and readHTMLTable()
library(lubridate)  # floor_date(), year(), month(), ymd()
library(plyr)       # ldply(), ddply(), rename(), mutate(), join()
library(ggplot2)    # qplot() and the geom_*() layers
# Download one month of daily weather observations for one location from the
# Bureau of Meteorology (BoM) website and return it as a data.frame.
#
# Args:
#   year:  The year of the observations; zero-padded to four digits in the URL.
#   month: The month of the observations; zero-padded to two digits in the URL.
#   loc:   The location code that follows "IDCJDW" in the page URL.
#
# Returns:
#   A data.frame read from the first HTML table on the page, with its columns
#   converted according to `colClasses` and renamed to names(colClasses).
#
# Raises:
#   An error if the HTTP request does not succeed, or if the table found on
#   the page does not have as many columns as `colClasses` has entries.
GetWeather <- function(year, month, loc) {
  # The URL contains the year and month as "YYYYMM", twice.
  ym <- paste0(
    formatC(year, width = 4, format = "d", flag = "0"),
    formatC(month, width = 2, format = "d", flag = "0")
  )
  url <- paste0(
    "http://www.bom.gov.au/climate/dwo/",
    ym, "/html/IDCJDW", loc, ".", ym, ".shtml"
  )
  # Column names (in table order) and the class each column is converted to.
  colClasses <- list(
    date = "factor",
    day = "factor",
    temperature.min = "numeric",
    temperature.max = "numeric",
    rain = "numeric",
    evaporation = "numeric",
    sun = "numeric",
    gust.direction = "factor",
    gust.speed = "numeric",
    gust.time = "character",
    temperature.am = "numeric",
    humidity.am = "numeric",
    cloud.am = "numeric",
    wind.direction.am = "factor",
    wind.speed.am = "numeric",
    pressure.am = "numeric",
    temperature.pm = "numeric",
    humidity.pm = "numeric",
    cloud.pm = "numeric",
    wind.direction.pm = "factor",
    wind.speed.pm = "numeric",
    pressure.pm = "numeric"
  )
  res <- GET(url)
  # Fail with a clear HTTP error instead of trying to parse an error page.
  stop_for_status(res)
  # Parse the response body as HTML text rather than passing the response
  # object itself to the parser.
  doc <- htmlParse(content(res, as = "text"), asText = TRUE)
  df <- readHTMLTable(
    doc = doc,
    colClasses = colClasses,
    which = 1,
    stringsAsFactors = FALSE
  )
  names(df) <- names(colClasses)
  df
}
# Initialise a Google Analytics query for the profile we wish to get website
# data from.
gaQuery <- GaQuery(profile_id)
# The location IDs for each city we wish to get weather and website data for.
# The names are also used to filter the Google Analytics data by city.
cities <- list(
  Sydney = 2124,
  Melbourne = 3050,
  Brisbane = 4019
)
# Set the end date to last day of the previous month as of today.
endDate <- floor_date(Sys.Date(), unit = "month") - 1
# Set the start date to beginning of the same month in the previous year.
# Floor to the first of the month BEFORE stepping back a year: the first of a
# month exists in every year, whereas stepping back from 29 February would
# produce an invalid (NA) date.
startDate <- floor_date(endDate, unit = "month") - years(1)
# Set the Google Analytics query date range using the calculated start and
# end dates.
GaDateRange(gaQuery) <- GaDateRange(startDate, endDate)
# Create a table of year and months for querying the BoM website:
# one row per month from startDate to endDate, with columns y and m.
yearMonth <- seq(startDate, endDate, by = "month")
yearMonth <- data.frame(y = year(yearMonth), m = month(yearMonth))
###########################################
###### Get the Google Analytics data ######
###########################################
# Restrict the query to Australian traffic from the cities named in `cities`.
GaFilter(gaQuery) <- local({
  # Anchored alternation of the city names, e.g. "^(A|B|C)$".
  cityPattern <- paste0("^(", paste(names(cities), collapse = "|"), ")$")
  GaAnd(
    GaExpr("country", "=", "Australia"),
    GaExpr("city", "~", cityPattern)
  )
})
# Metric of interest: the average time on site ...
GaMetrics(gaQuery) <- "avgTimeOnSite"
# ... broken down by date and by city.
GaDimensions(gaQuery) <- c("date", "city")
# Run the query and fetch the results.
gaData <- GetGaData(gaQuery)
###################################
#### Download the weather data ####
###################################
# Fetch the weather table for every (year, month) row of yearMonth and, within
# each of those, for every city. ldply() records the city name in ".id".
weather <- ddply(
  yearMonth,
  .variables = c("y", "m"),
  .fun = function(period) {
    ldply(cities, .fun = function(station) {
      suppressWarnings(
        GetWeather(period$y, period$m, station)
      )
    })
  },
  .progress = "time"
)
# Tidy the result: name the city column and store it as a factor.
names(weather)[names(weather) == ".id"] <- "city"
weather$city <- factor(weather$city)
# Build a full date from the year, the month and the table's own date column.
weather$date <- ymd(paste(weather$y, weather$m, weather$date, sep = "-"))
# Keep only the columns needed for the join and the plots.
weather <- weather[, c("city", "date", "temperature.max")]
# Attach the weather observations to each Google Analytics row, matching on
# city and date and keeping every Google Analytics row.
blog_weather <- join(gaData, weather, by = c("city", "date"), type = "left")
# Keep only rows with a non-zero avgTimeOnSite and a known temperature.max.
blog_weather <- subset(
  blog_weather,
  subset = avgTimeOnSite != 0 & !is.na(temperature.max)
)
# Plot the data: average time on site against the daily maximum temperature.
# NOTE(review): in the first three plots qplot() is not given a geom, so it
# presumably adds its own point layer underneath the jittered geom_point()
# layer; the last plot avoids that with geom = "blank" - confirm whether the
# double layer is intended.

# 1. Linear scales.
qplot(temperature.max, avgTimeOnSite, data = blog_weather) +
  geom_point(size = 5, alpha = 0.5, position = "jitter")

# 2. Logarithmic y axis.
qplot(temperature.max, avgTimeOnSite, data = blog_weather, log = "y") +
  geom_point(size = 5, alpha = 0.5, position = "jitter")

# 3. Logarithmic y axis, coloured by city.
qplot(
  temperature.max, avgTimeOnSite,
  data = blog_weather, log = "y", colour = city
) +
  geom_point(size = 5, alpha = 0.5, position = "jitter")

# 4. One facet per city with a loess smoother; the legend is hidden because
#    the facet labels already identify the city.
qplot(
  temperature.max, avgTimeOnSite,
  data = blog_weather, log = "y", colour = city,
  facets = ~city, geom = "blank"
) +
  geom_point(size = 3, alpha = 0.5, position = "jitter") +
  theme(legend.position = "none") +
  geom_smooth(stat = "smooth", method = "loess")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment