setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
library(tidyverse)
# data can be downloaded from:
# https://www.dropbox.com/s/odg6udrhahsqhiz/SydneyFuelPriceHistory.zip?dl=1
first_changes <- read_csv("data/SydneyFuelPriceHistory.csv") %>%
  # shift each price update to the start of the following hour,
  # rolling hour 24 over to hour 0 of the next day
  mutate(hour_of_day = lubridate::hour(PriceUpdatedDate) + 1,
         date = as.Date(PriceUpdatedDate),
         date = if_else(hour_of_day == 24, date + 1, date),
         hour_of_day = if_else(hour_of_day == 24, 0, hour_of_day)) %>%
  # if there are multiple updates within an hour, keep only the latest one for that hour
  arrange(desc(PriceUpdatedDate)) %>%
  distinct(ServiceStationName, Address, Suburb, Postcode, FuelCode, date, hour_of_day, .keep_all = TRUE) %>%
  select(-PriceUpdatedDate, -RecordID) %>%
  # we only really care about the most recent year...
  # the past is the past, right?
  filter(lubridate::year(date) == 2018)
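# (optional sketch) a quick sanity check on the step above: after the distinct() call
# there should be at most one record per station, fuel type, date and hour
# -- this should return zero rows
first_changes %>%
  count(ServiceStationName, Address, Suburb, Postcode, FuelCode, date, hour_of_day) %>%
  filter(n > 1)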
# we now need to get some unique identifiers for the stations
# turns out the same addresses are sometimes entered differently in our data
unique_stations <- first_changes %>%
  arrange(ServiceStationName) %>%
  distinct(ServiceStationName, Address, Suburb, Postcode, Brand) %>%
  mutate(new_ServiceStationName = trimws(tolower(ServiceStationName)),
         new_ServiceStationName = str_replace(new_ServiceStationName, "  ", " "),  # collapse double spaces
         new_ServiceStationName = str_replace(new_ServiceStationName, " - ", "-"),
         new_ServiceStationName = str_replace(new_ServiceStationName, "- ", "-")) %>%
  mutate(new_suburb = trimws(tolower(Suburb))) %>%
  mutate(new_address = trimws(tolower(Address)),
         new_address = str_replace(new_address, "rd,", "road,"),
         new_address = str_replace(new_address, "st,", "street,"),
         new_address = str_replace(new_address, "hwy,", "highway,"),
         new_address = str_replace(new_address, "  ", " "),  # collapse double spaces
         new_address = str_replace(new_address, "- ", "-"),
         new_address = str_replace(new_address, " - ", "-")) %>%
  mutate(full_name = paste(new_ServiceStationName, new_suburb, Postcode, Brand, sep = "|"))
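# (optional) a quick look at how the cleaned-up names compare to the raw ones,
# and what the "|"-separated full_name identifier ends up looking like
unique_stations %>%
  select(ServiceStationName, new_ServiceStationName, new_address, full_name) %>%
  head()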
# At this point, I actually used a nifty "cyclic recoding" technique
# to detect whether there were any duplicates left over after the above corrections...
# There were initially... but not after I filtered for dates >= 2018.
# Nevertheless, you can see how I did this at the following gist:
# https://gist.github.com/Tadge-Analytics/9ce7f6f5333339cd48c1ad4278a08905
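# a much simpler (non-cyclic) check you could run here, just as a sketch --
# any full_name that still maps to more than one raw ServiceStationName/Address
# combination would show up below (not the approach used in the gist above)
unique_stations %>%
  count(full_name) %>%
  filter(n > 1)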
# fuel_prices_by_day <-
first_changes %>%
  left_join(unique_stations %>% select(-contains("new")),
            by = c("ServiceStationName", "Address", "Suburb", "Postcode", "Brand")) %>%
  select(full_name, date, hour_of_day, FuelCode, Price) %>%
  # expand to every station/fuel/date/hour combination so prices can be carried forward
  right_join(
    crossing(full_name = unique(unique_stations$full_name),
             date = seq.Date(min(first_changes$date), max(first_changes$date), 1),
             hour_of_day = 0:23,
             FuelCode = unique(first_changes$FuelCode)),
    by = c("full_name", "date", "hour_of_day", "FuelCode")) %>%
  # fill() carries the last known price forward, so rows must be in chronological order
  arrange(full_name, FuelCode, date, hour_of_day) %>%
  group_by(full_name, FuelCode) %>%
  fill(Price) %>%
  filter(!is.na(Price)) %>%
  group_by(full_name, date, FuelCode) %>%
  summarise(daily_mean_price = mean(Price)) %>%
  spread(FuelCode, daily_mean_price) %>%
  write_csv("fuel_prices_by_day.csv", na = "")
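# (optional sketch) one way to use the output: plot the average daily price across
# all stations for a single fuel type -- "E10" is an assumed column name here,
# swap in whichever FuelCode values actually appear in your data
read_csv("fuel_prices_by_day.csv") %>%
  group_by(date) %>%
  summarise(avg_price = mean(E10, na.rm = TRUE)) %>%
  ggplot(aes(date, avg_price)) +
  geom_line()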
# unique_stations_tidy_names <-
unique_stations %>%
  select(full_name, Postcode, Brand, contains("new")) %>%
  distinct(full_name, .keep_all = TRUE) %>%
  # drop the "new_" prefix from the cleaned-up columns
  rename_at(.vars = vars(contains("new")),
            .funs = ~ sub("new_", "", .)) %>%
  write_csv("unique_stations_neat_parts.csv", na = "")
# let's get the lat/longs
# save yourself some time (and your Google Cloud credits)
# by downloading the lat/longs from here:
# https://www.dropbox.com/s/zf4jcrqm9ayw4cp/googled%20addresses.csv?dl=1
library(ggmap)
register_google(key = "YoUWiLLNeedToUSEUrOWnKey")
# bring in those that have already been searched...
googled_addresses <- read_csv("googled addresses.csv")
# only google those that don't appear in our existing collection
tibble(address = setdiff(unique(unique_stations$new_address), googled_addresses$address)) %>%
  head(5) %>% # a safety measure - comment this line out when running for real
  mutate(lat_long_data = map(address, ~geocode(.x, output = "latlona", source = "google"))) %>%
  unnest(lat_long_data) %>%
  bind_rows(googled_addresses) %>%
  write_csv("googled addresses.csv", na = "")
# valid_googled_addresses <-
read_csv("googled addresses.csv") %>%
  filter(!is.na(address1)) %>%
  # distHaversine() returns metres, so this keeps only stations within ~1,100 km
  # of the median point (dropping obviously bad geocodes)
  mutate(distance_from_median_point = geosphere::distHaversine(cbind(lon, lat), cbind(median(lon), median(lat)))) %>%
  filter(distance_from_median_point < 1.1E6) %>%
  select(-distance_from_median_point) %>%
  distinct(address, .keep_all = TRUE) %>%
  write_csv("valid_googled addresses.csv", na = "")
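# (optional sketch) joining the validated lat/longs back onto the tidy station table --
# this assumes the "address" column written to unique_stations_neat_parts.csv
# (the cleaned new_address) matches the "address" column used for geocoding
# stations_with_coords <-
read_csv("unique_stations_neat_parts.csv") %>%
  left_join(read_csv("valid_googled addresses.csv") %>% select(address, lat, lon),
            by = "address")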