setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
library(tidyverse)
# data can be downloaded from:
# https://www.dropbox.com/s/odg6udrhahsqhiz/SydneyFuelPriceHistory.zip?dl=1
first_changes <- read_csv("data/SydneyFuelPriceHistory.csv") %>%
  # shift each price update to the start of the following hour,
  # rolling hour 24 over to hour 0 of the next day
  mutate(hour_of_day = lubridate::hour(PriceUpdatedDate) + 1,
         date = as.Date(PriceUpdatedDate),
         date = if_else(hour_of_day == 24, date + 1, date),
         hour_of_day = if_else(hour_of_day == 24, 0, hour_of_day)) %>%
  # if there are multiple updates within an hour, keep only the latest one for that hour
  arrange(desc(PriceUpdatedDate)) %>%
  distinct(ServiceStationName, Address, Suburb, Postcode, FuelCode, date, hour_of_day, .keep_all = TRUE) %>%
  select(-PriceUpdatedDate, -RecordID) %>%
  # we only really care about the most recent year...
  # the past is the past, right?
  filter(lubridate::year(date) == 2018)
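# (optional sketch) a quick sanity check on the step above: after the distinct() call
# there should be at most one record per station, fuel type, date and hour
# -- this should return zero rows
first_changes %>%
  count(ServiceStationName, Address, Suburb, Postcode, FuelCode, date, hour_of_day) %>%
  filter(n > 1)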
# we now need to get some unique identifiers for the stations
# turns out the same addresses are sometimes entered differently in our data
unique_stations <- first_changes %>%
  arrange(ServiceStationName) %>%
  distinct(ServiceStationName, Address, Suburb, Postcode, Brand) %>%
  mutate(new_ServiceStationName = trimws(tolower(ServiceStationName)),
         new_ServiceStationName = str_replace(new_ServiceStationName, "  ", " "),  # collapse double spaces
         new_ServiceStationName = str_replace(new_ServiceStationName, " - ", "-"),
         new_ServiceStationName = str_replace(new_ServiceStationName, "- ", "-")) %>%
  mutate(new_suburb = trimws(tolower(Suburb))) %>%
  mutate(new_address = trimws(tolower(Address)),
         new_address = str_replace(new_address, "rd,", "road,"),
         new_address = str_replace(new_address, "st,", "street,"),
         new_address = str_replace(new_address, "hwy,", "highway,"),
         new_address = str_replace(new_address, "  ", " "),  # collapse double spaces
         new_address = str_replace(new_address, "- ", "-"),
         new_address = str_replace(new_address, " - ", "-")) %>%
  mutate(full_name = paste(new_ServiceStationName, new_suburb, Postcode, Brand, sep = "|"))
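# (optional) a quick look at how the cleaned-up names compare to the raw ones,
# and what the "|"-separated full_name identifier ends up looking like
unique_stations %>%
  select(ServiceStationName, new_ServiceStationName, new_address, full_name) %>%
  head()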
# At this point, I actually used a nifty "cyclic recoding" technique
# to detect whether there were any duplicates left over after the above corrections...
# There were initially... but not after I filtered for dates >= 2018.
# Nevertheless, you can see how I did this at the following gist:
# https://gist.github.com/Tadge-Analytics/9ce7f6f5333339cd48c1ad4278a08905
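# a much simpler (non-cyclic) check you could run here, just as a sketch --
# any full_name that still maps to more than one raw ServiceStationName/Address
# combination would show up below (not the approach used in the gist above)
unique_stations %>%
  count(full_name) %>%
  filter(n > 1)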
# fuel_prices_by_day <-
first_changes %>%
  left_join(unique_stations %>% select(-contains("new")),
            by = c("ServiceStationName", "Address", "Suburb", "Postcode", "Brand")) %>%
  select(full_name, date, hour_of_day, FuelCode, Price) %>%
  # expand to every station/fuel/date/hour combination so prices can be carried forward
  right_join(
    crossing(full_name = unique(unique_stations$full_name),
             date = seq.Date(min(first_changes$date), max(first_changes$date), 1),
             hour_of_day = 0:23,
             FuelCode = unique(first_changes$FuelCode)),
    by = c("full_name", "date", "hour_of_day", "FuelCode")) %>%
  # fill() carries the last known price forward, so rows must be in chronological order
  arrange(full_name, FuelCode, date, hour_of_day) %>%
  group_by(full_name, FuelCode) %>%
  fill(Price) %>%
  filter(!is.na(Price)) %>%
  group_by(full_name, date, FuelCode) %>%
  summarise(daily_mean_price = mean(Price)) %>%
  spread(FuelCode, daily_mean_price) %>%
  write_csv("fuel_prices_by_day.csv", na = "")
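# (optional sketch) one way to use the output: plot the average daily price across
# all stations for a single fuel type -- "E10" is an assumed column name here,
# swap in whichever FuelCode values actually appear in your data
read_csv("fuel_prices_by_day.csv") %>%
  group_by(date) %>%
  summarise(avg_price = mean(E10, na.rm = TRUE)) %>%
  ggplot(aes(date, avg_price)) +
  geom_line()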
# unique_stations_tidy_names <-
unique_stations %>%
  select(full_name, Postcode, Brand, contains("new")) %>%
  distinct(full_name, .keep_all = TRUE) %>%
  # drop the "new_" prefix from the cleaned-up columns
  rename_at(.vars = vars(contains("new")),
            .funs = ~ sub("new_", "", .)) %>%
  write_csv("unique_stations_neat_parts.csv", na = "")
# let's get the lat/longs
# save yourself some time (and your Google Cloud credits)
# by downloading the lat/longs from here:
# https://www.dropbox.com/s/zf4jcrqm9ayw4cp/googled%20addresses.csv?dl=1
library(ggmap)
register_google(key = "YoUWiLLNeedToUSEUrOWnKey")
# bring in those that have already been searched...
googled_addresses <- read_csv("googled addresses.csv")
# only google those that don't appear in our existing collection
tibble(address = setdiff(unique(unique_stations$new_address), googled_addresses$address)) %>%
  head(5) %>% # a safety measure - comment this line out when running for real
  mutate(lat_long_data = map(address, ~geocode(.x, output = "latlona", source = "google"))) %>%
  unnest(lat_long_data) %>%
  bind_rows(googled_addresses) %>%
  write_csv("googled addresses.csv", na = "")
# valid_googled_addresses <-
read_csv("googled addresses.csv") %>%
  filter(!is.na(address1)) %>%
  # distHaversine() returns metres, so this keeps only stations within ~1,100 km
  # of the median point (dropping obviously bad geocodes)
  mutate(distance_from_median_point = geosphere::distHaversine(cbind(lon, lat), cbind(median(lon), median(lat)))) %>%
  filter(distance_from_median_point < 1.1E6) %>%
  select(-distance_from_median_point) %>%
  distinct(address, .keep_all = TRUE) %>%
  write_csv("valid_googled addresses.csv", na = "")
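# (optional sketch) joining the validated lat/longs back onto the tidy station table --
# this assumes the "address" column written to unique_stations_neat_parts.csv
# (the cleaned new_address) matches the "address" column used for geocoding
# stations_with_coords <-
read_csv("unique_stations_neat_parts.csv") %>%
  left_join(read_csv("valid_googled addresses.csv") %>% select(address, lat, lon),
            by = "address")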