# first, run the gist at:
# https://gist.github.com/Tadge-Analytics/63f75895f0b2e76dae60c1bd5ce8fec7
library(tidyverse)
vcglr_data <- readRDS("rds files/vcglr_data.rds")
# same location for the start and end
# note: no Licence Num in the VCGLR data is 1 or 2, so these placeholder IDs are safe to use
# allow 15 minutes (0.25 hours) at the start and end locations
start_and_end_locations <- tibble(`Licence Num` = c(1L, 2L),
                                  Latitude = c(-37.8108, -37.8108),
                                  Longitude = c(144.9653, 144.9653),
                                  ideal_time_hrs = 0.25)
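# optional sanity check: confirm the placeholder IDs 1 and 2 really don't
# appear as genuine licence numbers before we bind these rows in further down
stopifnot(!any(vcglr_data$`Licence Num` %in% c(1, 2)))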
# we only want a sample... and, for aesthetic reasons,
# let's take equal portions of near and not-so-near venues (measured from QV), dropping the far ones
set.seed(123)
sample_of_sites <- vcglr_data %>%
  mutate(distance_m = geosphere::distHaversine(cbind(Longitude, Latitude),
                                               cbind(start_and_end_locations[[1, "Longitude"]],
                                                     start_and_end_locations[[1, "Latitude"]])),
         dist_groupings = cut(distance_m, 3, labels = c("low", "med", "high"))) %>%
  filter(dist_groupings != "high") %>%
  group_by(dist_groupings) %>%
  sample_n(15) %>%
  pull(`Licence Num`)
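# optional: see how many venues fall into each distance band before the
# sampling (re-deriving dist_groupings, since the pipe above doesn't keep it)
vcglr_data %>%
  mutate(distance_m = geosphere::distHaversine(cbind(Longitude, Latitude),
                                               cbind(start_and_end_locations[[1, "Longitude"]],
                                                     start_and_end_locations[[1, "Latitude"]]))) %>%
  count(dist_groupings = cut(distance_m, 3, labels = c("low", "med", "high")))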
# let's prepare the useful info for our sampled sites
selected_sites_details <- vcglr_data %>%
  select(`Licence Num`, Latitude, Longitude) %>%
  filter(`Licence Num` %in% sample_of_sites) %>%
  # let's give some fake ideal times for spending at these destinations
  # and some fake popularity and fake costs, too
  mutate(ideal_time_hrs = sample(seq(1, 4, 0.5), n(), replace = T),
         dest_popularity = runif(n(), 1, 5),
         dest_cost = runif(n(), 25, 150)) %>%
  bind_rows(start_and_end_locations)
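# quick check: 15 venues from each of the two retained distance bands, plus the
# two start/end rows, should give 32 rows here
nrow(selected_sites_details)   # expect 32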
pre_dest_combos <- selected_sites_details %>%
  select(`Licence Num`, Latitude, Longitude) %>%
  mutate(location = paste(Latitude, Longitude, sep = "+"))
dest_combos <- pre_dest_combos %>%
  select(-`Licence Num`) %>%
  # cross join the locations with themselves, to get every ordered pair of destinations
  mutate(data = list(rename_all(., function(x) paste0(x, "1")))) %>%
  unnest(data) %>%
  filter(location != location1)
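# quick check: 30 sampled venues plus the shared start/end coordinates give 31
# distinct location strings, i.e. 31 * 30 / 2 = 465 unique undirected trips to
# google (which is where the "465 rows" estimate further down comes from)
n_distinct(dest_combos$location)                   # expect 31
nrow(distinct(dest_combos, location, location1))   # expect 31 * 30 = 930 ordered pairs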
# let's get the travel times (rather than distances) between all of these...
# using the Google Maps API
library(gmapsdistance)
gmapsdistance::set.api.key("YoUWiLLNeedToUSEUrOWnKey")
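# optional one-off test that the key works, and a peek at the return shape:
# gmapsdistance() returns a list of Time (in seconds), Distance and Status,
# which is why first() is used below to pull out just the Time element
# (the destination here is just an arbitrary example coordinate pair)
gmapsdistance(origin = "-37.8108+144.9653",
              destination = "-37.8183+144.9671",
              mode = "driving")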
# let's get onto googling these travel times!
# but we don't need to find the travel time for the same trip in both directions
# let's also filter out the travel times we've already googled on previous occasions
googled_times <- read_rds("rds files/travel_times.rds")
unique_trips <- dest_combos %>%
  distinct(location, location1, .keep_all = T) %>%
  # let's filter out the travel times we've already googled
  anti_join(googled_times, by = c("location" = "1", "location1" = "2")) %>%
  # let's remove those "same trip, both ways" duplicates
  # (sort the two endpoints of each trip, so that A->B and B->A become identical rows)
  rowid_to_column() %>%
  select(rowid, location, location1) %>%
  gather(key, value, -rowid) %>%
  arrange(value) %>%
  select(-key) %>%
  group_by(rowid) %>%
  mutate(rank = row_number()) %>%
  spread(rank, value) %>%
  ungroup() %>%
  distinct(`1`, `2`, .keep_all = T)
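# minimal illustration of the endpoint-sorting trick above: A->B and B->A
# both end up as `1` = "A", `2` = "B", so distinct() keeps just one of them
tibble(rowid = 1:2, location = c("A", "B"), location1 = c("B", "A")) %>%
  gather(key, value, -rowid) %>%
  arrange(value) %>%
  select(-key) %>%
  group_by(rowid) %>%
  mutate(rank = row_number()) %>%
  spread(rank, value) %>%
  ungroup() %>%
  distinct(`1`, `2`)   # one row: A, B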
unique_trips %>%
  head(5) %>% # this is a safety measure
  mutate(travel_time = map2_dbl(`1`, `2`,
                                ~gmapsdistance(origin = .x,
                                               destination = .y,
                                               mode = "driving") %>%
                                  first())) %>% # first element of the result is the travel time, in seconds
  # add each trip in the reverse direction too, so later lookups work either way around
  bind_rows(rename(., `2` = `1`,
                   `1` = `2`)) %>%
  select(-rowid) %>%
  bind_rows(googled_times) %>%
  write_rds("rds files/travel_times.rds")
# 10 rows took 5.93 seconds
# therefore 465 rows will take (465 / 10) * 5.93 s, i.e. roughly 4.6 minutes (plus the API cost $$$?)
googled_times <- read_rds("rds files/travel_times.rds")
all_trips_with_travel_times <- dest_combos %>%
  inner_join(googled_times, by = c("location" = "1", "location1" = "2"))
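# optional check: every ordered pair of destinations should now have a travel
# time; a positive number here means some trips still haven't been googled and cached
nrow(dest_combos) - nrow(all_trips_with_travel_times)   # expect 0 once the cache is complete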
# for each unique combination of destinations...
# what is the travel path that will be taken?
# this is purely so that the map displayed will be accurate... instead of just a straight line...
# this could be a groovy set action... so that we see the full path when we click on a path...
library(googleway)
set_key("YoUWiLLNeedToUSEUrOWnKey")
googled_paths <- read_rds("rds files/googled_paths.rds")
unique_trip_paths <- dest_combos %>%
  anti_join(googled_paths, by = c("location" = "1", "location1" = "2"))
# x <-
unique_trip_paths %>%
  # head(5) %>% # safety measure
  rename(`1` = location,
         `2` = location1) %>%
  # keep just the turn-by-turn steps of the first route google suggests
  mutate(some_list_col = map2(`1`, `2`, ~google_directions(origin = .x,
                                                           destination = .y,
                                                           mode = "driving") %>%
                                .$routes %>%
                                .$legs %>%
                                .[[1]] %>%
                                .$steps %>%
                                .[[1]])) %>%
  bind_rows(googled_paths) %>%
  write_rds("rds files/googled_paths.rds")
# how long does this one take???
googled_paths <- read_rds("rds files/googled_paths.rds") %>%
  mutate(path_id = row_number(),
         # flatten each trip's nested step data frames into ordinary columns
         some_list_col = map(some_list_col, ~do.call(data.frame, c(.x, stringsAsFactors = FALSE)))) %>%
  unnest(cols = c(some_list_col)) %>%
  select(-distance.text, -distance.value, -duration.text, -duration.value, -html_instructions, -points, -travel_mode, -maneuver) %>%
  group_by(path_id) %>%
  mutate(leg_id = row_number())
zero_leg <- googled_paths %>%
  distinct(path_id, .keep_all = T) %>% # need to add starting positions and ending positions to the path's collection of legs
  mutate(leg_id = 0,
         end_location.lat = start_location.lat,
         end_location.lng = start_location.lng,
         start_location.lat = Latitude,
         start_location.lng = Longitude)
last_leg <- googled_paths %>%
  arrange(desc(leg_id)) %>%
  distinct(path_id, .keep_all = T) %>% # need to add starting positions and ending positions to the path's collection of legs
  mutate(leg_id = leg_id + 1,
         start_location.lat = end_location.lat,
         start_location.lng = end_location.lng,
         end_location.lat = Latitude1,
         end_location.lng = Longitude1)
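# (zero_leg and last_leg stitch the exact venue coordinates onto either end of
# each googled route, so the drawn path starts and finishes at the venue itself
# rather than at the first/last point google returns on the road network)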
googled_paths %>%
  bind_rows(zero_leg,
            last_leg) %>%
  ungroup() %>%
  # bring the Licence Num back in for both ends of each path
  inner_join(
    pre_dest_combos %>% select(`Licence Num`, location),
    by = c("1" = "location")) %>%
  inner_join(
    pre_dest_combos %>% select(`Licence Num`, location),
    by = c("2" = "location")) %>%
  select(`Licence Num.x`, `Licence Num.y`, leg_id, start_location.lat, start_location.lng, end_location.lat, end_location.lng) %>%
  write_csv("output csvs/googled_paths.csv", na = "")
# from here?
# let's work out all possible combinations of destinations, for trips with N number of stops
# i <- 1
for (i in 1:4) {
  # let's create a list of possible tours
  number_of_stops <- i
  tictoc::tic()
  all_combos <- as_tibble(
    expand.grid(rep(list(sample_of_sites), number_of_stops))) %>%
    mutate(tour_id = row_number()) %>%
    gather(key, value, -tour_id) %>%
    mutate(key = as.integer(str_remove(key, "Var"))) %>%
    group_by(tour_id) %>%
    filter(n_distinct(value) == number_of_stops) %>% # only keep the tours with unique stops (no point going to the same place twice in the one tour, right?)
    bind_rows(., distinct(., tour_id) %>% # add in the start and end locations
                crossing(tibble(key = c(0L, number_of_stops + 1L),
                                value = c(1L, 2L)))) %>%
    arrange(key) %>%
    mutate(prev_dest = lag(value)) %>%
    left_join(selected_sites_details, by = c("value" = "Licence Num")) %>%
    left_join(selected_sites_details %>% select(-ideal_time_hrs, -dest_cost, -dest_popularity), by = c("prev_dest" = "Licence Num")) %>%
    left_join(all_trips_with_travel_times %>%
                distinct() %>%
                select(-location, -location1), by = c("Latitude.x" = "Latitude",
                                                      "Longitude.x" = "Longitude",
                                                      "Latitude.y" = "Latitude1",
                                                      "Longitude.y" = "Longitude1"))
  print(tictoc::toc())
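  # for context: expand.grid generates length(sample_of_sites)^number_of_stops
  # candidate tours before the unique-stops filter (30^4 = 810000 when i is 4),
  # so the time reported above grows quickly with the number of stops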
  tours_with_calc <- all_combos %>%
    group_by(tour_id) %>%
    summarise(time_at_dests = sum(ideal_time_hrs),
              travel_time = sum(travel_time, na.rm = T) / 3600, # travel times are in seconds, so convert to hours
              total_tour_time = time_at_dests + travel_time,
              transit_percentage = travel_time / total_tour_time,
              total_cost = sum(dest_cost, na.rm = T),
              avg_popularity = mean(dest_popularity, na.rm = T))
  tours_with_calc %>%
    ggplot() +
    aes(total_tour_time) +
    geom_histogram()
  # do any viable tours contain the very same stops?
  # keep only the one with the shortest total travel time.
  fastest_unique <- all_combos %>%
    select(tour_id, value) %>%
    arrange(value) %>%
    group_by(tour_id) %>%
    mutate(arranged_order = row_number()) %>%
    spread(arranged_order, value) %>%
    group_by_at(vars(-tour_id)) %>%
    left_join(tours_with_calc %>% select(tour_id, total_tour_time), by = "tour_id") %>%
    arrange(total_tour_time) %>%
    group_by_at(vars(-tour_id, -total_tour_time, -`1`, -`2`)) %>% # only keep the tour with the least time (or, if they're the same, keep the first)
    slice(1)
  # which are the viable tours?
  viable_tours <- tours_with_calc %>%
    semi_join(fastest_unique, by = "tour_id") %>%
    # bring these filters back...
    # after you've fixed the destinations sampling to avoid such far away places...
    # filter(total_tour_time < 9) %>%
    # filter(total_tour_time > 2.5) %>%
    # filter(transit_percentage < 0.3) %>%
    mutate(number_of_stops = i)
  # ideally you would also add in a "contains these types" column...
  # but this might be better done at the dashboard stage
  filtered_stops <- all_combos %>%
    semi_join(viable_tours, by = "tour_id") %>%
    select(-contains(".y"), -prev_dest) %>%
    rename(Latitude = Latitude.x,
           Longitude = Longitude.x,
           stop_number = key,
           dest_id = value) %>%
    left_join(vcglr_data %>% mutate(`Licence Num` = as.integer(`Licence Num`)) %>% select(`Licence Num`, `Trading As`, Category), by = c("dest_id" = "Licence Num"))
  # save files to disk
  write_csv(viable_tours, paste0("output csvs/viable_tours_", i, ".csv"), na = "")
  write_csv(filtered_stops, paste0("output csvs/filtered_stops_", i, ".csv"), na = "")
  rm(viable_tours, filtered_stops)
}