Last active
September 18, 2024 11:16
-
-
Save chrishanretty/2b67686cc1fdf6661e9eec45d91f4f1e to your computer and use it in GitHub Desktop.
Analysis of Welsh Senedd constituency proposal
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Purpose of this code: | |
### - To generate pairwise average travel times between Westminster constituencies | |
### - To sample contiguous pairings of Westminster constituencies | |
### - To evaluate the travel times between pairings | |
### - To evaluate the compactness of pairings | |
### Inputs | |
### - Travel time data from https://data.ubdc.ac.uk/datasets/public-transport-travel-time-matrices-for-great-britain-ttm-2023 | |
### - Shapefiles from https://geoportal.statistics.gov.uk/datasets/0d698c9712de4afcac9377367d831c1a_0/explore | |
### - lookups from https://geoportal.statistics.gov.uk/ | |
### Outputs | |
### - Sampled proposals | |
### - Map | |
### - Measures | |
### Load libraries | |
library(tidyverse) | |
library(arrow) | |
library(sf) | |
library(here) | |
library(spdep) | |
library(parallel) | |
library(ggrepel) | |
library(igraph) | |
library(combinat) | |
library(future) | |
library(future.apply) | |
library(tictoc) | |
library(redistmetrics) | |
here::i_am("analysis_v2.R") | |
set.seed(2843) | |
num_cores <- parallel::detectCores() - 2 | |
cl <- makeCluster(num_cores) | |
### Set up locations of different objects | |
ttm_path <- "~/Downloads/ttm/LSOA/LSOA records/ttm_pt" | |
oa11_wmin24_path <- "Output_area_(2011)_to_future_Parliamentary_Constituencies_Lookup_in_England_and_Wales.csv" | |
oa11_lsoa11_path <- "Postcode_to_Output_Area_to_Lower_Layer_Super_Output_Area_to_Middle_Layer_Super_Output_Area_to_Local_Authority_District_November_2018_Lookup_in_the_UK.csv" | |
if (!file.exists(oa11_wmin24_path) | !file.exists(oa11_lsoa11_path)) { | |
stop("You need to download the appropriate lookups and put them in this folder") | |
} | |
### locations for different temporary objects | |
pairwise_ttm_path <- here::here("working", "pairwise_ttms.rds") | |
### ############################################################################# | |
### Part 1: travel times | |
### ############################################################################# | |
if (file.exists(pairwise_ttm_path)) { | |
ttm <- readRDS(pairwise_ttm_path) | |
} else { | |
infiles <- list.files(ttm_path, | |
pattern = ".parquet", | |
full.names = TRUE) | |
read_and_filter <- function(file) { | |
read_parquet(file) |> | |
filter(grepl("^W", from_id)) |> | |
filter(grepl("^W", to_id)) |> | |
filter(time_of_day == "am") |> | |
dplyr::select(from_id, | |
to_id, | |
travel_time_p25) | |
} | |
### This takes a couple of minutes | |
ttm <- lapply(infiles, read_and_filter) |> | |
bind_rows() | |
### Amend the ttm to include values over 150 | |
ttm <- ttm |> | |
mutate(travel_time_p25 = coalesce(travel_time_p25, 151)) | |
### We now need to move between LSOA 2011 level and 2024 Westminster constituency level | |
### To do this we need to drop down to OA 2011 level and reaggregate | |
oa11_wmin24 <- read_csv(oa11_wmin24_path, | |
col_select = c(OA11CD, PCON25CD)) |> | |
filter(grepl("W", PCON25CD)) | |
### Assign each OA2011 to an LSOA2011 | |
oa11_lsoa11 <- read_csv(oa11_lsoa11_path, | |
col_select = c(oa11cd, lsoa11cd)) |> | |
distinct() |> | |
filter(grepl("W", oa11cd)) | |
### Join the two data sources | |
oa11_wmin24 <- left_join(oa11_wmin24, | |
oa11_lsoa11, | |
by = join_by(OA11CD == oa11cd)) | |
### Picking the most common Westminster constituency for each LSOA | |
most_common_or_random <- function(x) { | |
names(sort(table(x), decreasing = TRUE))[1] | |
} | |
lsoa11_wmin24 <- oa11_wmin24 |> | |
group_by(lsoa11cd) |> | |
summarize(PCON25CD = most_common_or_random(PCON25CD)) | |
### Add this on to the TTM data | |
ttm <- left_join(ttm, | |
lsoa11_wmin24 |> dplyr::select(lsoa11cd, from_pcon = PCON25CD), | |
by = join_by(from_id == lsoa11cd)) | |
ttm <- left_join(ttm, | |
lsoa11_wmin24 |> dplyr::select(lsoa11cd, to_pcon = PCON25CD), | |
by = join_by(to_id == lsoa11cd)) | |
### We now want to create all pairings of relevant constituencies | |
### These are all the Welsh constituencies, minus the | |
### constituencies which can be paired up on the basis of | |
### contiguity alone | |
constituencies <- lsoa11_wmin24 |> | |
filter(grepl("^W", PCON25CD)) |> | |
pull(PCON25CD) |> | |
unique() | |
combinations <- expand.grid(A = constituencies, | |
B = constituencies, | |
stringsAsFactors = FALSE) |> | |
filter(A <= B) | |
### Now write a function to interrogate the ttm data | |
get_ttm <- function(x, y) { | |
ttm |> | |
filter(from_pcon %in% x) |> | |
filter(to_pcon %in% y) |> | |
pull(travel_time_p25) |> | |
median(na.rm = TRUE) | |
} | |
combinations <- combinations |> | |
mutate(avg_travel_time = map2_dbl(A, B, get_ttm)) | |
### overwrite ttm | |
ttm <- combinations | |
rm(combinations, lsoa11_wmin24, oa11_wmin24) | |
saveRDS(ttm, file = pairwise_ttm_path) | |
} | |
### ############################################################################# | |
### Part 2: generating contiguous pairings | |
### ############################################################################# | |
seat_map <- read_sf("PCON_JULY_2024_UK_BFC.shp") | |
wales_seats <- seat_map %>% | |
filter(str_detect(PCON24CD, "W")) %>% | |
# Drop constituencies that have determined pairs | |
filter(!(PCON24CD %in% c( | |
"W07000112", # Ynys Môn, | |
"W07000083", # Bangor Aberconwy | |
"W07000094", # Clwyd East | |
"W07000095", # Clwyd North | |
"W07000082", # Alyn and Deeside | |
"W07000111", # Wrexham | |
"W07000096", # Dwyfor Meirionnydd | |
"W07000102" # Montgomeryshire and Glyndwr | |
))) %>% | |
mutate(constituency_index = row_number()) | |
# Note there's two other pairings that weren't previously excluded: | |
# Alyn and Deeside can only be paired with Wrexham given Clwyd pairing | |
# Dwyfor Meirionnydd can only be paired with Montgomeryshire and Glyndwr. | |
# At first glance it looks like Dwyfor Meirionnydd could be paired with | |
# Ceredigion Preseli, but there's a river between them with no bridge directly linking the two | |
# (This is apparently captured in the shape file - took me awhile to figure out why they couldn't be paired!) | |
## Calculate contiguity between constituencies | |
# Note There's some disagreement between st_relate() st_touches() around whether all four | |
# of the constituencies that meet at the nexus of Neath and Swansea East, Merthyr Tydfil | |
# and Aberdare, Rhondda and Ogmore, and Aberafan Maesteg touch each other. st_touch() thinks | |
# the diagonally opposite constituencies (e.g. Neath and Swansea East + Rhondda and Ogmore) 'touch' | |
# But st_relate() doesn't. The st_relate() version looks more sensible to me, but could perhaps be more | |
# precise as to what counts as contiguous | |
### This DE-9IM pattern means "The objects do not share any interior | |
### points, but their boundaries intersect" | |
contiguity_matrix <- st_relate(wales_seats, pattern = "F***1****") | |
## Create a graph from the contiguity matrix | |
contiguity_graph <- graph_from_adjacency_matrix(as.matrix(contiguity_matrix), | |
mode = "undirected") | |
## Find all contiguous pairs of constituencies | |
pairs <- as_data_frame(contiguity_graph, what = "edges") %>% as_tibble() | |
# Count number of pairs each constituency appears in and arrange so least common comes at the top | |
# Note - I *think* if you start with the seats with the fewest options you're less likely to end up with | |
# situations where you run out of possible pairings later on, which seems more efficient, but I'm just assuming that | |
pair_count <- bind_rows( | |
pairs %>% count(from) %>% rename("constituency" = "from"), | |
pairs %>% count(to) %>% rename("constituency" = "to")) %>% | |
group_by(constituency) %>% | |
summarise(n = sum(n)) %>% | |
arrange(n) | |
# Function to take a set of constituency pairings, find the remaining pairs that don't contain seats already in the set, | |
# choose the constituency with the fewest potential pairings and create new sets of pairings that add each potential pairing | |
# to the initial set of pairings. | |
add_next_seat <- function(sets, this_set) { | |
# Identify constituencies used in current set | |
constituencies_in_set <- c(sets[[this_set]]$from, sets[[this_set]]$to) %>% | |
unique() | |
# Find pairs that don't contain constituencies already in set | |
candidate_pairs <- pairs %>% | |
filter(!(from %in% constituencies_in_set)) %>% | |
filter(!(to %in% constituencies_in_set)) | |
# Check that there is at least one candidate pair, and if so add all pairings to the initial set | |
if(length(candidate_pairs$from) > 0) { | |
# Count constituency appearances | |
candidate_pair_count <- bind_rows( | |
candidate_pairs %>% count(from) %>% rename("constituency" = "from"), | |
candidate_pairs %>% count(to) %>% rename("constituency" = "to")) %>% | |
group_by(constituency) %>% | |
summarise(n = sum(n)) %>% | |
arrange(n) | |
# Pull out pairs to be added | |
next_pairs <- candidate_pairs %>% | |
filter(from == candidate_pair_count$constituency[1] | to == candidate_pair_count$constituency[1]) | |
# Turn into a list of one row tibbles | |
updating_set <- next_pairs %>% | |
rowwise() %>% | |
group_split() | |
# Add each pair to the initial set, returning a list of updated sets | |
updated_sets <- map(1:length(updating_set), function(this_one) { | |
return( | |
sets[[this_set]] %>% | |
bind_rows(updating_set[[this_one]]) | |
) | |
}) | |
return(updated_sets) | |
} | |
} | |
# Create starting set of seats, choosing the constituency with the fewest pairs (with ties broken however arrange() | |
# does it by default) | |
working_set <- pairs %>% | |
filter(from == pair_count$constituency[1] | to == pair_count$constituency[1]) %>% | |
rowwise() %>% | |
group_split() | |
# Then take the previous set of pairs, and map add_next_seat() over all of them, creating a new sets of pairings | |
# Then repeat until you've done all 12 seats (+ the 4 pre-determined ones) | |
current_seats <- 1 | |
while(current_seats < 12) { | |
working_set <- map(1:length(working_set), | |
add_next_seat, | |
sets = working_set) %>% | |
flatten() | |
current_seats <- current_seats + 1 | |
} | |
# Check for duplicates | |
# Function to remove duplicate sets | |
remove_duplicate_sets <- function(set_to_check) { | |
#Sort sets | |
sorted_sets <- future_lapply(set_to_check, function(tbl) { | |
tbl %>% arrange(from, to) | |
}) | |
# Filter out duplicates | |
unique_sets <- sorted_sets %>% unique() | |
return(unique_sets) | |
} | |
# Apparently there aren't any duplicates, but worth checking anyway! | |
all_potential_pairings <- remove_duplicate_sets(working_set) | |
# Add identifiers etc | |
constituency_codes <- wales_seats %>% | |
as_tibble() %>% | |
select(constituency_index, PCON24CD) | |
pre_set_pairs <- tibble( | |
PCON24CD = c("W07000112", # Ynys Môn, | |
"W07000083", # Bangor Aberconwy | |
"W07000094", # Clwyd East | |
"W07000095", # Clwyd North | |
"W07000082", # Alyn and Deeside | |
"W07000111", # Wrexham | |
"W07000096", # Dwyfor Meirionnydd | |
"W07000102"), # Montgomeryshire and Glyndwr | |
pair = c(1, 1, 2, 2, 3, 3, 4, 4)) | |
all_pairs <- lapply(all_potential_pairings, function(set) { | |
pre_set_pairs %>% | |
bind_rows( | |
set %>% | |
mutate(pair = 5:16) %>% | |
pivot_longer(-pair, values_to = "constituency_index") %>% | |
left_join(constituency_codes, by = join_by(constituency_index))) %>% | |
select(PCON24CD, pair) | |
}) | |
### Got to transform this into the way we expect other stuff to be listed | |
### This is a list of matrices, where the matrix is sorted alphabetical along the rows, then alphabetically along the first column. | |
all_pairs.bak <- all_pairs | |
all_pairs <- lapply(all_pairs, function(x) { | |
x <- x |> | |
group_by(pair) |> | |
reframe(A = PCON24CD[1], B = PCON24CD[2]) |> | |
as.matrix() | |
out <- x[,c("A", "B")] | |
### Sort them across the rows | |
out <- t(apply(out, 1, sort)) | |
### Sort them by order on the first column | |
out <- out[order(out[,1]),] | |
}) | |
dat <- tibble(id = 1:length(all_pairs), pairing = all_pairs) | |
### ############################################################################# | |
### Part 3: evaluating pairings by travel time | |
### ############################################################################# | |
avg_ttm <- function(pairing) { | |
as.data.frame(pairing) |> | |
left_join(ttm, by = join_by(V1 == A, V2 == B)) |> | |
pull(avg_travel_time) |> | |
median(na.rm = TRUE) | |
} | |
dat <- dat |> | |
mutate(avg_travel_time = map_dbl(pairing, avg_ttm)) | |
### ############################################################################# | |
### Part 4: evaluating pairings by compactness | |
### ############################################################################# | |
get_compactness <- function(pairing) { | |
require(tidyverse) | |
require(sf) | |
require(redistmetrics) | |
local <- pairing |> | |
as.data.frame() |> | |
mutate(grp = letters[seq_len(n())]) | |
local_map <- left_join(wales_seats, | |
local, | |
by = join_by(PCON24CD == V1)) | |
local_map <- left_join(local_map, | |
local, | |
by = join_by(PCON24CD == V2), | |
suffix = c(".v1", ".v2")) | |
local_map <- local_map |> | |
mutate(grp = ifelse(is.na(grp.v1), | |
grp.v2, | |
grp.v1)) | |
local_map <- local_map |> | |
mutate(grp = factor(grp), | |
grp_num = as.numeric(grp)) | |
mean(comp_polsby(plans = local_map$grp_num, shp = local_map)) | |
} | |
clusterExport(cl, c("wales_seats")) | |
dat$compactness <- parSapply(cl, dat$pairing, get_compactness) | |
saveRDS(dat, file = "output.rds") | |
### ############################################################################# | |
### Part 5: plotting things | |
### ############################################################################# | |
### Work out what the boundary commission proposal is | |
### Let's work out the Boundary Commission proposal | |
bc <- matrix(c( | |
## Alyn, Deeside and Wrexham | |
"W07000082", "W07000111", | |
## Dwford Meirionnydd, Montgomeryshire and Glyndwr | |
"W07000096", "W07000102", | |
## Ceredigion and Pembrokeshire | |
"W07000093", "W07000100", | |
## Carmarthenshire | |
"W07000087", "W07000098", | |
## Swansea West and Gower | |
"W07000108", "W07000097", | |
## Brecon, Radnor, Neath and Swansea East | |
"W07000085", "W07000103", | |
## Aberafan Maesteg, Rhondda, Ogmore | |
"W07000081", "W07000107", | |
## Merthyr Tydfil, Aberdare and Pontypridd | |
"W07000099", "W07000106", | |
## Blaneau Gwent, Rhymney and Caerphilly | |
"W07000084", "W07000088", | |
## Monmouthshire and Torfaen | |
"W07000101", "W07000109", | |
## Newport and Islwyn | |
"W07000104", "W07000105", | |
## Cardiff East and North | |
"W07000089", "W07000090", | |
## Cardiff West, South and Penarth | |
"W07000092", "W07000091", | |
## Vale of Glamorgan and Bridgend | |
"W07000110", "W07000086"), | |
nrow = 14, | |
ncol = 2, | |
byrow = TRUE) | |
### Sort it the same way the other things have been sorted | |
### Sort them across the rows | |
bc <- t(apply(bc, 1, sort)) | |
### Sort them by order on the first column | |
bc <- bc[order(bc[,1]),] | |
dat$is_bc <- sapply(dat$pairing, function(x)identical(x, bc)) | |
ggplot(dat, aes(x = compactness, y = avg_travel_time, colour = is_bc)) + | |
geom_point() + | |
scale_x_continuous("Compactness\n(Polsby-Propper metric, bigger is better)") + | |
scale_y_continuous("Average travel time\n(minutes, smaller is better)") + | |
scale_colour_manual("", | |
guide = "none", | |
values = c("darkgrey", "red")) + | |
theme_bw() | |
### ############################################################################# | |
### Part 6: mapping things | |
### ############################################################################# | |
report_pairing <- function(pairing) { | |
local <- pairing |> | |
as.data.frame() |> | |
mutate(grp = letters[seq_len(n())]) | |
lu <- wales_seats |> | |
st_drop_geometry() |> | |
dplyr::select(PCON24CD, PCON24NM) | |
local <- left_join(local, | |
lu, | |
by = join_by(V1 == PCON24CD)) | |
local <- left_join(local, | |
lu, | |
by = join_by(V2 == PCON24CD), | |
suffix = c(".v1", ".v2")) | |
local <- local |> | |
rowwise() |> | |
mutate(combined_name = ifelse(PCON24NM.v1 < PCON24NM.v2, | |
paste(PCON24NM.v1,PCON24NM.v2, sep = "; "), | |
paste(PCON24NM.v2,PCON24NM.v1, sep = "; "))) |> | |
ungroup() |> | |
arrange(combined_name) |> | |
pull(combined_name) | |
### Add on the two known pairings | |
knowns <- c("Bangor Aberconwy Ynys Môn", | |
"Clwyd") | |
local <- c(local, knowns) | |
local <- sort(local) | |
local <- local |> | |
paste0(collapse = "\n") | |
return(local) | |
} | |
plot_pairing <- function(pairing) { | |
local <- pairing |> | |
as.data.frame() |> | |
mutate(grp = letters[seq_len(n())]) | |
local_map <- left_join(wales_seats, | |
local, | |
by = join_by(PCON24CD == V1)) | |
local_map <- left_join(local_map, | |
local, | |
by = join_by(PCON24CD == V2), | |
suffix = c(".v1", ".v2")) | |
local_map <- local_map |> | |
mutate(grp = ifelse(is.na(grp.v1), | |
grp.v2, | |
grp.v1)) | |
if (FALSE) { | |
ggplot() + | |
geom_sf(data = local_map, aes(fill = grp)) + | |
ggrepel::geom_label_repel( | |
data = local_map, | |
aes(label = PCON24NM, geometry = geometry), | |
stat = "sf_coordinates", | |
size = 1.5 | |
) + | |
scale_fill_discrete(guide = "none") + | |
theme_minimal() | |
} else { | |
ggplot() + | |
geom_sf(data = local_map, aes(fill = grp)) + | |
geom_sf_label( | |
data = local_map, | |
aes(label = str_wrap(PCON24NM, 20)), | |
size = 1.5 | |
) + | |
scale_fill_discrete(guide = "none") + | |
theme_minimal() | |
} | |
} | |
### Report details of the most compact mix (which also happens to do well on travel time) | |
### | |
best <- which.max(dat$compactness) | |
plot_pairing(dat$pairing[[best]]) | |
cat(report_pairing(dat$pairing[[best]])) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment