Skip to content

Instantly share code, notes, and snippets.

@jakeybob
Created September 12, 2020 20:27
Show Gist options
  • Save jakeybob/61bc2533e096cd28ad63d0743de137cd to your computer and use it in GitHub Desktop.
Save jakeybob/61bc2533e096cd28ad63d0743de137cd to your computer and use it in GitHub Desktop.
library(tidyverse)
library(stats19)
library(ckanr)
# libs for mapping accident lat/long to datazone for Glasgow deprivation analysis
library(sf)
library(rmapshaper)
# below libs useful for speeding up mapping accidents over water to nearest datazone
library(foreach)
library(doParallel)
registerDoParallel(parallel::detectCores())
### LOOKUPS & RECODES ####
ckanr_setup(url = "https://www.opendata.nhs.scot/")
res_ca_ids <- resource_show(id = "967937c4-8d67-4f39-974f-fd58c4acfda5")
ca_ids <- ckan_fetch(x=res_ca_ids$url)
res_dz_ids <- resource_show(id = "395476ab-0720-4740-be07-ff4467141352")
dz_ids <- ckan_fetch(x=res_dz_ids$url)
res_simd <- resource_show(id = "cadf715a-c365-4dcf-a6e0-acd7e3af21ec")
simd <- ckan_fetch(x=res_simd$url)
res_ca_pop <- resource_show(id = "09ebfefb-33f4-4f6a-8312-2d14e2b02ace")
ca_pop <- ckan_fetch(x=res_ca_pop$url)
res_dz_pop <- resource_show(id = "c505f490-c201-44bd-abd1-1bd7a64285ee")
dz_pop <- ckan_fetch(x=res_dz_pop$url)
area_recodes <- c("Glasgow City" = "Glasgow",
"City of Edinburgh" = "Edinburgh",
"Edinburgh, City of" = "Edinburgh",
"Aberdeen City" = "Aberdeen",
"Dundee City" = "Dundee",
"Newcastle upon Tyne" = "Newcastle",
"Bristol, City of" = "Bristol",
"Great Britain" = "UK")
scot_areas <- c("Scotland", "Glasgow", "Edinburgh", "Aberdeen", "Dundee")
gcr_areas <- c("Scotland", "Glasgow", "Inverclyde", "South Lanarkshire", "North Lanarkshire", "East Dunbartonshire",
"West Dunbartonshire", "East Renfrewshire", "Renfrewshire")
uk_areas <- c("Edinburgh", "Leeds", "Glasgow", "Sheffield", "Newcastle", "Nottingham", "Liverpool", "Birmingham",
"Manchester", "Bristol")
all_areas <- unique(c(scot_areas, gcr_areas, uk_areas, "Scotland", "UK"))
#### GET STATS19 DATA ####
years <- 2014:2017
# download all accident and casualty files to temp directory, to be read in and combined in next step
# note: some years will download as a collection, e.g. downloading 1997 will download a 1979--2004 combined file
# note: only 2014 onward has age of casualty, which we need
# note: at time of writing (Sept 2020) 2017 is latest full year of validated data
for(year in years){
dl_stats19(year = year, type = "Casualties", ask = FALSE)
dl_stats19(year = year, type = "Accidents", ask = FALSE)
}
# combine data from different years, format, and select incidents from our areas of interest only
for(year in years){
if(year == years[1]){df <- tibble()}
df_to_append <- read_casualties(year) %>%
left_join(read_accidents(year) %>% select(accident_index, local_authority_district, latitude, longitude)) %>%
select(age_of_casualty, casualty_type, casualty_severity, local_authority_district, latitude, longitude) %>%
mutate(year = year) %>%
mutate(area = recode(local_authority_district, !!!area_recodes)) %>%
filter(area %in% c(scot_areas, gcr_areas)) %>%
mutate(age_group = case_when(age_of_casualty >= 16 ~ "adult",
age_of_casualty >= 5 & age_of_casualty <= 15 ~ "child",
TRUE ~ "<5")) %>%
mutate(casualty_type = case_when(casualty_type == "Pedestrian" ~ "Pedestrian",
casualty_type == "Cyclist" ~ "Cyclist",
TRUE ~ "Other"))
df <- df %>% bind_rows(df_to_append)
if(year == years[length(years)]){
write_rds(df, "stats19.rds")
rm(df_to_append)}
}
df <- read_rds("stats19.rds")
#### GLASGOW SIMD ANALYSIS ####
# To look at deprivation for Glasgow we need to map each accident's lat/long onto a datazone.
# datazone shapefile
# https://spatialdata.gov.scot/geonetwork/srv/eng/catalog.search;jsessionid=7BFD03DFFDA5CD66CC065C75FBAD2172#/metadata/389787c0-697d-4824-9ca9-9ce8cb79d6f5
scot_map_2011 <- st_read("SG_DataZoneBdry_2011") %>%
st_transform(crs="+proj=longlat +datum=WGS84") %>%
ms_simplify(., drop_null_geometries = TRUE, keep = 1) %>%
select(DataZone, geometry) %>%
rename(datazone_2011 = DataZone)
# dataframe of just Glasgow's accidents
df_gla <- df %>%
filter(area == "Glasgow") %>%
filter(is.na(latitude) == FALSE, is.na(longitude) == FALSE)
# lat/long coords as spatial object points
gla_accident_coords <- st_as_sf(df_gla %>% select(latitude, longitude), coords = c("longitude","latitude"), crs = "+proj=longlat +datum=WGS84")
# construct vector of datazones which the points intersect. Will be one datazone per point (or none ... see next step)
dz_intersects <- st_intersects(gla_accident_coords, scot_map_2011, sparse = TRUE, prepared = TRUE)
dz_intersects <-dz_intersects %>% sapply(function(x) if (length(x) == 0) NA_integer_ else x ) # convert to vector with NAs if point not in map
# add datazone IDs into Glasgow data
df_gla<- df_gla %>%
mutate(datazone_2011 = scot_map_2011[dz_intersects,]$datazone_2011)
# deal with accidents that don't appear in the datazone geographies; these will occur over water or be outwith Glasgow
# Will choose nearest datazone and re-insert these back in, then filter out non-Glasgow datazones
df_gla_missing <- df_gla %>% filter(is.na(datazone_2011) == TRUE)
df_gla <- df_gla %>% filter(is.na(datazone_2011) == FALSE) # remove these from main data
gla_missing_coords <- st_as_sf(df_gla_missing %>% select(latitude, longitude), coords = c("longitude","latitude"), crs = "+proj=longlat +datum=WGS84")
gla_missing_datazones <- foreach(i = 1:dim(gla_missing_coords)[1]) %dopar% {
distances_to_dzs <- st_distance(gla_missing_coords[i,], scot_map_2011$geometry, by_element = TRUE)
scot_map_2011[which(distances_to_dzs == min(distances_to_dzs)),]$datazone_2011
}
df_gla_missing$datazone_2011 <- gla_missing_datazones %>% unlist()
# add those that had missing datazones back in
df_gla <- df_gla %>%
bind_rows(df_gla_missing)
gla_dz_simd <- dz_ids %>%
mutate(area = recode(CAName, !!!area_recodes)) %>%
filter(area == "Glasgow") %>%
select(DataZone) %>%
left_join(simd) %>%
select(DataZone, SIMD2016CountryQuintile) %>%
rename(datazone_2011 = DataZone, simd = SIMD2016CountryQuintile)
df_gla <- df_gla %>% inner_join(gla_dz_simd) # drops those that have non-Glasgow datazones
write_rds(df_gla, "stats19_gla.rds")
df_gla <- read_rds("stats19_gla.rds")
#### PLOTS ####
# Populations lookups for calculating rates per 100k pop
ca_codes <- ca_ids %>%
mutate(area = recode(CAName, !!!area_recodes)) %>%
filter(area %in% c(scot_areas, gcr_areas)) %>%
filter_at(vars(contains("Archived")), ~is.na(.)) %>%
select(CA, area)
ca_populations <- ca_pop %>%
rename(year = Year) %>%
filter(year %in% years) %>%
filter(CA %in% ca_codes$CA,
Sex == "All") %>%
mutate(`age_<5` = select(., Age0:Age4) %>% rowSums(),
age_child = select(., Age5:Age15) %>% rowSums(),
age_adult = select(., Age16:Age90plus) %>% rowSums()) %>%
left_join(ca_codes) %>%
rename(age_all = AllAges) %>%
select(year, area, contains("age_")) %>%
pivot_longer(cols = contains("age_"), values_to = "pop", names_prefix = "age_", names_to = "age_group")
gla_simd_populations <- dz_pop %>%
rename(datazone_2011 = DataZone,
year = Year) %>%
filter(year %in% years,
Sex == "All") %>%
inner_join(gla_dz_simd) %>%
mutate(`age_<5` = select(., Age0:Age4) %>% rowSums(),
age_child = select(., Age5:Age15) %>% rowSums(),
age_adult = select(., Age16:Age90plus) %>% rowSums()) %>%
rename(age_all = AllAges) %>%
select(year, contains("age_"), simd) %>%
pivot_longer(cols = contains("age_"), values_to = "pop", names_prefix = "age_", names_to = "age_group") %>%
group_by(year, simd, age_group) %>%
summarise(pop = sum(pop)) %>%
mutate(area = "Glasgow")
# LA level adult / child casualties rate
df <- read_rds("stats19.rds")
df %>%
group_by(area, year, age_group) %>%
summarise(casualties = n()) %>%
left_join(ca_populations) %>% View()
mutate(rate = 1e5 * casualties / pop) %>%
filter(age_group %in% c("adult", "child")) %>%
ggplot(aes(x = year, y = rate, colour = area)) +
geom_line() + geom_point() +
facet_wrap(~age_group) +
theme_bw()
plotly::ggplotly()
# Glasgow severity
df_gla <- read_rds("stats19_gla.rds")
df_gla %>%
group_by(year) %>%
summarise(casualties_all = n(), area = first(area)) %>%
left_join(
df_gla %>%
filter(casualty_severity != "Slight") %>%
group_by(year) %>%
summarise(casualties_KSI = n()), area = first(area)) %>%
pivot_longer(contains("casualties_"), values_to = "casualties", names_prefix = "casualties_", names_to = "type") %>%
left_join(ca_populations %>% filter(age_group == "all")) %>%
mutate(rate = 1e5 * casualties / pop) %>%
ggplot(aes(x = year, y = rate, colour = type)) +
geom_line() + geom_point() +
theme_bw()
plotly::ggplotly()
# Glasgow by mode
df_gla <- read_rds("stats19_gla.rds")
df_gla %>%
filter(age_group %in% c("adult", "child")) %>%
group_by(year, casualty_type, age_group) %>%
summarise(casualties = n(), area = first(area)) %>%
left_join(ca_populations %>% filter(age_group %in% c("adult", "child"))) %>%
mutate(rate = 1e5 * casualties / pop) %>%
ggplot(aes(x = year, y = rate, colour = casualty_type)) +
geom_line() + geom_point() +
facet_wrap(~age_group) +
theme_bw()
plotly::ggplotly()
# Glasgow by deprivation
df_gla %>%
filter(age_group %in% c("adult", "child")) %>%
group_by(year, simd, age_group) %>%
summarise(casualties = n()) %>%
left_join(gla_simd_populations) %>%
mutate(rate = 1e5 * casualties / pop) %>%
mutate(simd = as.factor(simd)) %>%
ggplot(aes(x = year, y = rate, colour = simd)) +
geom_line() + geom_point() +
facet_wrap(~age_group) +
theme_bw()
plotly::ggplotly()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment