jakeybob/stats19_analysis.R

## stats19_analysis.R
library(tidyverse)
library(stats19)
library(ckanr)
# libs for mapping accident lat/long to datazone for Glasgow deprivation analysis
library(sf)
library(rmapshaper)
# below libs useful for speeding up mapping accidents over water to nearest datazone
library(foreach)
library(doParallel)
registerDoParallel(parallel::detectCores())

### LOOKUPS & RECODES ####
ckanr_setup(url = "https://www.opendata.nhs.scot/")
res_ca_ids <- resource_show(id = "967937c4-8d67-4f39-974f-fd58c4acfda5")
ca_ids <- ckan_fetch(x=res_ca_ids$url)
res_dz_ids <- resource_show(id = "395476ab-0720-4740-be07-ff4467141352")
dz_ids <- ckan_fetch(x=res_dz_ids$url)
res_simd <- resource_show(id = "cadf715a-c365-4dcf-a6e0-acd7e3af21ec")
simd <- ckan_fetch(x=res_simd$url)
res_ca_pop <- resource_show(id = "09ebfefb-33f4-4f6a-8312-2d14e2b02ace")
ca_pop <- ckan_fetch(x=res_ca_pop$url)
res_dz_pop <- resource_show(id = "c505f490-c201-44bd-abd1-1bd7a64285ee")
dz_pop <- ckan_fetch(x=res_dz_pop$url)

area_recodes <- c("Glasgow City" = "Glasgow",
                  "City of Edinburgh" = "Edinburgh",
                  "Edinburgh, City of" = "Edinburgh",
                  "Aberdeen City" = "Aberdeen",
                  "Dundee City" = "Dundee",
                  "Newcastle upon Tyne" = "Newcastle",
                  "Bristol, City of" = "Bristol",
                  "Great Britain" = "UK")

scot_areas <- c("Scotland", "Glasgow", "Edinburgh", "Aberdeen", "Dundee")

gcr_areas <- c("Scotland", "Glasgow", "Inverclyde", "South Lanarkshire", "North Lanarkshire", "East Dunbartonshire",
               "West Dunbartonshire", "East Renfrewshire", "Renfrewshire")

uk_areas <- c("Edinburgh", "Leeds", "Glasgow", "Sheffield", "Newcastle", "Nottingham", "Liverpool", "Birmingham",
              "Manchester", "Bristol")
all_areas <- unique(c(scot_areas, gcr_areas, uk_areas, "Scotland", "UK"))


#### GET STATS19 DATA ####

years <- 2014:2017
# download all accident and casualty files to temp directory, to be read in and combined in next step
# note: some years will download as a collection, e.g. downloading 1997 will download a 1979--2004 combined file
# note: only 2014 onward has age of casualty, which we need
# note: at time of writing (Sept 2020) 2017 is latest full year of validated data
for(year in years){
  dl_stats19(year = year, type = "Casualties", ask = FALSE)
  dl_stats19(year = year, type = "Accidents", ask = FALSE)
}

# combine data from different years, format, and select incidents from our areas  of interest only
for(year in years){
  if(year == years[1]){df <- tibble()}

  df_to_append <- read_casualties(year) %>%
    left_join(read_accidents(year) %>% select(accident_index, local_authority_district, latitude, longitude)) %>%
    select(age_of_casualty, casualty_type, casualty_severity, local_authority_district, latitude, longitude) %>%
    mutate(year = year) %>%
    mutate(area = recode(local_authority_district, !!!area_recodes)) %>%
    filter(area %in% c(scot_areas, gcr_areas)) %>%
    mutate(age_group = case_when(age_of_casualty >= 16 ~ "adult",
                                 age_of_casualty >= 5 & age_of_casualty <= 15 ~ "child",
                                 TRUE ~ "<5")) %>%
    mutate(casualty_type = case_when(casualty_type == "Pedestrian" ~ "Pedestrian",
                                     casualty_type == "Cyclist" ~ "Cyclist",
                                     TRUE ~ "Other"))

  df <- df %>% bind_rows(df_to_append)
  if(year ==  years[length(years)]){
    write_rds(df, "stats19.rds")
    rm(df_to_append)}
}

df <- read_rds("stats19.rds")


#### GLASGOW SIMD ANALYSIS ####
# To look at deprivation for Glasgow we need to map each accident's lat/long onto a datazone.

# datazone shapefile
# https://spatialdata.gov.scot/geonetwork/srv/eng/catalog.search;jsessionid=7BFD03DFFDA5CD66CC065C75FBAD2172#/metadata/389787c0-697d-4824-9ca9-9ce8cb79d6f5

scot_map_2011 <- st_read("SG_DataZoneBdry_2011") %>%
  st_transform(crs="+proj=longlat +datum=WGS84") %>%
  ms_simplify(., drop_null_geometries = TRUE, keep = 1) %>%
  select(DataZone, geometry) %>%
  rename(datazone_2011 = DataZone)

# dataframe of just Glasgow's accidents
df_gla <- df %>%
  filter(area == "Glasgow") %>%
  filter(is.na(latitude) == FALSE, is.na(longitude) == FALSE)

# lat/long coords as spatial object points
gla_accident_coords <- st_as_sf(df_gla %>% select(latitude, longitude), coords = c("longitude","latitude"), crs = "+proj=longlat +datum=WGS84")

# construct vector of datazones which the points intersect. Will be one datazone per point (or none ... see next step)
dz_intersects <- st_intersects(gla_accident_coords, scot_map_2011, sparse = TRUE, prepared = TRUE)
dz_intersects <-dz_intersects %>% sapply(function(x) if (length(x) == 0) NA_integer_ else x ) # convert to vector with NAs if point not in map

# add datazone IDs into Glasgow data
df_gla<- df_gla %>%
  mutate(datazone_2011 = scot_map_2011[dz_intersects,]$datazone_2011)

# deal with accidents that don't appear in the datazone geographies; these will occur over water or be outwith Glasgow
# Will choose nearest datazone and re-insert these back in, then filter out non-Glasgow datazones
df_gla_missing <- df_gla %>% filter(is.na(datazone_2011) == TRUE)
df_gla <- df_gla %>% filter(is.na(datazone_2011) == FALSE) # remove these from main data

gla_missing_coords <- st_as_sf(df_gla_missing %>% select(latitude, longitude), coords = c("longitude","latitude"), crs = "+proj=longlat +datum=WGS84")

gla_missing_datazones <- foreach(i = 1:dim(gla_missing_coords)[1]) %dopar% {
  distances_to_dzs <- st_distance(gla_missing_coords[i,], scot_map_2011$geometry, by_element = TRUE)
  scot_map_2011[which(distances_to_dzs == min(distances_to_dzs)),]$datazone_2011
}

df_gla_missing$datazone_2011 <- gla_missing_datazones %>% unlist()

# add those that had missing datazones back in
df_gla <- df_gla %>%
  bind_rows(df_gla_missing)

gla_dz_simd <- dz_ids %>%
  mutate(area = recode(CAName, !!!area_recodes)) %>%
  filter(area == "Glasgow") %>%
  select(DataZone) %>%
  left_join(simd) %>%
  select(DataZone, SIMD2016CountryQuintile) %>%
  rename(datazone_2011 = DataZone, simd = SIMD2016CountryQuintile)

df_gla <- df_gla %>% inner_join(gla_dz_simd) # drops those that have non-Glasgow datazones

write_rds(df_gla, "stats19_gla.rds")
df_gla <- read_rds("stats19_gla.rds")


#### PLOTS ####

# Populations lookups for calculating rates per 100k pop

ca_codes <- ca_ids %>%
  mutate(area = recode(CAName, !!!area_recodes)) %>%
  filter(area %in% c(scot_areas, gcr_areas)) %>%
  filter_at(vars(contains("Archived")), ~is.na(.)) %>%
  select(CA, area)

ca_populations <- ca_pop %>%
  rename(year = Year) %>%
  filter(year %in% years) %>%
  filter(CA %in% ca_codes$CA,
         Sex == "All") %>%
  mutate(`age_<5` = select(., Age0:Age4) %>% rowSums(),
         age_child = select(., Age5:Age15) %>% rowSums(),
         age_adult = select(., Age16:Age90plus) %>% rowSums()) %>%
  left_join(ca_codes) %>%
  rename(age_all = AllAges) %>%
  select(year, area, contains("age_")) %>%
  pivot_longer(cols = contains("age_"), values_to = "pop", names_prefix = "age_", names_to = "age_group")

gla_simd_populations <- dz_pop %>%
  rename(datazone_2011 = DataZone,
         year = Year) %>%
  filter(year %in% years,
         Sex == "All") %>%
  inner_join(gla_dz_simd) %>%
  mutate(`age_<5` = select(., Age0:Age4) %>% rowSums(),
         age_child = select(., Age5:Age15) %>% rowSums(),
         age_adult = select(., Age16:Age90plus) %>% rowSums()) %>%
  rename(age_all = AllAges) %>%
  select(year, contains("age_"), simd) %>%
  pivot_longer(cols = contains("age_"), values_to = "pop", names_prefix = "age_", names_to = "age_group") %>%
  group_by(year, simd, age_group) %>%
  summarise(pop = sum(pop)) %>%
  mutate(area = "Glasgow")

# LA level adult / child casualties rate
df <- read_rds("stats19.rds")

df %>%
  group_by(area, year, age_group) %>%
  summarise(casualties = n()) %>%
  left_join(ca_populations) %>% View()
  mutate(rate = 1e5 * casualties / pop) %>%
  filter(age_group %in% c("adult", "child")) %>%
  ggplot(aes(x = year, y = rate, colour = area)) +
  geom_line() + geom_point() +
  facet_wrap(~age_group) +
  theme_bw()
plotly::ggplotly()

# Glasgow severity
df_gla <- read_rds("stats19_gla.rds")

df_gla %>%
  group_by(year) %>%
  summarise(casualties_all = n(), area = first(area)) %>%
  left_join(
    df_gla %>%
      filter(casualty_severity != "Slight") %>%
      group_by(year) %>%
      summarise(casualties_KSI = n()), area = first(area)) %>%
  pivot_longer(contains("casualties_"), values_to = "casualties", names_prefix = "casualties_", names_to = "type") %>%
  left_join(ca_populations %>% filter(age_group == "all")) %>%
  mutate(rate = 1e5 * casualties / pop) %>%
  ggplot(aes(x = year, y = rate, colour = type)) +
  geom_line() + geom_point() +
  theme_bw()
plotly::ggplotly()

# Glasgow by mode
df_gla <- read_rds("stats19_gla.rds")

df_gla %>%
  filter(age_group %in% c("adult", "child")) %>%
  group_by(year, casualty_type, age_group) %>%
  summarise(casualties = n(), area = first(area)) %>%
  left_join(ca_populations %>% filter(age_group %in% c("adult", "child"))) %>%
  mutate(rate = 1e5 * casualties / pop) %>%
  ggplot(aes(x = year, y = rate, colour = casualty_type)) +
  geom_line() + geom_point() +
  facet_wrap(~age_group) +
  theme_bw()
plotly::ggplotly()

# Glasgow by deprivation
df_gla %>%
  filter(age_group %in% c("adult", "child")) %>%
  group_by(year, simd, age_group) %>%
  summarise(casualties = n()) %>%
  left_join(gla_simd_populations) %>%
  mutate(rate = 1e5 * casualties / pop) %>%
  mutate(simd = as.factor(simd)) %>%
  ggplot(aes(x = year, y = rate, colour = simd)) +
  geom_line() + geom_point() +
  facet_wrap(~age_group) +
  theme_bw()
plotly::ggplotly()
	library(tidyverse)
	library(stats19)
	library(ckanr)
	# libs for mapping accident lat/long to datazone for Glasgow deprivation analysis
	library(sf)
	library(rmapshaper)
	# below libs useful for speeding up mapping accidents over water to nearest datazone
	library(foreach)
	library(doParallel)
	registerDoParallel(parallel::detectCores())

	### LOOKUPS & RECODES ####
	ckanr_setup(url = "https://www.opendata.nhs.scot/")
	res_ca_ids <- resource_show(id = "967937c4-8d67-4f39-974f-fd58c4acfda5")
	ca_ids <- ckan_fetch(x=res_ca_ids$url)
	res_dz_ids <- resource_show(id = "395476ab-0720-4740-be07-ff4467141352")
	dz_ids <- ckan_fetch(x=res_dz_ids$url)
	res_simd <- resource_show(id = "cadf715a-c365-4dcf-a6e0-acd7e3af21ec")
	simd <- ckan_fetch(x=res_simd$url)
	res_ca_pop <- resource_show(id = "09ebfefb-33f4-4f6a-8312-2d14e2b02ace")
	ca_pop <- ckan_fetch(x=res_ca_pop$url)
	res_dz_pop <- resource_show(id = "c505f490-c201-44bd-abd1-1bd7a64285ee")
	dz_pop <- ckan_fetch(x=res_dz_pop$url)

	area_recodes <- c("Glasgow City" = "Glasgow",
	"City of Edinburgh" = "Edinburgh",
	"Edinburgh, City of" = "Edinburgh",
	"Aberdeen City" = "Aberdeen",
	"Dundee City" = "Dundee",
	"Newcastle upon Tyne" = "Newcastle",
	"Bristol, City of" = "Bristol",
	"Great Britain" = "UK")

	scot_areas <- c("Scotland", "Glasgow", "Edinburgh", "Aberdeen", "Dundee")

	gcr_areas <- c("Scotland", "Glasgow", "Inverclyde", "South Lanarkshire", "North Lanarkshire", "East Dunbartonshire",
	"West Dunbartonshire", "East Renfrewshire", "Renfrewshire")

	uk_areas <- c("Edinburgh", "Leeds", "Glasgow", "Sheffield", "Newcastle", "Nottingham", "Liverpool", "Birmingham",
	"Manchester", "Bristol")
	all_areas <- unique(c(scot_areas, gcr_areas, uk_areas, "Scotland", "UK"))


	#### GET STATS19 DATA ####

	years <- 2014:2017
	# download all accident and casualty files to temp directory, to be read in and combined in next step
	# note: some years will download as a collection, e.g. downloading 1997 will download a 1979--2004 combined file
	# note: only 2014 onward has age of casualty, which we need
	# note: at time of writing (Sept 2020) 2017 is latest full year of validated data
	for(year in years){
	dl_stats19(year = year, type = "Casualties", ask = FALSE)
	dl_stats19(year = year, type = "Accidents", ask = FALSE)
	}

	# combine data from different years, format, and select incidents from our areas of interest only
	for(year in years){
	if(year == years[1]){df <- tibble()}

	df_to_append <- read_casualties(year) %>%
	left_join(read_accidents(year) %>% select(accident_index, local_authority_district, latitude, longitude)) %>%
	select(age_of_casualty, casualty_type, casualty_severity, local_authority_district, latitude, longitude) %>%
	mutate(year = year) %>%
	mutate(area = recode(local_authority_district, !!!area_recodes)) %>%
	filter(area %in% c(scot_areas, gcr_areas)) %>%
	mutate(age_group = case_when(age_of_casualty >= 16 ~ "adult",
	age_of_casualty >= 5 & age_of_casualty <= 15 ~ "child",
	TRUE ~ "<5")) %>%
	mutate(casualty_type = case_when(casualty_type == "Pedestrian" ~ "Pedestrian",
	casualty_type == "Cyclist" ~ "Cyclist",
	TRUE ~ "Other"))

	df <- df %>% bind_rows(df_to_append)
	if(year == years[length(years)]){
	write_rds(df, "stats19.rds")
	rm(df_to_append)}
	}

	df <- read_rds("stats19.rds")


	#### GLASGOW SIMD ANALYSIS ####
	# To look at deprivation for Glasgow we need to map each accident's lat/long onto a datazone.

	# datazone shapefile
	# https://spatialdata.gov.scot/geonetwork/srv/eng/catalog.search;jsessionid=7BFD03DFFDA5CD66CC065C75FBAD2172#/metadata/389787c0-697d-4824-9ca9-9ce8cb79d6f5

	scot_map_2011 <- st_read("SG_DataZoneBdry_2011") %>%
	st_transform(crs="+proj=longlat +datum=WGS84") %>%
	ms_simplify(., drop_null_geometries = TRUE, keep = 1) %>%
	select(DataZone, geometry) %>%
	rename(datazone_2011 = DataZone)

	# dataframe of just Glasgow's accidents
	df_gla <- df %>%
	filter(area == "Glasgow") %>%
	filter(is.na(latitude) == FALSE, is.na(longitude) == FALSE)

	# lat/long coords as spatial object points
	gla_accident_coords <- st_as_sf(df_gla %>% select(latitude, longitude), coords = c("longitude","latitude"), crs = "+proj=longlat +datum=WGS84")

	# construct vector of datazones which the points intersect. Will be one datazone per point (or none ... see next step)
	dz_intersects <- st_intersects(gla_accident_coords, scot_map_2011, sparse = TRUE, prepared = TRUE)
	dz_intersects <-dz_intersects %>% sapply(function(x) if (length(x) == 0) NA_integer_ else x ) # convert to vector with NAs if point not in map

	# add datazone IDs into Glasgow data
	df_gla<- df_gla %>%
	mutate(datazone_2011 = scot_map_2011[dz_intersects,]$datazone_2011)

	# deal with accidents that don't appear in the datazone geographies; these will occur over water or be outwith Glasgow
	# Will choose nearest datazone and re-insert these back in, then filter out non-Glasgow datazones
	df_gla_missing <- df_gla %>% filter(is.na(datazone_2011) == TRUE)
	df_gla <- df_gla %>% filter(is.na(datazone_2011) == FALSE) # remove these from main data

	gla_missing_coords <- st_as_sf(df_gla_missing %>% select(latitude, longitude), coords = c("longitude","latitude"), crs = "+proj=longlat +datum=WGS84")

	gla_missing_datazones <- foreach(i = 1:dim(gla_missing_coords)[1]) %dopar% {
	distances_to_dzs <- st_distance(gla_missing_coords[i,], scot_map_2011$geometry, by_element = TRUE)
	scot_map_2011[which(distances_to_dzs == min(distances_to_dzs)),]$datazone_2011
	}

	df_gla_missing$datazone_2011 <- gla_missing_datazones %>% unlist()

	# add those that had missing datazones back in
	df_gla <- df_gla %>%
	bind_rows(df_gla_missing)

	gla_dz_simd <- dz_ids %>%
	mutate(area = recode(CAName, !!!area_recodes)) %>%
	filter(area == "Glasgow") %>%
	select(DataZone) %>%
	left_join(simd) %>%
	select(DataZone, SIMD2016CountryQuintile) %>%
	rename(datazone_2011 = DataZone, simd = SIMD2016CountryQuintile)

	df_gla <- df_gla %>% inner_join(gla_dz_simd) # drops those that have non-Glasgow datazones

	write_rds(df_gla, "stats19_gla.rds")
	df_gla <- read_rds("stats19_gla.rds")


	#### PLOTS ####

	# Populations lookups for calculating rates per 100k pop

	ca_codes <- ca_ids %>%
	mutate(area = recode(CAName, !!!area_recodes)) %>%
	filter(area %in% c(scot_areas, gcr_areas)) %>%
	filter_at(vars(contains("Archived")), ~is.na(.)) %>%
	select(CA, area)

	ca_populations <- ca_pop %>%
	rename(year = Year) %>%
	filter(year %in% years) %>%
	filter(CA %in% ca_codes$CA,
	Sex == "All") %>%
	mutate(`age_<5` = select(., Age0:Age4) %>% rowSums(),
	age_child = select(., Age5:Age15) %>% rowSums(),
	age_adult = select(., Age16:Age90plus) %>% rowSums()) %>%
	left_join(ca_codes) %>%
	rename(age_all = AllAges) %>%
	select(year, area, contains("age_")) %>%
	pivot_longer(cols = contains("age_"), values_to = "pop", names_prefix = "age_", names_to = "age_group")

	gla_simd_populations <- dz_pop %>%
	rename(datazone_2011 = DataZone,
	year = Year) %>%
	filter(year %in% years,
	Sex == "All") %>%
	inner_join(gla_dz_simd) %>%
	mutate(`age_<5` = select(., Age0:Age4) %>% rowSums(),
	age_child = select(., Age5:Age15) %>% rowSums(),
	age_adult = select(., Age16:Age90plus) %>% rowSums()) %>%
	rename(age_all = AllAges) %>%
	select(year, contains("age_"), simd) %>%
	pivot_longer(cols = contains("age_"), values_to = "pop", names_prefix = "age_", names_to = "age_group") %>%
	group_by(year, simd, age_group) %>%
	summarise(pop = sum(pop)) %>%
	mutate(area = "Glasgow")

	# LA level adult / child casualties rate
	df <- read_rds("stats19.rds")

	df %>%
	group_by(area, year, age_group) %>%
	summarise(casualties = n()) %>%
	left_join(ca_populations) %>% View()
	mutate(rate = 1e5 * casualties / pop) %>%
	filter(age_group %in% c("adult", "child")) %>%
	ggplot(aes(x = year, y = rate, colour = area)) +
	geom_line() + geom_point() +
	facet_wrap(~age_group) +
	theme_bw()
	plotly::ggplotly()

	# Glasgow severity
	df_gla <- read_rds("stats19_gla.rds")

	df_gla %>%
	group_by(year) %>%
	summarise(casualties_all = n(), area = first(area)) %>%
	left_join(
	df_gla %>%
	filter(casualty_severity != "Slight") %>%
	group_by(year) %>%
	summarise(casualties_KSI = n()), area = first(area)) %>%
	pivot_longer(contains("casualties_"), values_to = "casualties", names_prefix = "casualties_", names_to = "type") %>%
	left_join(ca_populations %>% filter(age_group == "all")) %>%
	mutate(rate = 1e5 * casualties / pop) %>%
	ggplot(aes(x = year, y = rate, colour = type)) +
	geom_line() + geom_point() +
	theme_bw()
	plotly::ggplotly()

	# Glasgow by mode
	df_gla <- read_rds("stats19_gla.rds")

	df_gla %>%
	filter(age_group %in% c("adult", "child")) %>%
	group_by(year, casualty_type, age_group) %>%
	summarise(casualties = n(), area = first(area)) %>%
	left_join(ca_populations %>% filter(age_group %in% c("adult", "child"))) %>%
	mutate(rate = 1e5 * casualties / pop) %>%
	ggplot(aes(x = year, y = rate, colour = casualty_type)) +
	geom_line() + geom_point() +
	facet_wrap(~age_group) +
	theme_bw()
	plotly::ggplotly()

	# Glasgow by deprivation
	df_gla %>%
	filter(age_group %in% c("adult", "child")) %>%
	group_by(year, simd, age_group) %>%
	summarise(casualties = n()) %>%
	left_join(gla_simd_populations) %>%
	mutate(rate = 1e5 * casualties / pop) %>%
	mutate(simd = as.factor(simd)) %>%
	ggplot(aes(x = year, y = rate, colour = simd)) +
	geom_line() + geom_point() +
	facet_wrap(~age_group) +
	theme_bw()
	plotly::ggplotly()