# Functions to Support CPE and Census Data Alignment
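# Packages assumed by the functions below (not loaded in the original gist;
# this block is a convenience sketch and assumes they are installed). The
# spatial helpers expect sp-class objects from tigris, so the tigris return
# class is set to "sp" explicitly.
library(sp)          # CRS, spTransform, SpatialPoints, spChFIDs, over
library(sf)          # st_transform, st_cast, st_as_sf, st_intersection
library(rgdal)       # readOGR
library(rgeos)       # gUnaryUnion
library(raster)      # intersect, area
library(tigris)      # tracts, counties
library(tidycensus)  # get_acs
library(ggmap)       # geocode
library(dplyr)       # group_by, mutate, summarize, left_join
library(stringr)     # str_detect
library(lubridate)   # parse_date_time
options(tigris_class = "sp")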
################################################################################
# Functions to find the data
################################################################################
# Finders
# Simple functions that take the ID code from CPE (e.g. 49-00039) and look up
# the respective data for it in the directory structure provided in the competition
find_police_shape <- function(dept_id, kaggle_kernel = FALSE) {
if(kaggle_kernel == TRUE) {
prefix = "../input/cpe-data/"
} else {
prefix = "data/"
}
# This function will be modified when placed in a production environment to
# follow the data storage/path scheme used in production. Could be modified
# to even fetch data from a DB or from a web-service
fl <- list.files(paste0(prefix, "Dept_", dept_id, "/", dept_id, "_Shapefiles"),
full.names = TRUE)
path_to_sf <- grep("\\.shp$", fl, value = TRUE)[1]
return(path_to_sf)
}
find_police_csv <- function(dept_id, kaggle_kernel = FALSE) {
if(kaggle_kernel == TRUE) {
prefix = "../input/cpe-data/"
} else {
prefix = "data/"
}
fl <- list.files(paste0(prefix, "Dept_", dept_id),
full.names = TRUE)
# What if there is more than 1 csv?
# For now we just return first
path_to_csv <- grep("\\.csv$", fl, value = TRUE)[1]
return(path_to_csv)
}
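# Illustrative usage (the department ID below is the example from the comment
# above and assumes the local "data/Dept_<id>/" layout):
# find_police_shape("49-00039")  # path to the department's .shp file
# find_police_csv("49-00039")    # path to the department's incident CSV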
################################################################################
# Functions to read in the data
################################################################################
get_county <- function(city, state) {
result <- ggmap::geocode(paste0(city, ",", state), output = "latlona", source = "google")
# Round coordinates so the point-in-polygon county lookup is less sensitive
# to small geocoding differences
result[, 1] <- round(result[, 1], 2)
result[, 2] <- round(result[, 2], 2)
county <- latlong2county(result[, 1:2])
county <- unlist(lapply(strsplit(county, split = ","), "[", 2))
# Check for NA: mc indexes rows with a missing county; retry those at a
# coarser rounding before giving up
mc <- which(is.na(county))
if (length(mc) > 0) {
result[mc, 1] <- round(result[mc, 1], 1)
result[mc, 2] <- round(result[mc, 2], 1)
county2 <- latlong2county(result[mc, 1:2])
county2 <- unlist(lapply(strsplit(county2, split = ","), "[", 2))
county[mc] <- county2
}
return(county)
}
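# Note: get_county() above relies on latlong2county(), which is not defined in
# this gist. The sketch below is one common way to write it (the widely shared
# sp/maps point-in-polygon lookup); it assumes the maps and maptools packages
# are available and is illustrative rather than the original implementation.
latlong2county <- function(pointsDF) {
# Build one polygon per county from the maps package database
counties <- maps::map("county", fill = TRUE, col = "transparent", plot = FALSE)
ids <- sapply(strsplit(counties$names, ":"), function(x) x[1])
counties_sp <- maptools::map2SpatialPolygons(
counties, IDs = ids, proj4string = CRS("+proj=longlat +datum=WGS84"))
# Convert the lon/lat points and find the county polygon containing each one
points_sp <- SpatialPoints(pointsDF,
proj4string = CRS("+proj=longlat +datum=WGS84"))
idx <- over(points_sp, counties_sp)
# Return "state,county" strings, which get_county() splits on the comma
county_names <- sapply(counties_sp@polygons, function(x) x@ID)
county_names[idx]
}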
# Retrievers
# Need Google for geocoding the county
get_police_zones <- function(path_to_sf, city, state, county) {
# First we read in the shapefile
police_districts <- rgdal::readOGR(dsn = path_to_sf,
stringsAsFactors = FALSE)
# Next, we standardize all police shapefiles as they come in to use a
# projection compatible with the ACS. We do this by downloading the TIGRIS
# file for each shapefile and getting its projection string
admin_shapes <- tigris::tracts(state = state, county = county, cb = TRUE)
# Extract the projection string from the Census geography and build a CRS
proj_use <- CRS(proj4string(admin_shapes))
# Currently Austin TX is missing any projection information in the shapefile.
# This function includes a workaround to project Austin to the coordinate
# reference system used by the police x,y coordinate system in the Austin
# data. This section could include more robust logic after analyzing additional
# shapefiles with non-standard projections.
if (is.na(proj4string(police_districts))) {
proj4string(police_districts) <- CRS("+init=epsg:3664")
}
# Reproject the shapefile using the spTransform function
police_districts <- sp::spTransform(police_districts, proj_use)
# One funny thing is that police boundary files can extend far out into the
# water. This makes resulting maps look unfamiliar to the public. We can use
# the Census TIGRIS files to fetch the water boundaries and then slice off
# any police boundaries that extend into the water
outline <- tigris::counties(state = state,
cb = TRUE,
resolution = "500k")
# Dissolve the county polygons into a single outline
outline <- rgeos::gUnaryUnion(outline)
# Clip the police boundaries to the outline and return only the overlap
police_districts <- raster::intersect(police_districts, outline)
return(police_districts)
}
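# Illustrative usage (a sketch; the department ID and the Austin/Travis values
# are placeholders, not confirmed pairings from the competition data):
# shp <- find_police_shape("49-00039")
# pz  <- get_police_zones(shp, city = "Austin", state = "TX", county = "Travis")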
# Get the CSV police data
get_police_data <- function(path_to_pd_data) {
# Most of the CPE data files seem to have two headers
# This just removes the second header and keeps the first
# Which appears to be more standardized
all_content = readLines(path_to_pd_data, encoding = "UTF-8")
skip_second = all_content[-2]
# If the datafiles got huge it could be worth considering an alternative
# function to read the data in, but for this size read.csv works fine
dat <- read.csv(file = textConnection(skip_second, encoding = "UTF-8"),
stringsAsFactors = FALSE,
header = TRUE, fileEncoding = "UTF-8-BOM")
# CSV files can be encoded oddly; some of the files here start with a UTF-8
# byte order mark (BOM), which survives as a mangled prefix ("ï..") on the
# first column name. If it appears, strip it out.
if (any(grepl("ï..", names(dat), fixed = TRUE))) {
names(dat) <- gsub("ï..", replacement = "", names(dat), fixed = TRUE)
}
}
# Handling for bad column name
if ("SUBJECT_RACT" %in% names(dat)) {
dat$SUBJECT_RACE <- dat$SUBJECT_RACT
dat$SUBJECT_RACT <- NULL
}
return(dat)
}
# Get Census data from the API
get_census_data <- function(state, county, geography = "tract",
variables = NULL) {
# We can start with a default set of Census variables, but the user
# can pass a named character vector of variables and their Census ID number
if (is.null(variables)) {
message("Using default variables")
variables <- c("total_people" = "DP05_0028",
"total_households" = "DP02_0001",
"total_family_households" = "DP02_0002",
"total_white" = "DP05_0032",
"total_black" = "DP05_0033",
"total_hispanic" = "DP05_0066",
"age_over_18" = "DP05_0018")
}
census_geo <- tidycensus::get_acs(geography = geography, variables = variables,
state = state, county = county, geometry = TRUE)
return(census_geo)
}
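# Illustrative usage (a sketch; requires a Census API key registered once with
# tidycensus::census_api_key(); the state/county values are placeholders):
# census_sf <- get_census_data(state = "TX", county = "Travis")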
################################################################################
# Clean and normalize geometry data
################################################################################
# Make shape file polygon dataframe have unique rownames
makeUniform <- function(SPDF){
pref <- substitute(SPDF) # prefix the new feature IDs with the object's name
newSPDF <- spChFIDs(SPDF,
as.character(paste(pref,
rownames(as(SPDF,"data.frame")),
sep = "_")))
return(newSPDF)
}
# Census data bound to police polygon data
census_to_pd_bound <- function(pd_poly, census_poly, pd_poly_id = NULL,
type = c("sums", "means")) {
type <- match.arg(type)
# Use the first column of the polygon attribute data as the ID by default
if (is.null(pd_poly_id)) {
pd_poly_id <- names(pd_poly@data)[1]
# pd_poly_id <- "GEOID"
}
# We need to convert to a Spatial* object to take advantage of the area
# function, and in doing so we need to preserve the coordinate systems
census_poly <- st_transform(census_poly, proj4string(pd_poly))
try({
cd_poly <- sf::as_Spatial(st_cast(census_poly, "MULTIPOLYGON"))
})
if (!exists("cd_poly", inherits = FALSE)) {
cd_poly <- sf::as_Spatial(st_cast(census_poly, "POLYGON"))
}
# Unify coordinate systems
if (inherits(pd_poly, "SpatialPointsDataFrame")) {
pd_poly <- spTransform(pd_poly, CRS(st_crs(census_poly)$proj4string))
} else {
pd_poly <- makeUniform(pd_poly)
pd_poly <- spTransform(pd_poly, CRS(st_crs(census_poly)$proj4string))
}
# Compute the area of each tract and append it, this is needed for computing
# the proportion lying in the police boundary
cd_poly@data$tract_area <- area(cd_poly)
# Find the census vars we are going to use
vars <- unique(cd_poly$variable)
# Set up an object to accumulate one row per police polygon; reshape() below
# determines the final column names, so start empty and rbind as we go
out <- NULL
# We need to loop through each polygon in the police data
# Find every polygon it intersects with in the Census data
# then we calculate the proportion of the Census polygon that lies within the PD
# polygon
# We create a weight for each Census polygon based on the proportion of its
# area in the police polygon
# Then, depending on our measure type, we take a weighted sum (counts) or a
# weighted mean (percentages, rates, average income, etc.)
for(i in unique(pd_poly@data[, pd_poly_id])){
# Slice out one polygon at a time
tmp_poly <- pd_poly[pd_poly@data[,pd_poly_id ]== i,]
# Workaround for polygon ID values stored inconsistently (e.g. as factors)
if(nrow(tmp_poly@data) == 0) {
tmp_poly <- pd_poly[pd_poly@data[,pd_poly_id ]== as.character(i),]
}
# Compute the intersection of the census and the selected PD polygon
zzz <- raster::intersect(cd_poly, tmp_poly)
# calculate the intersection area
zzz@data$intersect_area <- area(zzz)
# Now take the proportion of the intersecting area out of the total area
# for all the tracts that cross into the PD district
zzz@data$weight <- zzz@data$intersect_area/zzz@data$tract_area
if(type == "sums") {
out_df <- zzz@data %>% group_by(variable) %>%
mutate(weight_est = estimate * weight) %>%
# We ignore NAs here because if the Census polygon has no data, we
# can safely count it as 0
summarize(estimate = sum(weight_est, na.rm = TRUE)) %>%
mutate("id" = i) %>%
as.data.frame()
} else if(type == "means") {
out_df <- zzz@data %>% group_by(variable) %>%
summarize(estimate = weighted.mean(estimate, weight, na.rm = TRUE)) %>%
mutate("id" = i) %>%
as.data.frame()
}
out_df <- reshape(out_df, direction = "wide", timevar = "variable")
out <- rbind(out, out_df)
}
# key_by <- c(pd_poly_id = "id")
pd_poly@data <- merge(pd_poly@data, out, by.x = pd_poly_id,
by.y = "id")
return(pd_poly)
}
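# The loop above is simple areal interpolation: it assumes each ACS estimate is
# spread evenly across its tract, so a tract with 40% of its area inside a
# police district contributes 40% of its count (or 40% of the weight for a
# mean). Illustrative usage (a sketch; pz and census_sf are placeholders for
# objects returned by get_police_zones() and get_census_data()):
# zones_counts <- census_to_pd_bound(pd_poly = pz, census_poly = census_sf,
#                                    type = "sums")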
# Example census data variable definitions
census_count_variables <-
c("total_people" = "DP05_0028",
"total_households" = "DP02_0001",
"total_family_households" = "DP02_0002",
"total_white" = "DP05_0032",
"total_black" = "DP05_0033",
"total_hispanic" = "DP05_0066",
"age_over_18" = "DP05_0018")
census_rate_variables <-
c("percent_white" = "DP05_0032P",
"occupied_housing" = "DP04_0002P",
"health_insurance_cov" = "DP03_0096P",
"median_family_income" = "DP03_0086")
# Take the list of Census Data and police data and combine it into a new police
# boundary object with the census data
augment_census_data <- function(city_list) {
census_sf <- get_census_data(state = city_list$state, county = city_list$county,
geography = "tract", variables = census_count_variables)
zzz <- census_to_pd_bound(pd_poly = city_list$pz,
census_poly = census_sf, type = "sums")
census_sf <- get_census_data(state = city_list$state, county = city_list$county,
geography = "tract", variables = census_rate_variables)
zzb <- census_to_pd_bound(pd_poly = city_list$pz,
census_poly = census_sf, type = "means")
zzz@data <- left_join(zzz@data, zzb@data)
return(zzz)
}
# Unify police boundaries and combine police boundaries with police data
augment_pd_census <- function(city_list) {
if ("lat" %in% names(city_list$pd)) {
# drop NA
df <- city_list$pd[!is.na(city_list$pd$lat), ]
city_points <- st_as_sf(x = df,
coords = c("lon", "lat"),
crs = proj4string(city_list$pz))
out <- st_intersection(city_points, st_as_sf(city_list$pz))
} else {
experiment <- st_as_sf(city_list$pz)
experiment$LOCATION_DISTRICT <- as.character(experiment$LOCATION_DISTRICT)
city_list$pd$LOCATION_DISTRICT <- as.character(city_list$pd$LOCATION_DISTRICT)
out <- left_join(city_list$pd, experiment, by = c("LOCATION_DISTRICT"))
}
return(out)
}
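# Illustrative structure of the city_list argument used by the two functions
# above and by align_location_districts() below (element names are taken from
# how the functions index the list; the values shown are placeholders):
# city_list <- list(
#   state  = "TX",                    # state abbreviation for tigris/tidycensus
#   county = "Travis",                # county containing the department
#   pz     = get_police_zones(...),   # police boundary SpatialPolygonsDataFrame
#   pd     = get_police_data(...)     # incident data frame from the CPE CSV
# )
# zones_acs <- augment_census_data(city_list)
# incidents <- augment_pd_census(city_list)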
################################################################################
# Cleaning data
################################################################################
# Functions that take a dataframe from CPE and return the dataframe with columns
# cleaned
detect_cpe_data_type <- function(df) {
# Rough and ready rules for using column logic to detect the type of
# incident data in the CPE files
# Five possibilities for now:
# UOF
# UOF + INJURY
# OIS
# SEARCH+VEHICLE
# ARREST
if (any(grepl("TYPE_OF_FORCE_USED", names(df)))) {
if (any(grepl("INJURY", names(df)))) {
return("UOF+INJURY")
} else {
return("UOF")
}
} else if (any(c("SUBJECT_INJURY_TYPE",
"SUBJECT_DEATH") %in% names(df))) {
return("OIS")
}
if (any(grepl("SEARCH", names(df)))) {
return("SEARCH+VEHICLE")
} else {
return("ARREST")
}
}
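# Quick illustration with made-up column names:
# detect_cpe_data_type(data.frame(TYPE_OF_FORCE_USED = NA, SUBJECT_INJURY = NA))
# # returns "UOF+INJURY"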
# Normalize and standardize incident IDs
recode_incident_ids <- function(df) {
if("RIN" %in% names(df)) {
df$INCIDENT_UNIQUE_IDENTIFIER <- df$RIN
df$RIN <- NULL
}
# Double ID from Boston
if("INCIDENT_UNIQUE_IDENTIFIER.1" %in% names(df)) {
df$INCIDENT_UNIQUE_IDENTIFIER <- paste(df$INCIDENT_UNIQUE_IDENTIFIER,
df$INCIDENT_UNIQUE_IDENTIFIER.1, sep = "-")
df$INCIDENT_UNIQUE_IDENTIFIER.1 <- NULL
}
# Seattle
if("INCIDENT_UNIQUE.IDENTIFIER.1" %in% names(df)) {
df$INCIDENT_UNIQUE_IDENTIFIER <- paste(df$INCIDENT_UNIQUE_IDENTIFIER,
df$INCIDENT_UNIQUE.IDENTIFIER.1, sep = "-")
df$INCIDENT_UNIQUE.IDENTIFIER.1 <- NULL
}
# Deal with UOF number for dallas
if("UOF_NUMBER" %in% names(df)) {
df$INCIDENT_UNIQUE_IDENTIFIER <- df$UOF_NUMBER
df$UOF_NUMBER <- NULL
}
# Deal with faulty encoding
if(!"INCIDENT_UNIQUE_IDENTIFIER" %in% names(df) &
any(grepl("INCIDENT_UNIQUE_IDENTIFIER", names(df)))) {
var <- grep("INCIDENT_UNIQUE_IDENTIFIER", names(df), value = TRUE)
df$INCIDENT_UNIQUE_IDENTIFIER <- df[, var]
df[, var] <- NULL
}
if(!"INCIDENT_UNIQUE_IDENTIFIER" %in% names(df)) {
df$INCIDENT_UNIQUE_IDENTIFIER <- 1:nrow(df)
}
return(df)
}
# Calculate the proportion of a vector that is missing
# Use to identify the quality of latitude and longitude coordinates
prop_na <- function(x) {
D <- length(x)
N <- length(x[is.na(x)])
return(round((N/D), digits = 2))
}
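# Quick check (values are made up): prop_na(c(3, NA, 5, NA)) returns 0.5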
# Fix Austin data to have proper coordinates
# Work with other variables to standardize on lat lon variable names
normalize_location <- function(df) {
if (any(grepl("latitude", names(df), ignore.case = TRUE))) {
# Calculate proportion missing lat/lon
# We can use this in the current set of data to identify alternative coordinate
# storage.
missing_lat <- prop_na(df[, grep("latitude", names(df), ignore.case = TRUE, value = TRUE)])
missing_long <- prop_na(df[, grep("longitude", names(df), ignore.case = TRUE, value = TRUE)])
if (any(c(missing_lat, missing_long) > 0.1) &
any(grepl("Y_COORDINATE", names(df), ignore.case = TRUE))) {
utm_coords <- df[, c("Y_COORDINATE", "Y_COORDINATE.1")]
# Convert to numeric
utm_coords <- sapply(utm_coords, as.numeric)
# Keep rows with both coordinates present and trim an obvious outlier; store
# the row filter so df and the converted points stay aligned
keep <- !is.na(utm_coords[, 1]) & !is.na(utm_coords[, 2]) &
utm_coords[, 1] < 38800000
utm_coords <- utm_coords[keep, , drop = FALSE]
# lookup table for reference system: EPSG <- make_EPSG()
# Close to EPSG:3664 (identical to EPSG:2277)
# Look up the reference system
crs <- CRS("+init=epsg:3664")
# Convert to a spatial points object in the projected coordinate system
sputm <- SpatialPoints(coords = utm_coords[, c(1, 2)],
proj4string = crs)
# Reproject to longitude/latitude (WGS84)
spgeo <- spTransform(sputm, CRS("+proj=longlat +datum=WGS84"))
# Rejoin
# For now we do not reconcile every data point; we only keep those we could
# convert, which is the vast majority
df <- df[keep, ]
df[, colnames(spgeo@coords)] <- spgeo@coords
names(df)[which(names(df) %in% colnames(spgeo@coords))] <- c("lon", "lat")
df$Y_COORDINATE <- NULL
df$Y_COORDINATE.1 <- NULL
df$LOCATION_LATITUDE <- NULL
df$LOCATION_LONGITUDE <- NULL
} else {
names(df)[grep("latitude", names(df), ignore.case = TRUE)] <- "lat"
names(df)[grep("longitude", names(df), ignore.case = TRUE)] <- "lon"
}
}
return(df)
}
# Search for closely matching factor levels between two vectors of possible IDs
# This function could definitely be improved!
fuzz_finder <- function(levels, target) {
target <- as.character(target)
levels <- as.character(levels)
levels <- levels[!is.na(levels)]
score <- 0
# Count approximate matches of each candidate level against the target IDs
for (i in seq_along(levels)) {
adder <- sum(agrepl(pattern = levels[i], x = target,
max.distance = list("deletions" = 0.25, "insertions" = 0.5,
"cost" = 0.5)))
adder <- ifelse(is.na(adder), 0, adder)
score <- score + adder
}
return(score)
}
# Normalize district IDs between pz and pd by renaming the ID variable in the
# Polygon the same as in the dataset LOCATION_DISTRICT
align_location_districts <- function(city_list) {
if(!"LOCATION_DISTRICT" %in% names(city_list$pd)) {
return(city_list$pz)
} else {
pd_ids <- unique(city_list$pd$LOCATION_DISTRICT)
best_score <- 0
best_col <- 1  # fall back to the first column if nothing scores higher
for (i in 1:ncol(city_list$pz@data)) {
score <- fuzz_finder(pd_ids, city_list$pz@data[,i])
# score <- sum(pd_ids %in% city_list$pz@data[, i])
best_col <- ifelse(score > best_score, i, best_col)
best_score <- ifelse(score > best_score, score, best_score)
}
city_list$pz@data$LOCATION_DISTRICT <- city_list$pz@data[, best_col]
return(city_list$pz)
}
}
# Standardize USE OF FORCE LEVELS in CPE data
force_6_lev <- function(x) {
# Function adapted from
# https://www.kaggle.com/fkosmowski/part-1-standardizing-use-of-force-datasets
for(i in 1:ncol(x)) {
var <- as.character(x[, i])
var[str_detect(var, paste(c('Level 1', 'level1','LEVEL 1', "LEVEL1", "Display", 'display'),
collapse = '|'))] <- "Officer presence (L1)"
var[str_detect(var, paste(c('Level 2', 'level2','LEVEL 2', "LEVEL2", "Verbal", "VERBAL"),
collapse = '|'))] <- "Verbal commands (L2)"
var[str_detect(var, paste(c('Level 3', 'level3','LEVEL 3', "LEVEL3",
"Pressure", "PRESSURE", 'Weight','WEiGHT', 'LVNR',
'Handcuffing', 'Hand', 'HAND','Down', 'DOWN',
'JOiNT', 'Joint Locks', 'ForceGeneral',
'BodilyForceType', 'BD', 'Combat',
'COMBAT', 'PiT', 'Leg', 'LEG'), collapse = '|'))] <- "Control Tactics (L3)"
var[str_detect(var, paste(c('Level 4', 'level4','LEVEL 4',
"LEVEL4", "Strike", 'Kick', "Physical",
'Chemirritant', "OC", "SPRAY", 'CHEMiCAL',
'pepper', 'Pepper', 'PEPPER',
'Projectile', 'ChemIrritant'), collapse = '|'))] <- "Hard control tactics (L4)"
var[str_detect(var, paste(c('Level 5', 'level5','LEVEL 5', "LEVEL5",
"Less Lethal", 'LESS LETHAL', 'Baton',
'BATON', "BatonForce", 'BATONFORCE',
"Taser", "TASER", 'CED',
'Conducted Energy Device', "Canine",
'CANiNE', 'K9', 'K-9', 'Weapon', 'WEAPON'), collapse = '|'))] <- "Weapons (L5)"
var[str_detect(var, paste(c('Level 6', 'level6','LEVEL 6', "LEVEL6",
"Vehicle", 'FirearmType', 'Shotgun',
'SHOTGUN','FiREARM', 'Firearm',
'SHOTS', 'Shots'), collapse = '|'))] <- "Lethal force (L6)"
var[str_detect(var, paste(c('-', 'Other','OTHER'), collapse = '|'))] <- "Other / missing"
x[, i] <- as.factor(var)
x[, i] <- droplevels(x[, i])
}
if(ncol(x) == 1) {
return(as.factor(x[, 1]))
} else {
# Topcode use of force by most force used
x$allvar <- apply(x, 1, paste, collapse = "-")
out <- data.frame("TYPE_OF_FORCE" = sapply(x$allvar, topcode_force_level),
stringsAsFactors = FALSE)
out <- as.factor(out[,1])
return(out)
}
}
# For cases where we have multiple uses of force for one incident, let's take
# the most severe
topcode_force_level <- function(x) {
if (grepl("L6", x)) {
return("Lethal force (L6)")
} else if (grepl("L5", x)) {
return("Weapons (L5)")
} else if (grepl("L4", x)) {
return("Hard control tactics (L4)")
} else if (grepl("L3", x)) {
return("Control Tactics (L3)")
} else if (grepl("L2", x)) {
return("Verbal commands (L2)")
} else if (grepl("L1", x)) {
return("Officer presence (L1)")
} else {
return("None")
}
}
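# Illustration: an incident recorded with multiple force types is topcoded to
# the most severe level found in the combined string, e.g.
# topcode_force_level("Verbal commands (L2)-Weapons (L5)")  # "Weapons (L5)"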
# Recode dataframe and return correct object types
force_recode <- function(df) {
force_vars <- grep("FORCE_USED", names(df), ignore.case = TRUE, value = TRUE)
if (is.null(force_vars) | length(force_vars) == 0) {
return(df)
} else {
df$"TYPE_OF_FORCE" <- force_6_lev(df[, force_vars, drop = FALSE])
return(df)
}
}
#Recode any variable with _RACE
recode_race <- function(df) {
race_vars <- grep("_RACE", names(df), value = TRUE)
if(length(race_vars) > 0) {
for(i in race_vars) {
df[, i] <- as.character(df[, i])
df[, i][str_detect(df[, i], "Asian")] <- "Asian"
df[, i][str_detect (df[, i], "Black")] <- "Black"
df[, i][str_detect(df[, i], paste(c("Hispanic", "Latino"), collapse = '|'))] <- "Hispanic"
df[, i][str_detect(df[, i], "White")] <- "White"
df[, i][str_detect(df[, i], paste(c("Other","Native", "Ind", "Pac"), collapse = '|'))] <- "Other"
df[, i][str_detect(df[, i], paste(c("Unk", "UNK",
"DATA", "Data", "NULL", "Not", "specified",
"recorded"), collapse = '|'))] <- "Unknown"
# Letter-based, for remaining categories
df[, i][df[, i]=='A'] <- "Asian"
df[, i][df[, i]=='B'] <- "Black"
df[, i][df[, i]=='H'] <- "Hispanic"
df[, i][df[, i]=='W'] <- "White"
'%ni%' <- Negate('%in%')
# Catch-all for anything not already mapped; keep "Unknown" as its own level
df[, i][df[, i] %ni%
c('Asian', 'Black', 'Hispanic', 'White', 'Unknown') &
!is.na(df[, i])] <- "Other"
}
return(df)
} else {
return(df)
}
}
# Recode times
code_incident_time <- function(df) {
if(!"INCIDENT_TIME" %in% names(df)) {
return(df)
} else {
# TODO: Automatically find timezone
# Handling for cases where the date is recorded in the time field - Boston
if (any(grepl("/", df$INCIDENT_TIME))) {
tmp_time <- df$INCIDENT_TIME
} else {
tmp_time <- paste(df$INCIDENT_DATE, df$INCIDENT_TIME)
}
df$INCIDENT_TIME <-
parse_date_time(tmp_time,
orders = c("Ymd HMS", "Ymd H!M!", "Ymd HM", "Ymd HMS p",
"mdy HM"))
return(df)
}
}
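# Illustration (values made up): with INCIDENT_DATE "2016-05-01" and
# INCIDENT_TIME "13:45", the combined string is parsed by the "Ymd HM" order
# into the POSIXct value "2016-05-01 13:45:00".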