Skip to content

Instantly share code, notes, and snippets.

@natlownes
Last active March 30, 2023 17:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save natlownes/34a67777c9756b95efe40ffa33139542 to your computer and use it in GitHub Desktop.
Save natlownes/34a67777c9756b95efe40ffa33139542 to your computer and use it in GitHub Desktop.
library(arrow)
library(daoviz)
library(dplyr)
library(ggplot2)
library(gt)
library(haven)
library(lubridate)
library(purrr)
library(snakecase)
library(tidyverse)
library(zeallot)
################################################################################
# notes on the data from the Inquirer stories
################################################################################
#
#
# from "Public Officals Keep Alive the Myth of 'Equal Justice'", Sunday Feb. 18,
# 1973:
#
# The investigation and computer study centered on the cases of 1034 persons
# who were indicted during 1971 for at least one of the four major crimes of
# violence -- murder, rape, aggravated robbery and aggravated assault and
# battery
#
# Using information culled from more than 10,000 court documents and 20,000
# pages of transcripts of court proceedings, Inquirer reporters traced from
# beginning to end -- from commission of the crime to trial and sentencing --
# the cases of the 1,034 defendants
#
#
# from "Court Story Build on 75,000 Facts", Sunday Feb. 18, 1973:
#
# The usual way that prosecutors and the courts maintain statistics --
# computing convictions and acquittals -- is by the individual defendant, but
# the Inquirer survey measured the outcome of cases from the viewpoint of the
# victim.
#
# Although there were 1,034 defendants in the study, the number of cases in
# the computer analysis totaled 1,374. The bulk of the difference represents
# those person who were indicted for two or more separate, major crimes
# against different victims.
#
# The cases of the 1,034 defendants represent a 39 percent sampling of all
# persons indicted in 1971 for one of the four violent crimes.
# load spss files
inquirer_data_path <- daocore::get_data_path(
'1973_inquirer_series/knight_s609_spss.por')
inky_df_src <- haven::read_spss(inquirer_data_path, user_na=FALSE,
.name_repair = 'universal')
column_names <- names(inky_df_src)
# var labels are a descriptive name for the column
var_labels <- labelled::var_label(inky_df_src)
# val labels are like an enum datatype in a db
val_labels <- labelled::val_labels(inky_df_src)
# there are some fields in the data that are missing val labels but have what
# can be parsed as val labels in the field name. examples:
# "SEX OF VICTIM: 1-MALE,2-FEMALE"
# "RACE:1-WHITE,2-BLACK,3-OTHER"
might_be_enum_field = ".+(\\:|\\?)+(\\s)*1"
# for columns that don't have val labels, try and parse out what should be val
# labels from the column name. These columns follow the format of the `might_be_enum_field` regex above
parse_val_labels_from_field_name <- function(s){
parts <- stringr::str_split(s, '\\:|\\?')
# toss out the field name, keep the enum-ish string
vals <- purrr::pluck(parts, 1, 2)
if(is.null(vals)){
return(vals)
}
items <- lapply(as.list(strsplit(vals, ',')[[1]]), stringr::str_trim)
reduce(items, function(acc, v){
sp <- as.list(strsplit(v, '-')[[1]])
acc[purrr::pluck(sp, 2)] <- as.integer(purrr::pluck(sp, 1))
acc
}, .init=c())
}
# the inky dataset columns are two digit year
inky_dataset_date_format <- function(y,m,d){
lubridate::make_date(y + 1900, m, d)
}
# makes a two digit year into a four digit year depending on the specified
# cutoff. for example the year born column is two digits, if the two digit year
# is gt cutoff 70, assume 1800
inky_dataset_year <- function(y, cutoff=70) {
dplyr::if_else(y > cutoff, y + 1800, y + 1900)
}
# this augments the set of labels that came with the dataset. some columns have
# val labels we can use, but some that should have them are missing them. for
# some of these columns we can use the value of the var_label for the column to
# come up with the missing val labels. (defendant race is an example in the
# dataset)
labels <- lapply(column_names, function(l){
key <- as.character(l)
val <- purrr::pluck(var_labels, key, .default=l)
label <- purrr::pluck(val_labels, key)
if(!is.null(label)){
return(label)
}
if(grepl(might_be_enum_field, val)){
return(parse_val_labels_from_field_name(val))
}
})
# make our labels list a named list using the column names (`V${integer}`)
names(labels) <- column_names
labelled::val_labels(inky_df_src) <- labels
# some column names reference other column names using the "V${integer} style;
# this function replaces any of those appearances with the readable column name
# example: "DISPOSITION V94, V104, INTENT TO MURDER"
# note that a returned referenced column may also have references of its own
dereference_column_name <- function(col_name){
refs <- unlist(stringr::str_match_all(col_name, 'V\\d+'))
if(purrr::is_empty(refs)){
return(col_name)
}
lapply(refs, function(ref){
val <- dereference_column_name(purrr::pluck(var_labels, ref, .default=''))
col_name <<- gsub(ref, val, col_name)
})
# all the referenced cols are disposition columns, remove only instances of
# that word that have whitespace before it-- these occur within the concat'd
# referenced column names
return(gsub(' DISPOSITION', '', col_name))
}
# the codebook pdf says that in addition to values 0 thru 9 as defined here,
# there's also life sentence "++" but it doesn't seem like that exists in the
# dataset
sentence_enum_to_day_range <- list(
# sentence suspended
list(ddays(0), ddays(0)),
# Fines and costs
list(ddays(0), ddays(0)),
list(ddays(1), dmonths(3)),
list(dmonths(4), dmonths(6)),
list(dmonths(7), dmonths(12)),
list(dmonths(13), dyears(2)),
list(dyears(2) + ddays(1), dyears(5)),
list(dyears(5) + ddays(1), dyears(10)),
list(dyears(10) + ddays(1), dyears(20)),
list(dyears(20) + ddays(1), dyears(Inf)),
# life
list(dyears(Inf), dyears(Inf))
)
sentence_min_bounds <- unlist(
map(sentence_enum_to_day_range, function(x){x[1]}))
sentence_max_bounds <- unlist(
map(sentence_enum_to_day_range, function(x){x[2]}))
# use a snake-cased version of the columns var labels so the fields are a little
# more readable
formatted_col_names <- lapply(column_names, function(e){
val <- purrr::pluck(var_labels, as.character(e), .default=e)
val <- dereference_column_name(val)
# special case to handle the unlabeled columns which we'll ignore
if(val==''){
return(e)
}
snakecase::to_snake_case(val)
})
names(formatted_col_names) <- column_names
snake_case_columns <- function(v){
purrr::map_chr(v, function(s) {
purrr::pluck(formatted_col_names, s)
})
}
# some column names are abbreviated, expand those abbreviations in this function
standard_column_names <- function(v){
gsub('aggr_a_b', 'aggr_assault_and_battery', v)
}
# wrap up all our column renamings into one function
inky_rename_columns <- function(df) {
df |>
dplyr::rename_with(snake_case_columns) |>
dplyr::rename(all_of(renamings)) |>
dplyr::rename_with(standard_column_names)
}
# consolidates the set of date fields that exist as three columns and follow a
# naming pattern into their own date columns and drops the three source columns
datepart_fields_mutations <- function(df) {
event_names <- c('arrest', 'prelim_hearing', 'indictment', 'arraignment',
'trial', 'sentence')
datepart_names <- c('year_of', 'month_of', 'day_of')
# new columns we're creating
new_field_names <- map(event_names, function(e){
sym(paste0(e, '_date'))
})
drop_columns <- unname(unlist(map(event_names, function(e){
lapply(datepart_names, function(d){
paste0(d, '_', e)
})
})))
operations <- map(event_names, function(e){
args <- map(datepart_names, function(d){
var_name <- paste0(d, "_", e)
quo(purrr::pluck(!!sym(var_name), 1))
})
quo(inky_dataset_date_format(!!!args))
})
args <- setNames(
operations,
new_field_names
)
df |>
dplyr::mutate(!!!args) |>
dplyr::select(-all_of(drop_columns))
}
# a list of manual column renamings
renamings <- c(
"race_defendant" = "race_1_white_2_black_3_other",
"prior_arrests" = "prior_arrests_1_yes_2_no",
"transcript_available" = "transcript_1_yes_2_no_3_copied",
"race_judge" = "race_of_judge_1_black_2_white",
"jury_trial" = "jury_trial_1_yes_2_no",
"had_transcript" = "was_there_a_transcript_1_yes_2_no",
"party_of_judge" = "party_of_judge_1_dem_2_rep",
"was_judge_a_former_da" = "was_judge_a_former_da_1_yes_2_no",
"race_of_victim" = "race_of_victim_1_white_2_black_3_other",
"data_compiled_by" = "data_compiled_by_1_barlett_2_steele",
"sex_of_victim" = "sex_of_victim_1_male_2_female",
"sex_of_defendant" = "sex_of_defendant_1_male_2_female"
)
# do a buncha transforms to consolidate columns
inky_df <- inky_df_src |>
inky_rename_columns() |>
# turns all the spss integers into their factor values
haven::as_factor() |>
# now that we've got some nicer column names and values to work with, add some
# nicer data types
dplyr::mutate(
year_born = inky_dataset_year(year_born)
) |>
dplyr::rowwise() |>
datepart_fields_mutations() |>
# incident date doesn't fit the pattern of the other fields as defined in
# datepart_fields_mutations() so do that one manually
dplyr::mutate(
incident_date = inky_dataset_date_format(
year_of_incident_last_2_digits[1],
month_of_incident_1_thru_12[1],
day_of_incident_1_thru_31[1])
) |>
# drop these columns that were condensed into a single date column
dplyr::select(-all_of(c(
'year_of_incident_last_2_digits',
'month_of_incident_1_thru_12',
'day_of_incident_1_thru_31'
))) |>
# drop these two columns which are unnamed and null
dplyr::select(-all_of(c(
'V67',
'V138'
)))
col_names <- names(inky_df)
disposition_cols <- names(inky_df[grepl('^disposition', col_names)])
probation_cols <- names(inky_df[grepl('^probation', col_names)])
min_sentence_cols <- names(inky_df[grepl('^minimum_sentence', col_names)])
max_sentence_cols <- names(inky_df[grepl('^maximum_sentence', col_names)])
multi_sentence_cols <- names(inky_df[grepl('^multiple_sentence', col_names)])
sentencing_columns <- c(
min_sentence_cols,
max_sentence_cols,
multi_sentence_cols)
# for debugging-- collect all the names of columns that have a value, summarize
# as one text field
summary_column <- function(pattern, columns) {
return(function(v){
s <- stringr::str_sort(columns[!is.na(v)])
paste0(gsub(pattern, '', s), collapse='||')
})
}
summary_columns <- c('charge_dispositions_summary', 'charge_probations_summary', 'charge_minimum_sentences_summary', 'charge_maximum_sentences_summary', 'charge_multi_sentences_summary')
inky_df <- inky_df |>
rowwise() |>
# for debugging-- put any values we'll be lengthening into one column so it's easier to digest what data is and is not present for each row
mutate(
charge_dispositions_summary = summary_column('^disposition_', disposition_cols)(c_across(all_of(disposition_cols))),
charge_probations_summary = summary_column('^probation_', probation_cols)(c_across(all_of(probation_cols))),
charge_minimum_sentences_summary = summary_column('^minimum_sentence_', min_sentence_cols)(c_across(all_of(min_sentence_cols))),
charge_maximum_sentences_summary = summary_column('^maximum_sentence_', max_sentence_cols)(c_across(all_of(max_sentence_cols))),
charge_multi_sentences_summary = summary_column('^multiple_sentence_', max_sentence_cols)(c_across(all_of(max_sentence_cols)))
)
# create a dispositions table with three columns: unique_case_id, charge,
# disposition_outcome
# example row:
# unique_case_id | charge | disposition_outcome
# ---------------|--------------------|--------------------
# 7466801 | murder | FND GLTY-LSSR OFFNS
dispositions <- inky_df[c('unique_case_id', disposition_cols)] |>
pivot_longer(
cols = all_of(disposition_cols),
names_to = c("charge"),
values_to = c("disposition_outcome"),
names_transform = list(
charge = function(v){
purrr::map_chr(v, function(c){
gsub('disposition_', '', c)
})
}
)
) |>
filter(!is.na(disposition_outcome)) |>
dplyr::distinct()
# create a probations table with three columns: unique_case_id, charge,
# probation
# example table:
# unique_case_id | charge | probation
# ---------------|--------------------|--------------------
# 7466801 | murder | YES-NO TIME
# 7466802 | murder | YES-WITH TIME
#
# YES-NO TIME: yes, with no time in jail
# YES-WITH TIME: yes, after time served
# a YES-WITH TIME value can indicate that jail time was sentenced for another
# charge in the same case. you can see these when joining on the sentences table
#
# a note about probations, from the codebook pdf included with the dataset:
#
# treat missing data cautiously! For example, a code of 99 (na) on the
# probation items may refer to 'not applicable', 'no probation', or 'missing
# information'. Not applicables can, with a bit of effort, be manipulated into
# a separate category. There is however, no viable means of separating the
# rest of the 99's into one category consisting of 'no probation' and another
# consisting of 'not ascertained' (which may include individuals who did or
# did no receive probation). In other words, do not assume that everyone in
# the residual category did not receive a probationary sentence
#
# with that caveat, this table is probably not useful for any aggregate analysis
probations <- inky_df[c('unique_case_id', probation_cols)] |>
pivot_longer(
cols = all_of(probation_cols),
names_to = c("charge"),
values_to = c("probation"),
names_transform = list(
charge = function(v){
purrr::map_chr(v, function(c){
gsub('probation_', '', c)
})
}
)
) |>
dplyr::filter(!is.na(probation)) |>
dplyr::distinct()
# CPI index value, Dec 2022
# https://fred.stlouisfed.org/release/tables?rid=10&eid=36163#
# https://research.stlouisfed.org/publications/page1-econ/2023/01/03/adjusting-for-inflation
# default values are current: Dec 2022, past: Dec 1971
inflation_adjustment <- function(v,
current_idx_value=294.883,
past_index_value=43.300) {
v * (current_idx_value / past_index_value)
}
bail_amounts <- inky_df[c('unique_case_id', 'bail_in_wob_999999_nominal_000001', 'charge_dispositions_summary')] |>
dplyr::rename(
'amount'='bail_in_wob_999999_nominal_000001'
) |>
# not sure what a null amount means, so remove
dplyr::filter(!is.na(amount)) |>
mutate (
# value is actually 99999, note 999999 as noted in original column label
held_without_bail = if_else(amount == 99999, TRUE, FALSE),
real_amount = if_else(amount == 99999,
Inf,
inflation_adjustment(amount))
) |>
dplyr::distinct()
multi_sentences <- inky_df_src |>
inky_rename_columns() |>
mutate(
across(
all_of(multi_sentence_cols),
haven::as_factor
)
)
# a table describing concurrent sentencing (if any) by charge and unique_case_id
# example table:
# unique_case_id | charge | concurrency
# ---------------+---------------------+---------------------
# 26792592 | assault_and_battery | YES-CONCURRENT TERMS
# 26792592 | aggravated_robbery | NO MULT SENT
# 36052400 | attempted_larceny | YES-BOTH
multi_sentences <- multi_sentences[c('unique_case_id', multi_sentence_cols)] |>
pivot_longer(
cols = all_of(multi_sentence_cols),
names_to = c("charge"),
values_to = c("concurrency"),
names_transform = list(
charge = function(v){
purrr::map_chr(v, function(c){
gsub('multiple_sentence_', '', c)
})
}
)
) |>
filter(!is.na(concurrency)) |>
dplyr::distinct()
# transforms for a sentences table. in the wide version of the table, create
# columns for the lower and upper bound (in seconds) of the sentencing and add a
# description column by converting the wide column values into factors
sentences_src <- inky_df_src |>
inky_rename_columns() |>
mutate(
across(
all_of(c(min_sentence_cols, max_sentence_cols)),
.fns = list(
lower_bound = function(v){
sentence_min_bounds[v]
},
upper_bound = function(v){
sentence_max_bounds[v]
},
description = haven::as_factor
),
.names = "{.col}||{.fn}"
)
) |>
# drop the columns that have been converted into bounds and descriptions
dplyr::select(-all_of(c(
min_sentence_cols,
max_sentence_cols
))) |>
# rename the columns to a format where values are separated by '||'
dplyr::rename_with(function(v){
purrr::map_chr(v, function(s) {
if(grepl('^minimum|^maximum', s)){
return(sub('sentence_', 'sentence||', s))
}
s
})
})
sentence_details_cols <- names(sentences_src[
grepl('minimum_sentence|maximum_sentence', names(sentences_src))])
# sentences spec for pivot_longer
# .name | .value | charge | sentence_type
# example row:
# minimum_sentence||murder||lower_bound | lower_bound | murder | minimum_sentence
sentences_spec_rows <- purrr::map(sentence_details_cols, function(e){
c(sentence_type, charge, value) %<-% as.list(strsplit(e, '\\|\\|')[[1]])
c(.name=e, .value=value, charge=charge, sentence_type=sentence_type)
})
sentences_spec <- as.data.frame(do.call(rbind, sentences_spec_rows))
# create our sentences table:
# unique_case_id, sentence type, charge, lower_bound, upper_bound, description
sentences <- sentences_src[c('unique_case_id', sentence_details_cols)] |>
tidyr::pivot_longer_spec(sentences_spec) |>
# in our lengthening, we've created a lot of what are null rows so drop any
# rows where description is na
dplyr::filter(!is.na(description)) |>
dplyr::distinct()
# create a cases_inky table that's a little easier to digest: minus all the columns
# that are now lengthened in different tables.
# note the the unique_case_id won't be unique in this table-- there can be
# multiple rows if there were multiple charges
cases_inky <- inky_df |>
dplyr::select(-all_of(c(
sentencing_columns,
disposition_cols,
probation_cols
)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment