natlownes/1973-crime-and-injustice-dataset.R

## 1973-crime-and-injustice-dataset.R
library(arrow)
library(daoviz)
library(dplyr)
library(ggplot2)
library(gt)
library(haven)
library(lubridate)
library(purrr)
library(snakecase)
library(tidyverse)
library(zeallot)

################################################################################
# notes on the data from the Inquirer stories
################################################################################
#
#
# from "Public Officals Keep Alive the Myth of 'Equal Justice'", Sunday Feb. 18,
# 1973:
#
#   The investigation and computer study centered on the cases of 1034 persons
#   who were indicted during 1971 for at least one of the four major crimes of
#   violence -- murder, rape, aggravated robbery and aggravated assault and
#   battery
#
#   Using information culled from more than 10,000 court documents and 20,000
#   pages of transcripts of court proceedings, Inquirer reporters traced from
#   beginning to end -- from commission of the crime to trial and sentencing --
#   the cases of the 1,034 defendants
#
#
# from "Court Story Build on 75,000 Facts", Sunday Feb. 18, 1973:
#
#   The usual way that prosecutors and the courts maintain statistics --
#   computing convictions and acquittals -- is by the individual defendant, but
#   the Inquirer survey measured the outcome of cases from the viewpoint of the
#   victim.
#
#   Although there were 1,034 defendants in the study, the number of cases in
#   the computer analysis totaled 1,374. The bulk of the difference represents
#   those person who were indicted for two or more separate, major crimes
#   against different victims.
#
#   The cases of the 1,034 defendants represent a 39 percent sampling of all
#   persons indicted in 1971 for one of the four violent crimes.


# load spss files
inquirer_data_path <- daocore::get_data_path(
  '1973_inquirer_series/knight_s609_spss.por')

inky_df_src <- haven::read_spss(inquirer_data_path, user_na=FALSE,
                                .name_repair = 'universal')

column_names <- names(inky_df_src)
# var labels are a descriptive name for the column
var_labels <- labelled::var_label(inky_df_src)
# val labels are like an enum datatype in a db
val_labels <- labelled::val_labels(inky_df_src)

# there are some fields in the data that are missing val labels but have what
# can be parsed as val labels in the field name. examples:
# "SEX OF VICTIM: 1-MALE,2-FEMALE"
# "RACE:1-WHITE,2-BLACK,3-OTHER"
might_be_enum_field = ".+(\\:|\\?)+(\\s)*1"

# for columns that don't have val labels, try and parse out what should be val
# labels from the column name. These columns follow the format of the `might_be_enum_field` regex above
parse_val_labels_from_field_name <- function(s){
  parts <- stringr::str_split(s, '\\:|\\?')
  # toss out the field name, keep the enum-ish string
  vals <- purrr::pluck(parts, 1, 2)
  if(is.null(vals)){
    return(vals)
  }
  items <- lapply(as.list(strsplit(vals, ',')[[1]]), stringr::str_trim)
  reduce(items, function(acc, v){
    sp <- as.list(strsplit(v, '-')[[1]])
    acc[purrr::pluck(sp, 2)] <- as.integer(purrr::pluck(sp, 1))
    acc
  }, .init=c())
}

# the inky dataset columns are two digit year
inky_dataset_date_format <- function(y,m,d){
  lubridate::make_date(y + 1900, m, d)
}

# makes a two digit year into a four digit year depending on the specified
# cutoff. for example the year born column is two digits, if the two digit year
# is gt cutoff 70, assume 1800
inky_dataset_year <- function(y, cutoff=70) {
  dplyr::if_else(y > cutoff, y + 1800, y + 1900)
}

# this augments the set of labels that came with the dataset. some columns have
# val labels we can use, but some that should have them are missing them. for
# some of these columns we can use the value of the var_label for the column to
# come up with the missing val labels. (defendant race is an example in the
# dataset)
labels <- lapply(column_names, function(l){
  key <- as.character(l)
  val <- purrr::pluck(var_labels, key, .default=l)
  label <- purrr::pluck(val_labels, key)
  if(!is.null(label)){
    return(label)
  }
  if(grepl(might_be_enum_field, val)){
    return(parse_val_labels_from_field_name(val))
  }
})

# make our labels list a named list using the column names (`V${integer}`)
names(labels) <- column_names
labelled::val_labels(inky_df_src) <- labels

# some column names reference other column names using the "V${integer} style;
# this function replaces any of those appearances with the readable column name
# example: "DISPOSITION V94, V104, INTENT TO MURDER"
# note that a returned referenced column may also have references of its own
dereference_column_name <- function(col_name){
  refs <- unlist(stringr::str_match_all(col_name, 'V\\d+'))
  if(purrr::is_empty(refs)){
    return(col_name)
  }
  lapply(refs, function(ref){
    val <- dereference_column_name(purrr::pluck(var_labels, ref, .default=''))
    col_name <<- gsub(ref, val, col_name)
  })
  # all the referenced cols are disposition columns, remove only instances of
  # that word that have whitespace before it-- these occur within the concat'd
  # referenced column names
  return(gsub(' DISPOSITION', '', col_name))
}


# the codebook pdf says that in addition to values 0 thru 9 as defined here,
# there's also life sentence "++" but it doesn't seem like that exists in the
# dataset
sentence_enum_to_day_range <- list(
  # sentence suspended
  list(ddays(0), ddays(0)),
  # Fines and costs
  list(ddays(0), ddays(0)),
  list(ddays(1), dmonths(3)),
  list(dmonths(4), dmonths(6)),
  list(dmonths(7), dmonths(12)),
  list(dmonths(13), dyears(2)),
  list(dyears(2) + ddays(1), dyears(5)),
  list(dyears(5) + ddays(1), dyears(10)),
  list(dyears(10) + ddays(1), dyears(20)),
  list(dyears(20) + ddays(1), dyears(Inf)),
  # life
  list(dyears(Inf), dyears(Inf))
)

sentence_min_bounds <- unlist(
  map(sentence_enum_to_day_range, function(x){x[1]}))
sentence_max_bounds <- unlist(
  map(sentence_enum_to_day_range, function(x){x[2]}))

# use a snake-cased version of the columns var labels so the fields are a little
# more readable
formatted_col_names <- lapply(column_names, function(e){
  val <- purrr::pluck(var_labels, as.character(e), .default=e)
  val <- dereference_column_name(val)
  # special case to handle the unlabeled columns which we'll ignore
  if(val==''){
    return(e)
  }
  snakecase::to_snake_case(val)
})

names(formatted_col_names) <- column_names

snake_case_columns <- function(v){
  purrr::map_chr(v, function(s) {
    purrr::pluck(formatted_col_names, s)
  })
}

# some column names are abbreviated, expand those abbreviations in this function
standard_column_names <- function(v){
  gsub('aggr_a_b', 'aggr_assault_and_battery', v)
}

# wrap up all our column renamings into one function
inky_rename_columns <- function(df) {
  df |>
    dplyr::rename_with(snake_case_columns) |>
    dplyr::rename(all_of(renamings)) |>
    dplyr::rename_with(standard_column_names)
}

# consolidates the set of date fields that exist as three columns and follow a
# naming pattern into their own date columns and drops the three source columns
datepart_fields_mutations <- function(df) {
  event_names <- c('arrest', 'prelim_hearing', 'indictment', 'arraignment',
                   'trial', 'sentence')
  datepart_names <- c('year_of', 'month_of', 'day_of')
  # new columns we're creating
  new_field_names <- map(event_names, function(e){
    sym(paste0(e, '_date'))
  })

  drop_columns <- unname(unlist(map(event_names, function(e){
    lapply(datepart_names, function(d){
      paste0(d, '_', e)
    })
  })))

  operations <- map(event_names, function(e){
    args <- map(datepart_names, function(d){
      var_name <- paste0(d, "_", e)
      quo(purrr::pluck(!!sym(var_name), 1))
    })
    quo(inky_dataset_date_format(!!!args))
  })
  args <- setNames(
    operations,
    new_field_names
  )
  df |>
    dplyr::mutate(!!!args) |>
    dplyr::select(-all_of(drop_columns))
}

# a list of manual column renamings
renamings <- c(
  "race_defendant" = "race_1_white_2_black_3_other",
  "prior_arrests" = "prior_arrests_1_yes_2_no",
  "transcript_available" = "transcript_1_yes_2_no_3_copied",
  "race_judge" = "race_of_judge_1_black_2_white",
  "jury_trial" = "jury_trial_1_yes_2_no",
  "had_transcript" = "was_there_a_transcript_1_yes_2_no",
  "party_of_judge" = "party_of_judge_1_dem_2_rep",
  "was_judge_a_former_da" = "was_judge_a_former_da_1_yes_2_no",
  "race_of_victim" = "race_of_victim_1_white_2_black_3_other",
  "data_compiled_by" = "data_compiled_by_1_barlett_2_steele",
  "sex_of_victim" = "sex_of_victim_1_male_2_female",
  "sex_of_defendant" = "sex_of_defendant_1_male_2_female"
)

# do a buncha transforms to consolidate columns
inky_df <- inky_df_src |>
  inky_rename_columns() |>
  # turns all the spss integers into their factor values
  haven::as_factor() |>
  # now that we've got some nicer column names and values to work with, add some
  # nicer data types
  dplyr::mutate(
    year_born = inky_dataset_year(year_born)
  ) |>
  dplyr::rowwise() |>
  datepart_fields_mutations() |>
  # incident date doesn't fit the pattern of the other fields as defined in
  # datepart_fields_mutations() so do that one manually
  dplyr::mutate(
    incident_date = inky_dataset_date_format(
      year_of_incident_last_2_digits[1],
      month_of_incident_1_thru_12[1],
      day_of_incident_1_thru_31[1])
  ) |>
  # drop these columns that were condensed into a single date column
  dplyr::select(-all_of(c(
    'year_of_incident_last_2_digits',
    'month_of_incident_1_thru_12',
    'day_of_incident_1_thru_31'
  ))) |>
  # drop these two columns which are unnamed and null
  dplyr::select(-all_of(c(
    'V67',
    'V138'
  )))

col_names <- names(inky_df)
disposition_cols <- names(inky_df[grepl('^disposition', col_names)])
probation_cols <- names(inky_df[grepl('^probation', col_names)])
min_sentence_cols <- names(inky_df[grepl('^minimum_sentence', col_names)])
max_sentence_cols <- names(inky_df[grepl('^maximum_sentence', col_names)])
multi_sentence_cols <- names(inky_df[grepl('^multiple_sentence', col_names)])

sentencing_columns <- c(
  min_sentence_cols,
  max_sentence_cols,
  multi_sentence_cols)

# for debugging-- collect all the names of columns that have a value, summarize
# as one text field
summary_column <- function(pattern, columns) {
  return(function(v){
    s <- stringr::str_sort(columns[!is.na(v)])
    paste0(gsub(pattern, '', s), collapse='||')
  })
}

summary_columns <- c('charge_dispositions_summary', 'charge_probations_summary', 'charge_minimum_sentences_summary', 'charge_maximum_sentences_summary', 'charge_multi_sentences_summary')

inky_df <- inky_df |>
  rowwise() |>
  # for debugging-- put any values we'll be lengthening into one column so it's easier to digest what data is and is not present for each row
  mutate(
    charge_dispositions_summary = summary_column('^disposition_', disposition_cols)(c_across(all_of(disposition_cols))),
    charge_probations_summary = summary_column('^probation_', probation_cols)(c_across(all_of(probation_cols))),
    charge_minimum_sentences_summary = summary_column('^minimum_sentence_', min_sentence_cols)(c_across(all_of(min_sentence_cols))),
    charge_maximum_sentences_summary = summary_column('^maximum_sentence_', max_sentence_cols)(c_across(all_of(max_sentence_cols))),
    charge_multi_sentences_summary = summary_column('^multiple_sentence_', max_sentence_cols)(c_across(all_of(max_sentence_cols)))
  )


# create a dispositions table with three columns: unique_case_id, charge,
# disposition_outcome
# example row:
# unique_case_id | charge             | disposition_outcome
# ---------------|--------------------|--------------------
# 7466801        | murder             | FND GLTY-LSSR OFFNS
dispositions <- inky_df[c('unique_case_id', disposition_cols)] |>
  pivot_longer(
    cols = all_of(disposition_cols),
    names_to = c("charge"),
    values_to = c("disposition_outcome"),
    names_transform = list(
      charge = function(v){
        purrr::map_chr(v, function(c){
          gsub('disposition_', '', c)
        })
      }
    )
  ) |>
  filter(!is.na(disposition_outcome)) |>
  dplyr::distinct()

# create a probations table with three columns: unique_case_id, charge,
# probation
# example table:
# unique_case_id | charge             | probation
# ---------------|--------------------|--------------------
# 7466801        | murder             | YES-NO TIME
# 7466802        | murder             | YES-WITH TIME
#
# YES-NO TIME: yes, with no time in jail
# YES-WITH TIME: yes, after time served
# a YES-WITH TIME value can indicate that jail time was sentenced for another
# charge in the same case. you can see these when joining on the sentences table
#
# a note about probations, from the codebook pdf included with the dataset:
#
#   treat missing data cautiously! For example, a code of 99 (na) on the
#   probation items may refer to 'not applicable', 'no probation', or 'missing
#   information'. Not applicables can, with a bit of effort, be manipulated into
#   a separate category. There is however, no viable means of separating the
#   rest of the 99's into one category consisting of 'no probation' and another
#   consisting of 'not ascertained' (which may include individuals who did or
#   did no receive probation). In other words, do not assume that everyone in
#   the residual category did not receive a probationary sentence
#
# with that caveat, this table is probably not useful for any aggregate analysis
probations <- inky_df[c('unique_case_id', probation_cols)] |>
  pivot_longer(
    cols = all_of(probation_cols),
    names_to = c("charge"),
    values_to = c("probation"),
    names_transform = list(
      charge = function(v){
        purrr::map_chr(v, function(c){
          gsub('probation_', '', c)
        })
      }
    )
  ) |>
  dplyr::filter(!is.na(probation)) |>
  dplyr::distinct()

# CPI index value, Dec 2022
# https://fred.stlouisfed.org/release/tables?rid=10&eid=36163#
# https://research.stlouisfed.org/publications/page1-econ/2023/01/03/adjusting-for-inflation
# default values are current: Dec 2022, past: Dec 1971
inflation_adjustment <- function(v,
                                 current_idx_value=294.883,
                                 past_index_value=43.300) {
  v * (current_idx_value / past_index_value)
}

bail_amounts <- inky_df[c('unique_case_id', 'bail_in_wob_999999_nominal_000001', 'charge_dispositions_summary')] |>
  dplyr::rename(
    'amount'='bail_in_wob_999999_nominal_000001'
  ) |>
  # not sure what a null amount means, so remove
  dplyr::filter(!is.na(amount)) |>
  mutate (
    # value is actually 99999, note 999999 as noted in original column label
    held_without_bail = if_else(amount == 99999, TRUE, FALSE),
    real_amount = if_else(amount == 99999,
                          Inf,
                          inflation_adjustment(amount))
  ) |>
  dplyr::distinct()

multi_sentences <- inky_df_src |>
  inky_rename_columns() |>
  mutate(
    across(
      all_of(multi_sentence_cols),
      haven::as_factor
    )
  )

# a table describing concurrent sentencing (if any) by charge and unique_case_id
# example table:
# unique_case_id | charge              | concurrency
# ---------------+---------------------+---------------------
# 26792592       | assault_and_battery | YES-CONCURRENT TERMS
# 26792592       | aggravated_robbery  | NO MULT SENT
# 36052400       | attempted_larceny   | YES-BOTH
multi_sentences <- multi_sentences[c('unique_case_id', multi_sentence_cols)] |>
  pivot_longer(
    cols = all_of(multi_sentence_cols),
    names_to = c("charge"),
    values_to = c("concurrency"),
    names_transform = list(
      charge = function(v){
        purrr::map_chr(v, function(c){
          gsub('multiple_sentence_', '', c)
        })
      }
    )
  ) |>
  filter(!is.na(concurrency))  |>
  dplyr::distinct()

# transforms for a sentences table. in the wide version of the table, create
# columns for the lower and upper bound (in seconds) of the sentencing and add a
# description column by converting the wide column values into factors
sentences_src <- inky_df_src |>
  inky_rename_columns() |>
  mutate(
    across(
      all_of(c(min_sentence_cols, max_sentence_cols)),
      .fns = list(
        lower_bound = function(v){
          sentence_min_bounds[v]
        },
        upper_bound = function(v){
          sentence_max_bounds[v]
        },
        description = haven::as_factor
      ),
      .names = "{.col}||{.fn}"
    )
  ) |>
  # drop the columns that have been converted into bounds and descriptions
  dplyr::select(-all_of(c(
    min_sentence_cols,
    max_sentence_cols
  ))) |>
  # rename the columns to a format where values are separated by '||'
  dplyr::rename_with(function(v){
    purrr::map_chr(v, function(s) {
      if(grepl('^minimum|^maximum', s)){
        return(sub('sentence_', 'sentence||', s))
      }
      s
    })
  })


sentence_details_cols <- names(sentences_src[
  grepl('minimum_sentence|maximum_sentence',  names(sentences_src))])

# sentences spec for pivot_longer
# .name | .value | charge | sentence_type
# example row:
# minimum_sentence||murder||lower_bound | lower_bound | murder | minimum_sentence
sentences_spec_rows <- purrr::map(sentence_details_cols, function(e){
  c(sentence_type, charge, value) %<-% as.list(strsplit(e, '\\|\\|')[[1]])
  c(.name=e, .value=value, charge=charge, sentence_type=sentence_type)
})

sentences_spec <- as.data.frame(do.call(rbind, sentences_spec_rows))

# create our sentences table:
# unique_case_id, sentence type, charge, lower_bound, upper_bound, description
sentences <- sentences_src[c('unique_case_id', sentence_details_cols)] |>
  tidyr::pivot_longer_spec(sentences_spec) |>
  # in our lengthening, we've created a lot of what are null rows so drop any
  # rows where description is na
  dplyr::filter(!is.na(description)) |>
  dplyr::distinct()


# create a cases_inky table that's a little easier to digest: minus all the columns
# that are now lengthened in different tables.
# note the the unique_case_id won't be unique in this table-- there can be
# multiple rows if there were multiple charges
cases_inky <- inky_df |>
  dplyr::select(-all_of(c(
    sentencing_columns,
    disposition_cols,
    probation_cols
  )))
	library(arrow)
	library(daoviz)
	library(dplyr)
	library(ggplot2)
	library(gt)
	library(haven)
	library(lubridate)
	library(purrr)
	library(snakecase)
	library(tidyverse)
	library(zeallot)

	################################################################################
	# notes on the data from the Inquirer stories
	################################################################################
	#
	#
	# from "Public Officals Keep Alive the Myth of 'Equal Justice'", Sunday Feb. 18,
	# 1973:
	#
	# The investigation and computer study centered on the cases of 1034 persons
	# who were indicted during 1971 for at least one of the four major crimes of
	# violence -- murder, rape, aggravated robbery and aggravated assault and
	# battery
	#
	# Using information culled from more than 10,000 court documents and 20,000
	# pages of transcripts of court proceedings, Inquirer reporters traced from
	# beginning to end -- from commission of the crime to trial and sentencing --
	# the cases of the 1,034 defendants
	#
	#
	# from "Court Story Build on 75,000 Facts", Sunday Feb. 18, 1973:
	#
	# The usual way that prosecutors and the courts maintain statistics --
	# computing convictions and acquittals -- is by the individual defendant, but
	# the Inquirer survey measured the outcome of cases from the viewpoint of the
	# victim.
	#
	# Although there were 1,034 defendants in the study, the number of cases in
	# the computer analysis totaled 1,374. The bulk of the difference represents
	# those person who were indicted for two or more separate, major crimes
	# against different victims.
	#
	# The cases of the 1,034 defendants represent a 39 percent sampling of all
	# persons indicted in 1971 for one of the four violent crimes.


	# load spss files
	inquirer_data_path <- daocore::get_data_path(
	'1973_inquirer_series/knight_s609_spss.por')

	inky_df_src <- haven::read_spss(inquirer_data_path, user_na=FALSE,
	.name_repair = 'universal')

	column_names <- names(inky_df_src)
	# var labels are a descriptive name for the column
	var_labels <- labelled::var_label(inky_df_src)
	# val labels are like an enum datatype in a db
	val_labels <- labelled::val_labels(inky_df_src)

	# there are some fields in the data that are missing val labels but have what
	# can be parsed as val labels in the field name. examples:
	# "SEX OF VICTIM: 1-MALE,2-FEMALE"
	# "RACE:1-WHITE,2-BLACK,3-OTHER"
	might_be_enum_field = ".+(\\:\|\\?)+(\\s)*1"

	# for columns that don't have val labels, try and parse out what should be val
	# labels from the column name. These columns follow the format of the `might_be_enum_field` regex above
	parse_val_labels_from_field_name <- function(s){
	parts <- stringr::str_split(s, '\\:\|\\?')
	# toss out the field name, keep the enum-ish string
	vals <- purrr::pluck(parts, 1, 2)
	if(is.null(vals)){
	return(vals)
	}
	items <- lapply(as.list(strsplit(vals, ',')[[1]]), stringr::str_trim)
	reduce(items, function(acc, v){
	sp <- as.list(strsplit(v, '-')[[1]])
	acc[purrr::pluck(sp, 2)] <- as.integer(purrr::pluck(sp, 1))
	acc
	}, .init=c())
	}

	# the inky dataset columns are two digit year
	inky_dataset_date_format <- function(y,m,d){
	lubridate::make_date(y + 1900, m, d)
	}

	# makes a two digit year into a four digit year depending on the specified
	# cutoff. for example the year born column is two digits, if the two digit year
	# is gt cutoff 70, assume 1800
	inky_dataset_year <- function(y, cutoff=70) {
	dplyr::if_else(y > cutoff, y + 1800, y + 1900)
	}

	# this augments the set of labels that came with the dataset. some columns have
	# val labels we can use, but some that should have them are missing them. for
	# some of these columns we can use the value of the var_label for the column to
	# come up with the missing val labels. (defendant race is an example in the
	# dataset)
	labels <- lapply(column_names, function(l){
	key <- as.character(l)
	val <- purrr::pluck(var_labels, key, .default=l)
	label <- purrr::pluck(val_labels, key)
	if(!is.null(label)){
	return(label)
	}
	if(grepl(might_be_enum_field, val)){
	return(parse_val_labels_from_field_name(val))
	}
	})

	# make our labels list a named list using the column names (`V${integer}`)
	names(labels) <- column_names
	labelled::val_labels(inky_df_src) <- labels

	# some column names reference other column names using the "V${integer} style;
	# this function replaces any of those appearances with the readable column name
	# example: "DISPOSITION V94, V104, INTENT TO MURDER"
	# note that a returned referenced column may also have references of its own
	dereference_column_name <- function(col_name){
	refs <- unlist(stringr::str_match_all(col_name, 'V\\d+'))
	if(purrr::is_empty(refs)){
	return(col_name)
	}
	lapply(refs, function(ref){
	val <- dereference_column_name(purrr::pluck(var_labels, ref, .default=''))
	col_name <<- gsub(ref, val, col_name)
	})
	# all the referenced cols are disposition columns, remove only instances of
	# that word that have whitespace before it-- these occur within the concat'd
	# referenced column names
	return(gsub(' DISPOSITION', '', col_name))
	}


	# the codebook pdf says that in addition to values 0 thru 9 as defined here,
	# there's also life sentence "++" but it doesn't seem like that exists in the
	# dataset
	sentence_enum_to_day_range <- list(
	# sentence suspended
	list(ddays(0), ddays(0)),
	# Fines and costs
	list(ddays(0), ddays(0)),
	list(ddays(1), dmonths(3)),
	list(dmonths(4), dmonths(6)),
	list(dmonths(7), dmonths(12)),
	list(dmonths(13), dyears(2)),
	list(dyears(2) + ddays(1), dyears(5)),
	list(dyears(5) + ddays(1), dyears(10)),
	list(dyears(10) + ddays(1), dyears(20)),
	list(dyears(20) + ddays(1), dyears(Inf)),
	# life
	list(dyears(Inf), dyears(Inf))
	)

	sentence_min_bounds <- unlist(
	map(sentence_enum_to_day_range, function(x){x[1]}))
	sentence_max_bounds <- unlist(
	map(sentence_enum_to_day_range, function(x){x[2]}))

	# use a snake-cased version of the columns var labels so the fields are a little
	# more readable
	formatted_col_names <- lapply(column_names, function(e){
	val <- purrr::pluck(var_labels, as.character(e), .default=e)
	val <- dereference_column_name(val)
	# special case to handle the unlabeled columns which we'll ignore
	if(val==''){
	return(e)
	}
	snakecase::to_snake_case(val)
	})

	names(formatted_col_names) <- column_names

	snake_case_columns <- function(v){
	purrr::map_chr(v, function(s) {
	purrr::pluck(formatted_col_names, s)
	})
	}

	# some column names are abbreviated, expand those abbreviations in this function
	standard_column_names <- function(v){
	gsub('aggr_a_b', 'aggr_assault_and_battery', v)
	}

	# wrap up all our column renamings into one function
	inky_rename_columns <- function(df) {
	df \|>
	dplyr::rename_with(snake_case_columns) \|>
	dplyr::rename(all_of(renamings)) \|>
	dplyr::rename_with(standard_column_names)
	}

	# consolidates the set of date fields that exist as three columns and follow a
	# naming pattern into their own date columns and drops the three source columns
	datepart_fields_mutations <- function(df) {
	event_names <- c('arrest', 'prelim_hearing', 'indictment', 'arraignment',
	'trial', 'sentence')
	datepart_names <- c('year_of', 'month_of', 'day_of')
	# new columns we're creating
	new_field_names <- map(event_names, function(e){
	sym(paste0(e, '_date'))
	})

	drop_columns <- unname(unlist(map(event_names, function(e){
	lapply(datepart_names, function(d){
	paste0(d, '_', e)
	})
	})))

	operations <- map(event_names, function(e){
	args <- map(datepart_names, function(d){
	var_name <- paste0(d, "_", e)
	quo(purrr::pluck(!!sym(var_name), 1))
	})
	quo(inky_dataset_date_format(!!!args))
	})
	args <- setNames(
	operations,
	new_field_names
	)
	df \|>
	dplyr::mutate(!!!args) \|>
	dplyr::select(-all_of(drop_columns))
	}

	# a list of manual column renamings
	renamings <- c(
	"race_defendant" = "race_1_white_2_black_3_other",
	"prior_arrests" = "prior_arrests_1_yes_2_no",
	"transcript_available" = "transcript_1_yes_2_no_3_copied",
	"race_judge" = "race_of_judge_1_black_2_white",
	"jury_trial" = "jury_trial_1_yes_2_no",
	"had_transcript" = "was_there_a_transcript_1_yes_2_no",
	"party_of_judge" = "party_of_judge_1_dem_2_rep",
	"was_judge_a_former_da" = "was_judge_a_former_da_1_yes_2_no",
	"race_of_victim" = "race_of_victim_1_white_2_black_3_other",
	"data_compiled_by" = "data_compiled_by_1_barlett_2_steele",
	"sex_of_victim" = "sex_of_victim_1_male_2_female",
	"sex_of_defendant" = "sex_of_defendant_1_male_2_female"
	)

	# do a buncha transforms to consolidate columns
	inky_df <- inky_df_src \|>
	inky_rename_columns() \|>
	# turns all the spss integers into their factor values
	haven::as_factor() \|>
	# now that we've got some nicer column names and values to work with, add some
	# nicer data types
	dplyr::mutate(
	year_born = inky_dataset_year(year_born)
	) \|>
	dplyr::rowwise() \|>
	datepart_fields_mutations() \|>
	# incident date doesn't fit the pattern of the other fields as defined in
	# datepart_fields_mutations() so do that one manually
	dplyr::mutate(
	incident_date = inky_dataset_date_format(
	year_of_incident_last_2_digits[1],
	month_of_incident_1_thru_12[1],
	day_of_incident_1_thru_31[1])
	) \|>
	# drop these columns that were condensed into a single date column
	dplyr::select(-all_of(c(
	'year_of_incident_last_2_digits',
	'month_of_incident_1_thru_12',
	'day_of_incident_1_thru_31'
	))) \|>
	# drop these two columns which are unnamed and null
	dplyr::select(-all_of(c(
	'V67',
	'V138'
	)))

	col_names <- names(inky_df)
	disposition_cols <- names(inky_df[grepl('^disposition', col_names)])
	probation_cols <- names(inky_df[grepl('^probation', col_names)])
	min_sentence_cols <- names(inky_df[grepl('^minimum_sentence', col_names)])
	max_sentence_cols <- names(inky_df[grepl('^maximum_sentence', col_names)])
	multi_sentence_cols <- names(inky_df[grepl('^multiple_sentence', col_names)])

	sentencing_columns <- c(
	min_sentence_cols,
	max_sentence_cols,
	multi_sentence_cols)

	# for debugging-- collect all the names of columns that have a value, summarize
	# as one text field
	summary_column <- function(pattern, columns) {
	return(function(v){
	s <- stringr::str_sort(columns[!is.na(v)])
	paste0(gsub(pattern, '', s), collapse='\|\|')
	})
	}

	summary_columns <- c('charge_dispositions_summary', 'charge_probations_summary', 'charge_minimum_sentences_summary', 'charge_maximum_sentences_summary', 'charge_multi_sentences_summary')

	inky_df <- inky_df \|>
	rowwise() \|>
	# for debugging-- put any values we'll be lengthening into one column so it's easier to digest what data is and is not present for each row
	mutate(
	charge_dispositions_summary = summary_column('^disposition_', disposition_cols)(c_across(all_of(disposition_cols))),
	charge_probations_summary = summary_column('^probation_', probation_cols)(c_across(all_of(probation_cols))),
	charge_minimum_sentences_summary = summary_column('^minimum_sentence_', min_sentence_cols)(c_across(all_of(min_sentence_cols))),
	charge_maximum_sentences_summary = summary_column('^maximum_sentence_', max_sentence_cols)(c_across(all_of(max_sentence_cols))),
	charge_multi_sentences_summary = summary_column('^multiple_sentence_', max_sentence_cols)(c_across(all_of(max_sentence_cols)))
	)


	# create a dispositions table with three columns: unique_case_id, charge,
	# disposition_outcome
	# example row:
	# unique_case_id \| charge \| disposition_outcome
	# ---------------\|--------------------\|--------------------
	# 7466801 \| murder \| FND GLTY-LSSR OFFNS
	dispositions <- inky_df[c('unique_case_id', disposition_cols)] \|>
	pivot_longer(
	cols = all_of(disposition_cols),
	names_to = c("charge"),
	values_to = c("disposition_outcome"),
	names_transform = list(
	charge = function(v){
	purrr::map_chr(v, function(c){
	gsub('disposition_', '', c)
	})
	}
	)
	) \|>
	filter(!is.na(disposition_outcome)) \|>
	dplyr::distinct()

	# create a probations table with three columns: unique_case_id, charge,
	# probation
	# example table:
	# unique_case_id \| charge \| probation
	# ---------------\|--------------------\|--------------------
	# 7466801 \| murder \| YES-NO TIME
	# 7466802 \| murder \| YES-WITH TIME
	#
	# YES-NO TIME: yes, with no time in jail
	# YES-WITH TIME: yes, after time served
	# a YES-WITH TIME value can indicate that jail time was sentenced for another
	# charge in the same case. you can see these when joining on the sentences table
	#
	# a note about probations, from the codebook pdf included with the dataset:
	#
	# treat missing data cautiously! For example, a code of 99 (na) on the
	# probation items may refer to 'not applicable', 'no probation', or 'missing
	# information'. Not applicables can, with a bit of effort, be manipulated into
	# a separate category. There is however, no viable means of separating the
	# rest of the 99's into one category consisting of 'no probation' and another
	# consisting of 'not ascertained' (which may include individuals who did or
	# did no receive probation). In other words, do not assume that everyone in
	# the residual category did not receive a probationary sentence
	#
	# with that caveat, this table is probably not useful for any aggregate analysis
	probations <- inky_df[c('unique_case_id', probation_cols)] \|>
	pivot_longer(
	cols = all_of(probation_cols),
	names_to = c("charge"),
	values_to = c("probation"),
	names_transform = list(
	charge = function(v){
	purrr::map_chr(v, function(c){
	gsub('probation_', '', c)
	})
	}
	)
	) \|>
	dplyr::filter(!is.na(probation)) \|>
	dplyr::distinct()

	# CPI index value, Dec 2022
	# https://fred.stlouisfed.org/release/tables?rid=10&eid=36163#
	# https://research.stlouisfed.org/publications/page1-econ/2023/01/03/adjusting-for-inflation
	# default values are current: Dec 2022, past: Dec 1971
	inflation_adjustment <- function(v,
	current_idx_value=294.883,
	past_index_value=43.300) {
	v * (current_idx_value / past_index_value)
	}

	bail_amounts <- inky_df[c('unique_case_id', 'bail_in_wob_999999_nominal_000001', 'charge_dispositions_summary')] \|>
	dplyr::rename(
	'amount'='bail_in_wob_999999_nominal_000001'
	) \|>
	# not sure what a null amount means, so remove
	dplyr::filter(!is.na(amount)) \|>
	mutate (
	# value is actually 99999, note 999999 as noted in original column label
	held_without_bail = if_else(amount == 99999, TRUE, FALSE),
	real_amount = if_else(amount == 99999,
	Inf,
	inflation_adjustment(amount))
	) \|>
	dplyr::distinct()

	multi_sentences <- inky_df_src \|>
	inky_rename_columns() \|>
	mutate(
	across(
	all_of(multi_sentence_cols),
	haven::as_factor
	)
	)

	# a table describing concurrent sentencing (if any) by charge and unique_case_id
	# example table:
	# unique_case_id \| charge \| concurrency
	# ---------------+---------------------+---------------------
	# 26792592 \| assault_and_battery \| YES-CONCURRENT TERMS
	# 26792592 \| aggravated_robbery \| NO MULT SENT
	# 36052400 \| attempted_larceny \| YES-BOTH
	multi_sentences <- multi_sentences[c('unique_case_id', multi_sentence_cols)] \|>
	pivot_longer(
	cols = all_of(multi_sentence_cols),
	names_to = c("charge"),
	values_to = c("concurrency"),
	names_transform = list(
	charge = function(v){
	purrr::map_chr(v, function(c){
	gsub('multiple_sentence_', '', c)
	})
	}
	)
	) \|>
	filter(!is.na(concurrency)) \|>
	dplyr::distinct()

	# transforms for a sentences table. in the wide version of the table, create
	# columns for the lower and upper bound (in seconds) of the sentencing and add a
	# description column by converting the wide column values into factors
	sentences_src <- inky_df_src \|>
	inky_rename_columns() \|>
	mutate(
	across(
	all_of(c(min_sentence_cols, max_sentence_cols)),
	.fns = list(
	lower_bound = function(v){
	sentence_min_bounds[v]
	},
	upper_bound = function(v){
	sentence_max_bounds[v]
	},
	description = haven::as_factor
	),
	.names = "{.col}\|\|{.fn}"
	)
	) \|>
	# drop the columns that have been converted into bounds and descriptions
	dplyr::select(-all_of(c(
	min_sentence_cols,
	max_sentence_cols
	))) \|>
	# rename the columns to a format where values are separated by '\|\|'
	dplyr::rename_with(function(v){
	purrr::map_chr(v, function(s) {
	if(grepl('^minimum\|^maximum', s)){
	return(sub('sentence_', 'sentence\|\|', s))
	}
	s
	})
	})


	sentence_details_cols <- names(sentences_src[
	grepl('minimum_sentence\|maximum_sentence', names(sentences_src))])

	# sentences spec for pivot_longer
	# .name \| .value \| charge \| sentence_type
	# example row:
	# minimum_sentence\|\|murder\|\|lower_bound \| lower_bound \| murder \| minimum_sentence
	sentences_spec_rows <- purrr::map(sentence_details_cols, function(e){
	c(sentence_type, charge, value) %<-% as.list(strsplit(e, '\\\|\\\|')[[1]])
	c(.name=e, .value=value, charge=charge, sentence_type=sentence_type)
	})

	sentences_spec <- as.data.frame(do.call(rbind, sentences_spec_rows))

	# create our sentences table:
	# unique_case_id, sentence type, charge, lower_bound, upper_bound, description
	sentences <- sentences_src[c('unique_case_id', sentence_details_cols)] \|>
	tidyr::pivot_longer_spec(sentences_spec) \|>
	# in our lengthening, we've created a lot of what are null rows so drop any
	# rows where description is na
	dplyr::filter(!is.na(description)) \|>
	dplyr::distinct()


	# create a cases_inky table that's a little easier to digest: minus all the columns
	# that are now lengthened in different tables.
	# note the the unique_case_id won't be unique in this table-- there can be
	# multiple rows if there were multiple charges
	cases_inky <- inky_df \|>
	dplyr::select(-all_of(c(
	sentencing_columns,
	disposition_cols,
	probation_cols
	)))