chris-prener/parse-bocks.R

## parse-bocks.R
# Parse Census Bureau address range data

# Problem:
# We need a way to match incidents (that have address-level data) to the blockface they occur on.
# By blockface, I mean the houses on either side of a street between two cross streets (i.e.
# the 100-block of Main Street between 1st and 2nd Avenues).
#
# Typically, census block and city block shapefiles do not represent a blockface. Instead,
# they have parts of up to four different streets, typically representing half of a blockface
# for each of the included streets. So a block bounded by Main Street on the south, 1st Avenue
# on the west, Washington Street on the north, and 2nd Avenue on the east would contain addresses
# from each of the four streets that form the block's boundaries. Joining addresses to traditional
# blocks therefore produces entities that are distinct from the way we think about a blockface.
#
# We need to be able to do two things - identify incidents that occur on "marked" blocks, and then
# also produce counts of incidents for all blocks.
#
# The Census Bureau publishes something that could be helpful - line data with block segments.
# These include the low and high address numbers for both sides of the street (i.e. the blockface).
# However, I need a quick way to apply the blockface identification numbers to individual
# addresses in the incident data.

# dependencies
library(dplyr)
library(purrr)
library(stringr)
library(tidyr)

# Expand a low/high house-number pair into the individual house numbers on one
# side of a blockface.
#
# x: a vector (character or numeric) whose first two values are the low and
#    high house numbers, in that order; they are coerced with as.numeric().
#
# Returns a numeric vector holding every other number starting at the first
# value and stepping toward the second (parity therefore follows the first
# value). Ranges stored high-to-low are walked downward, and an empty vector is
# returned when either bound is missing, so a side of the street without
# addresses simply contributes no house numbers. Stops with an error when fewer
# than two values are supplied.
parse_range <- function(x){

  # a range needs both a low and a high value
  if (length(x) < 2L) {
    stop("`x` must contain a low and a high value", call. = FALSE)
  }

  # convert item to numeric; only the first two values are used
  bounds <- as.numeric(x[1:2])

  # a missing bound means there is nothing to enumerate
  if (anyNA(bounds)) {
    return(numeric(0))
  }

  # step by two to stay on one side of the street, counting down when the
  # range is stored high-to-low (seq.int() errors if the sign of `by` is wrong)
  step <- if (bounds[2] < bounds[1]) -2 else 2

  # expand to every other value between the two bounds
  seq.int(from = bounds[1], to = bounds[2], by = step)

}

# sample block data - one row per blockface, giving the low and high house
# numbers for the right and left sides of the street
blocks <- tribble(
  ~bfId, ~rightLow, ~rightHigh, ~leftLow, ~leftHigh, ~street,   ~marked,
      1,       400,        498,      401,       499, "Main St", TRUE,
      2,       500,        598,      501,       599, "Main St", FALSE,
      3,       600,        698,      601,       699, "Main St", FALSE,
      4,       700,        798,      701,       799, "Main St", FALSE
)

# sample incident data - one row per call, located by its street address
incidents <- tribble(
  ~callId, ~address,      ~date,     ~call,
      101, "424 Main St", "1/1/14",  "Graffiti",
      102, "447 Main St", "3/6/14",  "Graffiti",
      103, "504 Main St", "5/12/14", "Pothole",
      104, "667 Main St", "4/19/14", "Lights Out",
      105, "773 Main St", "2/12/14", "Vacant Building"
)

# convert ranges into individual records, right side of street:
# one row per house number, carrying the blockface id, street, and marked flag
right <- blocks %>%
  # expand each low/high pair directly instead of pasting the two values into
  # a "low-high" string and splitting it apart again
  mutate(
    house = map2(
      .x = rightLow,
      .y = rightHigh,
      .f = function(low, high) parse_range(c(low, high))
    )
  ) %>%
  # name the list-column explicitly; a bare unnest() is deprecated in tidyr 1.0+
  unnest(house) %>%
  select(bfId, house, street, marked)

# convert ranges into individual records, left side of street:
# one row per house number, carrying the blockface id, street, and marked flag
left <- blocks %>%
  # expand each low/high pair directly instead of pasting the two values into
  # a "low-high" string and splitting it apart again
  mutate(
    house = map2(
      .x = leftLow,
      .y = leftHigh,
      .f = function(low, high) parse_range(c(low, high))
    )
  ) %>%
  # name the list-column explicitly; a bare unnest() is deprecated in tidyr 1.0+
  unnest(house) %>%
  select(bfId, house, street, marked)

# combine left and right side of street data and build the full address
# string ("<house> <street>") that incidents are matched on
master <- bind_rows(right, left) %>%
  arrange(bfId, house) %>%
  # house is stored as a double; paste it as an integer so that round values
  # such as 100000 do not turn into "1e+05 Main St" and fail to match
  mutate(address = str_c(as.integer(house), street, sep = " "))

# attach the blockface id and the logical "marked" indicator to each incident
# by matching on the full address string; every incident row is kept
incidentsWithBlock <- left_join(
  incidents,
  select(master, bfId, address, marked),
  by = "address"
)

# calculate counts per blockface, keeping every blockface in the output
countsByBlock <- incidentsWithBlock %>%
  group_by(bfId) %>%
  summarise(count = n()) %>%
  left_join(blocks, ., by = "bfId") %>%
  # blockfaces with no incidents have no summary row to join to; report them
  # as 0 rather than NA
  mutate(count = coalesce(count, 0L))
	# Parse Census Bureau address range data

	# Problem:
	# We need a way to match incidents (that have address-level data) to the blockface they occur on.
	# By blockface, I mean the houses on either side of a street between two cross streets (i.e.
	# the 100-block of Main Street between 1st and 2nd Avenues).
	#
	# Typically, census block and city block shapefiles do not represent a blockface. Instead,
	# they have parts of up to four different streets, typically representing half of a blockface
	# for each of the included streets. So a block bounded by Main Street on the south, 1st Avenue
	# on the west, Washington Street on the north, and 2nd Avenue on the east would contain addresses
	# from each of the four streets that form the block's boundaries. Joining addresses to traditional
	# blocks therefore produces entities that are distinct from the way we think about a blockface.
	#
	# We need to be able to do two things - identify incidents that occur on "marked" blocks, and then
	# also produce counts of incidents for all blocks.
	#
	# The Census Bureau publishes something that could be helpful - line data with block segments.
	# These include the low and high address numbers for both sides of the street (i.e. the blockface).
	# However, I need a quick way to apply the blockface identification numbers to individual
	# addresses in the incident data.

	# dependencies
	library(dplyr)
	library(purrr)
	library(stringr)
	library(tidyr)

	# Expand a low/high house-number pair into the individual house numbers on one
	# side of a blockface.
	#
	# x: a vector (character or numeric) whose first two values are the low and
	#    high house numbers, in that order; they are coerced with as.numeric().
	#
	# Returns a numeric vector holding every other number starting at the first
	# value and stepping toward the second (parity therefore follows the first
	# value). Ranges stored high-to-low are walked downward, and an empty vector is
	# returned when either bound is missing, so a side of the street without
	# addresses simply contributes no house numbers. Stops with an error when fewer
	# than two values are supplied.
	parse_range <- function(x){

	  # a range needs both a low and a high value
	  if (length(x) < 2L) {
	    stop("`x` must contain a low and a high value", call. = FALSE)
	  }

	  # convert item to numeric; only the first two values are used
	  bounds <- as.numeric(x[1:2])

	  # a missing bound means there is nothing to enumerate
	  if (anyNA(bounds)) {
	    return(numeric(0))
	  }

	  # step by two to stay on one side of the street, counting down when the
	  # range is stored high-to-low (seq.int() errors if the sign of `by` is wrong)
	  step <- if (bounds[2] < bounds[1]) -2 else 2

	  # expand to every other value between the two bounds
	  seq.int(from = bounds[1], to = bounds[2], by = step)

	}

	# sample block data - one row per blockface, giving the low and high house
	# numbers for the right and left sides of the street
	blocks <- tribble(
	  ~bfId, ~rightLow, ~rightHigh, ~leftLow, ~leftHigh, ~street,   ~marked,
	      1,       400,        498,      401,       499, "Main St", TRUE,
	      2,       500,        598,      501,       599, "Main St", FALSE,
	      3,       600,        698,      601,       699, "Main St", FALSE,
	      4,       700,        798,      701,       799, "Main St", FALSE
	)

	# sample incident data - one row per call, located by its street address
	incidents <- tribble(
	  ~callId, ~address,      ~date,     ~call,
	      101, "424 Main St", "1/1/14",  "Graffiti",
	      102, "447 Main St", "3/6/14",  "Graffiti",
	      103, "504 Main St", "5/12/14", "Pothole",
	      104, "667 Main St", "4/19/14", "Lights Out",
	      105, "773 Main St", "2/12/14", "Vacant Building"
	)

	# convert ranges into individual records, right side of street:
	# one row per house number, carrying the blockface id, street, and marked flag
	right <- blocks %>%
	  # expand each low/high pair directly instead of pasting the two values into
	  # a "low-high" string and splitting it apart again
	  mutate(
	    house = map2(
	      .x = rightLow,
	      .y = rightHigh,
	      .f = function(low, high) parse_range(c(low, high))
	    )
	  ) %>%
	  # name the list-column explicitly; a bare unnest() is deprecated in tidyr 1.0+
	  unnest(house) %>%
	  select(bfId, house, street, marked)

	# convert ranges into individual records, left side of street:
	# one row per house number, carrying the blockface id, street, and marked flag
	left <- blocks %>%
	  # expand each low/high pair directly instead of pasting the two values into
	  # a "low-high" string and splitting it apart again
	  mutate(
	    house = map2(
	      .x = leftLow,
	      .y = leftHigh,
	      .f = function(low, high) parse_range(c(low, high))
	    )
	  ) %>%
	  # name the list-column explicitly; a bare unnest() is deprecated in tidyr 1.0+
	  unnest(house) %>%
	  select(bfId, house, street, marked)

	# combine left and right side of street data and build the full address
	# string ("<house> <street>") that incidents are matched on
	master <- bind_rows(right, left) %>%
	  arrange(bfId, house) %>%
	  # house is stored as a double; paste it as an integer so that round values
	  # such as 100000 do not turn into "1e+05 Main St" and fail to match
	  mutate(address = str_c(as.integer(house), street, sep = " "))

	# attach the blockface id and the logical "marked" indicator to each incident
	# by matching on the full address string; every incident row is kept
	incidentsWithBlock <- left_join(
	  incidents,
	  select(master, bfId, address, marked),
	  by = "address"
	)

	# calculate counts per blockface, keeping every blockface in the output
	countsByBlock <- incidentsWithBlock %>%
	  group_by(bfId) %>%
	  summarise(count = n()) %>%
	  left_join(blocks, ., by = "bfId") %>%
	  # blockfaces with no incidents have no summary row to join to; report them
	  # as 0 rather than NA
	  mutate(count = coalesce(count, 0L))