seasmith/houston_crime_data.R

## houston_crime_data.R
# A final version of the image created below
# can be found at: https://twitter.com/lksmth/status/892849692576329730

# -- Load dependencies

library(tidyverse)
library(stringi)
library(readxl)
library(rvest)

# -- Prep for download

# url to get data from
url <- "http://www.houstontx.gov/police/cs/crime-stats-archives.htm"

# Get links from url
links <- url %>%
  read_html() %>%
  html_nodes(xpath = "//div/a") %>%
  html_attr("href") %>%
  .[grepl("\\.xls$", .)]

# Remove beginning section of links and replace with base url
links <- stri_replace(links, "", regex = "^/police/cs/")
base  <- "http://www.houstontx.gov/police/cs/"
new_links <- paste0(base, links)

# Set destinations for Excel files
dest <- "hou_cr_data"  # you may want to change this
dir.create(dest)
dest_paths <- paste0(dest, stri_replace(links, "", regex = "^xls"))

# -- Download the data

# !This will take a while
#  Each file is about 2MB
#  There are 91 files
hou_cr_dwnld <- new_links %>%
  map2(dest_paths, function(x, y) {
  download.file(x, y, mode = "wb")
  })

# -- Import into R

# Declare ranges on certain files to avoid fread() errors
hou_cr <- list.files(dest, full.names = TRUE) %>%
  map(function(x) {
    if (grepl("jul09\\.xls$", x)) {
      read_xls(x, range = "A1:J12922")
    } else if (grepl("nov09\\.xls$", x)) {
      read_xls(x, range = "A1:J11903")
    } else if (grepl("sep09\\.xls$", x)) {
      read_xls(x, range = "A1:J12422")
    } else if (grepl("sep10\\.xls$", x)) {
      read_xls(x, range = "A1:J10921")
    } else {
      read_xls(x)
    }
  })

# -- Clean up tibbles
#    Tibbles have varying column names and number of columns

# Reorganize tibbles by number of columns
reorg <- function(dat) {
  map(unique(lens <- lengths(dat)), function(i) {
    dat[which(lens == i)]
  })
}

# Reorganize (returns nested list of tibbles)
hou_cr_reorg <- reorg(hou_cr)

# Find unique column name sets
nms <- hou_cr_reorg %>%
  at_depth(2, colnames) %>%
  at_depth(2, as.matrix) %>%
  map(function(x) t(reduce(x, cbind))) %>%
  map(as.data.frame) %>%
  map(as_tibble) %>%
  map(distinct)

  # Select "proper" column names
  nms_10 <- nms[[1]][1, ] %>% as_vector() %>% as.character()
  nms_11 <- nms[[2]] %>% as_vector() %>% as.character()
  nms_9  <- nms[[3]][1,] %>% as_vector() %>% as.character()
  nms_9[[9]] <- nms_10[[10]]  # need to rename `# Offenses` to `# Of Offenses`

# Set "proper" column names
hou_cr_reorg[1] <- hou_cr_reorg[[1]] %>%
  map(function(x) setNames(x, nms_10)) %>%
  list()
hou_cr_reorg[3] <- hou_cr_reorg[[3]] %>%
  map(function(x) setNames(x, nms_9)) %>%
  list()

# Remove "anomalous" column X__1
hou_cr_reorg[2] <- hou_cr_reorg[[2]] %>%
  map(function(x) select(x, -X__1)) %>%
  list()

# Add column "Premise" to tibbles that have only 9 columns
hou_cr_reorg[3] <- hou_cr_reorg[[3]] %>%
  map(function(x) add_column(x, Premise = NA_character_)) %>%
  list()

# Unionize all tibbles in hou_cr
hou_cr <- hou_cr_reorg %>%
  map(function(x) reduce(x, function(...) union_all(...))) %>%
  reduce(function(...) union_all(...))

# -- Plot

library(zoo)
library(forcats)
library(extrafont)
library(grid)
library(gridExtra)

loadfonts("win")

# Filtered data set
hou_cr_filtered <- hou_cr %>%
  filter(as.yearmon(Date) >= 2009.5 & as.yearmon(Date) < 2017) %>%
  filter(!is.na(as.yearmon(Date))) %>%
  filter(`Offense Type` != "1") %>%
  mutate(Hour = if_else(Hour == "24", "00", Hour)) %>%
  mutate(Hour = stri_replace(Hour, "", regex = "'"))

# Summary data
hou_cr_summ <- hou_cr_filtered %>%
  group_by(`Offense Type`,
           Date = as.yearmon(Date)) %>%
  summarize(`Total number of offenses` = sum(`# Of Offenses`, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(`Offense Type`) %>%
  mutate(`Number of offenses` = rollmean(`Total number of offenses`, 6, fill = NA, align = "right")) %>%
  ungroup()

my_colors <- c("#984807", "#e15c39", "#008348", "#4c7093", "#d6a300", "#ce1141", "#8E3694")

# - Lines
hou_plots$lines <- hou_cr_summ %>%
  ggplot(aes(Date)) +
  geom_line(aes(y = `Number of offenses`, group = `Offense Type`), size = 0.75) +
  geom_line(aes(y = `Total number of offenses`, color = `Offense Type`), size = 0.82) +
  facet_wrap(~forcats::fct_rev(reorder(`Offense Type`, `Total number of offenses`, sum)), scales = "free_y", ncol = 2) +
  scale_x_yearmon(breaks = c(2010, 2016),
                  minor_breaks = NULL) +
  scale_color_manual(values = c("#4c7093", my_green, my_orange, my_purple, my_red, "#d6a300", "#984807")) +
  guides(color = FALSE) +
  labs(title = paste0("Crime in Houston from July 2009 to",
                      "\nDecember 2016"),
       subtitle = paste0("Monthly totals and six-month lagging averages",
                         "\n(black line) show a drop in burglary but a rise",
                         "\nin rape and murder.",
                         "\n",
                         "\n"),
       caption = "\n ") +
  theme_minimal() +
  theme(text = element_text(family = "Open Sans", size = 16),
        panel.grid.minor = element_blank())

# - Roses
# Rose function
rose_diagrams <- function(var, clr) {
  p <- ggplot(filter(hou_cr_Hour_count, `Offense Type` == var)) +
    geom_bar(aes(as.integer(Hour), n_Hour), fill = clr, color = "black",
             stat = "identity") +
    labs(title = paste0(var)) +
    scale_y_continuous(name = NULL,
                       labels = NULL) +
    scale_x_continuous(name = NULL,
                       breaks = c(0, 6, 12, 18),
                       labels = c("12:00 am", "6:00 am",
                                  "12:00 pm", "6:00 pm")) +
    coord_polar() +
    theme_minimal() +
    theme(text = element_text(family = "Open Sans", size = 11),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          plot.margin = unit(c(0, 0, 0, 0), "cm"))

  pg <- ggplotGrob(p)

  # Turn 'panel' clipping to 'off'
  pg$layout$clip[[which(pg$layout$name == "panel")]] <- "off"

  return(pg)
}


# Roses
rose_garden <- Map(rose_diagrams,
                   c("Theft", "Burglary", "Auto Theft", "Aggravated Assault", "Robbery", "Rape", "Murder"),
                   plt_colors)
grid.arrange(roses <- arrangeGrob(grobs = rose_garden, ncol = 2, padding = unit(-1, "lines"), top = ""))

# - Canvas
blanky <- ggplot() +
  geom_blank() +
  labs(title = paste0("When do crimes most often occur during",
                      "\nthe day?"),
       subtitle = paste0("Violent crimes occur most often during the evening",
                         "\nand night hours when people are socially active at",
                         "\nbars and restaurants; while burglary occurs during",
                         "\nthe day when people are away from home.",
                         "\n"),
       caption = "Source: City of Houston; Houston Police Department") +
  theme_minimal() +
  theme(text = element_text(family = "Open Sans", size = 16))

# Grobbify and add roses to bg's 'panel'
bg <- ggplotGrob(blanky)
widths <- bg
bg$grobs[[which(bg$layout$name == "panel")]] <- addGrob(bg$grobs[[which(bg$layout$name == "panel")]], roses)

# Assign and convert
hou_plots$roses <- bg
hou_plots$lines <- ggplotGrob(hou_plots$lines)

# Get and set heights
p_heights <- unit.pmax(hou_plots$lines$heights[2:5], hou_plots$roses$heights[2:5])
hou_plots$lines$heights[2:5] <- as.list(p_heights)
hou_plots$roses$heights[2:5] <- as.list(p_heights)

# - Print and save
png("Houston_Crime_Plot.png", width = 900, height = 751)
grid.arrange(hou_plots$lines, hou_plots$roses, ncol = 2)
dev.off()
	# A final version of the image created below
	# can be found at: https://twitter.com/lksmth/status/892849692576329730

	# -- Load dependencies

	library(tidyverse)
	library(stringi)
	library(readxl)
	library(rvest)

	# -- Prep for download

	# url to get data from
	url <- "http://www.houstontx.gov/police/cs/crime-stats-archives.htm"

	# Get links from url
	links <- url %>%
	read_html() %>%
	html_nodes(xpath = "//div/a") %>%
	html_attr("href") %>%
	.[grepl("\\.xls$", .)]

	# Remove beginning section of links and replace with base url
	links <- stri_replace(links, "", regex = "^/police/cs/")
	base <- "http://www.houstontx.gov/police/cs/"
	new_links <- paste0(base, links)

	# Set destinations for Excel files
	dest <- "hou_cr_data" # you may want to change this
	dir.create(dest)
	dest_paths <- paste0(dest, stri_replace(links, "", regex = "^xls"))

	# -- Download the data

	# !This will take a while
	# Each file is about 2MB
	# There are 91 files
	hou_cr_dwnld <- new_links %>%
	map2(dest_paths, function(x, y) {
	download.file(x, y, mode = "wb")
	})

	# -- Import into R

	# Declare ranges on certain files to avoid fread() errors
	hou_cr <- list.files(dest, full.names = TRUE) %>%
	map(function(x) {
	if (grepl("jul09\\.xls$", x)) {
	read_xls(x, range = "A1:J12922")
	} else if (grepl("nov09\\.xls$", x)) {
	read_xls(x, range = "A1:J11903")
	} else if (grepl("sep09\\.xls$", x)) {
	read_xls(x, range = "A1:J12422")
	} else if (grepl("sep10\\.xls$", x)) {
	read_xls(x, range = "A1:J10921")
	} else {
	read_xls(x)
	}
	})

	# -- Clean up tibbles
	# Tibbles have varying column names and number of columns

	# Reorganize tibbles by number of columns
	reorg <- function(dat) {
	map(unique(lens <- lengths(dat)), function(i) {
	dat[which(lens == i)]
	})
	}

	# Reorganize (returns nested list of tibbles)
	hou_cr_reorg <- reorg(hou_cr)

	# Find unique column name sets
	nms <- hou_cr_reorg %>%
	at_depth(2, colnames) %>%
	at_depth(2, as.matrix) %>%
	map(function(x) t(reduce(x, cbind))) %>%
	map(as.data.frame) %>%
	map(as_tibble) %>%
	map(distinct)

	# Select "proper" column names
	nms_10 <- nms[[1]][1, ] %>% as_vector() %>% as.character()
	nms_11 <- nms[[2]] %>% as_vector() %>% as.character()
	nms_9 <- nms[[3]][1,] %>% as_vector() %>% as.character()
	nms_9[[9]] <- nms_10[[10]] # need to rename `# Offenses` to `# Of Offenses`

	# Set "proper" column names
	hou_cr_reorg[1] <- hou_cr_reorg[[1]] %>%
	map(function(x) setNames(x, nms_10)) %>%
	list()
	hou_cr_reorg[3] <- hou_cr_reorg[[3]] %>%
	map(function(x) setNames(x, nms_9)) %>%
	list()

	# Remove "anomalous" column X__1
	hou_cr_reorg[2] <- hou_cr_reorg[[2]] %>%
	map(function(x) select(x, -X__1)) %>%
	list()

	# Add column "Premise" to tibbles that have only 9 columns
	hou_cr_reorg[3] <- hou_cr_reorg[[3]] %>%
	map(function(x) add_column(x, Premise = NA_character_)) %>%
	list()

	# Unionize all tibbles in hou_cr
	hou_cr <- hou_cr_reorg %>%
	map(function(x) reduce(x, function(...) union_all(...))) %>%
	reduce(function(...) union_all(...))

	# -- Plot

	library(zoo)
	library(forcats)
	library(extrafont)
	library(grid)
	library(gridExtra)

	loadfonts("win")

	# Filtered data set
	hou_cr_filtered <- hou_cr %>%
	filter(as.yearmon(Date) >= 2009.5 & as.yearmon(Date) < 2017) %>%
	filter(!is.na(as.yearmon(Date))) %>%
	filter(`Offense Type` != "1") %>%
	mutate(Hour = if_else(Hour == "24", "00", Hour)) %>%
	mutate(Hour = stri_replace(Hour, "", regex = "'"))

	# Summary data
	hou_cr_summ <- hou_cr_filtered %>%
	group_by(`Offense Type`,
	Date = as.yearmon(Date)) %>%
	summarize(`Total number of offenses` = sum(`# Of Offenses`, na.rm = TRUE)) %>%
	ungroup() %>%
	group_by(`Offense Type`) %>%
	mutate(`Number of offenses` = rollmean(`Total number of offenses`, 6, fill = NA, align = "right")) %>%
	ungroup()

	my_colors <- c("#984807", "#e15c39", "#008348", "#4c7093", "#d6a300", "#ce1141", "#8E3694")

	# - Lines
	hou_plots$lines <- hou_cr_summ %>%
	ggplot(aes(Date)) +
	geom_line(aes(y = `Number of offenses`, group = `Offense Type`), size = 0.75) +
	geom_line(aes(y = `Total number of offenses`, color = `Offense Type`), size = 0.82) +
	facet_wrap(~forcats::fct_rev(reorder(`Offense Type`, `Total number of offenses`, sum)), scales = "free_y", ncol = 2) +
	scale_x_yearmon(breaks = c(2010, 2016),
	minor_breaks = NULL) +
	scale_color_manual(values = c("#4c7093", my_green, my_orange, my_purple, my_red, "#d6a300", "#984807")) +
	guides(color = FALSE) +
	labs(title = paste0("Crime in Houston from July 2009 to",
	"\nDecember 2016"),
	subtitle = paste0("Monthly totals and six-month lagging averages",
	"\n(black line) show a drop in burglary but a rise",
	"\nin rape and murder.",
	"\n",
	"\n"),
	caption = "\n ") +
	theme_minimal() +
	theme(text = element_text(family = "Open Sans", size = 16),
	panel.grid.minor = element_blank())

	# - Roses
	# Rose function
	rose_diagrams <- function(var, clr) {
	p <- ggplot(filter(hou_cr_Hour_count, `Offense Type` == var)) +
	geom_bar(aes(as.integer(Hour), n_Hour), fill = clr, color = "black",
	stat = "identity") +
	labs(title = paste0(var)) +
	scale_y_continuous(name = NULL,
	labels = NULL) +
	scale_x_continuous(name = NULL,
	breaks = c(0, 6, 12, 18),
	labels = c("12:00 am", "6:00 am",
	"12:00 pm", "6:00 pm")) +
	coord_polar() +
	theme_minimal() +
	theme(text = element_text(family = "Open Sans", size = 11),
	panel.grid.major.y = element_blank(),
	panel.grid.minor.y = element_blank(),
	plot.margin = unit(c(0, 0, 0, 0), "cm"))

	pg <- ggplotGrob(p)

	# Turn 'panel' clipping to 'off'
	pg$layout$clip[[which(pg$layout$name == "panel")]] <- "off"

	return(pg)
	}


	# Roses
	rose_garden <- Map(rose_diagrams,
	c("Theft", "Burglary", "Auto Theft", "Aggravated Assault", "Robbery", "Rape", "Murder"),
	plt_colors)
	grid.arrange(roses <- arrangeGrob(grobs = rose_garden, ncol = 2, padding = unit(-1, "lines"), top = ""))

	# - Canvas
	blanky <- ggplot() +
	geom_blank() +
	labs(title = paste0("When do crimes most often occur during",
	"\nthe day?"),
	subtitle = paste0("Violent crimes occur most often during the evening",
	"\nand night hours when people are socially active at",
	"\nbars and restaurants; while burglary occurs during",
	"\nthe day when people are away from home.",
	"\n"),
	caption = "Source: City of Houston; Houston Police Department") +
	theme_minimal() +
	theme(text = element_text(family = "Open Sans", size = 16))

	# Grobbify and add roses to bg's 'panel'
	bg <- ggplotGrob(blanky)
	widths <- bg
	bg$grobs[[which(bg$layout$name == "panel")]] <- addGrob(bg$grobs[[which(bg$layout$name == "panel")]], roses)

	# Assign and convert
	hou_plots$roses <- bg
	hou_plots$lines <- ggplotGrob(hou_plots$lines)

	# Get and set heights
	p_heights <- unit.pmax(hou_plots$lines$heights[2:5], hou_plots$roses$heights[2:5])
	hou_plots$lines$heights[2:5] <- as.list(p_heights)
	hou_plots$roses$heights[2:5] <- as.list(p_heights)

	# - Print and save
	png("Houston_Crime_Plot.png", width = 900, height = 751)
	grid.arrange(hou_plots$lines, hou_plots$roses, ncol = 2)
	dev.off()