Skip to content

Instantly share code, notes, and snippets.

@seasmith
Last active August 17, 2017 18:08
Show Gist options
  • Save seasmith/3dfbd76cf2f12f185ef86ae8b4528874 to your computer and use it in GitHub Desktop.
Save seasmith/3dfbd76cf2f12f185ef86ae8b4528874 to your computer and use it in GitHub Desktop.
Download and union all Houston crime report Excel files
# A final version of the image created below
# can be found at: https://twitter.com/lksmth/status/892849692576329730
# -- Load dependencies
library(tidyverse)
library(stringi)
library(readxl)
library(rvest)
# -- Prep for download
# url to get data from
url <- "http://www.houstontx.gov/police/cs/crime-stats-archives.htm"
# Get links from url
links <- url %>%
read_html() %>%
html_nodes(xpath = "//div/a") %>%
html_attr("href") %>%
.[grepl("\\.xls$", .)]
# Remove beginning section of links and replace with base url
links <- stri_replace(links, "", regex = "^/police/cs/")
base <- "http://www.houstontx.gov/police/cs/"
new_links <- paste0(base, links)
# Set destinations for Excel files
dest <- "hou_cr_data" # you may want to change this
dir.create(dest)
dest_paths <- paste0(dest, stri_replace(links, "", regex = "^xls"))
# -- Download the data
# !This will take a while
# Each file is about 2MB
# There are 91 files
hou_cr_dwnld <- new_links %>%
map2(dest_paths, function(x, y) {
download.file(x, y, mode = "wb")
})
# -- Import into R
# Declare ranges on certain files to avoid fread() errors
hou_cr <- list.files(dest, full.names = TRUE) %>%
map(function(x) {
if (grepl("jul09\\.xls$", x)) {
read_xls(x, range = "A1:J12922")
} else if (grepl("nov09\\.xls$", x)) {
read_xls(x, range = "A1:J11903")
} else if (grepl("sep09\\.xls$", x)) {
read_xls(x, range = "A1:J12422")
} else if (grepl("sep10\\.xls$", x)) {
read_xls(x, range = "A1:J10921")
} else {
read_xls(x)
}
})
# -- Clean up tibbles
# Tibbles have varying column names and number of columns
# Reorganize tibbles by number of columns
reorg <- function(dat) {
map(unique(lens <- lengths(dat)), function(i) {
dat[which(lens == i)]
})
}
# Reorganize (returns nested list of tibbles)
hou_cr_reorg <- reorg(hou_cr)
# Find unique column name sets
nms <- hou_cr_reorg %>%
at_depth(2, colnames) %>%
at_depth(2, as.matrix) %>%
map(function(x) t(reduce(x, cbind))) %>%
map(as.data.frame) %>%
map(as_tibble) %>%
map(distinct)
# Select "proper" column names
nms_10 <- nms[[1]][1, ] %>% as_vector() %>% as.character()
nms_11 <- nms[[2]] %>% as_vector() %>% as.character()
nms_9 <- nms[[3]][1,] %>% as_vector() %>% as.character()
nms_9[[9]] <- nms_10[[10]] # need to rename `# Offenses` to `# Of Offenses`
# Set "proper" column names
hou_cr_reorg[1] <- hou_cr_reorg[[1]] %>%
map(function(x) setNames(x, nms_10)) %>%
list()
hou_cr_reorg[3] <- hou_cr_reorg[[3]] %>%
map(function(x) setNames(x, nms_9)) %>%
list()
# Remove "anomalous" column X__1
hou_cr_reorg[2] <- hou_cr_reorg[[2]] %>%
map(function(x) select(x, -X__1)) %>%
list()
# Add column "Premise" to tibbles that have only 9 columns
hou_cr_reorg[3] <- hou_cr_reorg[[3]] %>%
map(function(x) add_column(x, Premise = NA_character_)) %>%
list()
# Unionize all tibbles in hou_cr
hou_cr <- hou_cr_reorg %>%
map(function(x) reduce(x, function(...) union_all(...))) %>%
reduce(function(...) union_all(...))
# -- Plot
library(zoo)
library(forcats)
library(extrafont)
library(grid)
library(gridExtra)
loadfonts("win")
# Filtered data set
hou_cr_filtered <- hou_cr %>%
filter(as.yearmon(Date) >= 2009.5 & as.yearmon(Date) < 2017) %>%
filter(!is.na(as.yearmon(Date))) %>%
filter(`Offense Type` != "1") %>%
mutate(Hour = if_else(Hour == "24", "00", Hour)) %>%
mutate(Hour = stri_replace(Hour, "", regex = "'"))
# Summary data
hou_cr_summ <- hou_cr_filtered %>%
group_by(`Offense Type`,
Date = as.yearmon(Date)) %>%
summarize(`Total number of offenses` = sum(`# Of Offenses`, na.rm = TRUE)) %>%
ungroup() %>%
group_by(`Offense Type`) %>%
mutate(`Number of offenses` = rollmean(`Total number of offenses`, 6, fill = NA, align = "right")) %>%
ungroup()
my_colors <- c("#984807", "#e15c39", "#008348", "#4c7093", "#d6a300", "#ce1141", "#8E3694")
# - Lines
hou_plots$lines <- hou_cr_summ %>%
ggplot(aes(Date)) +
geom_line(aes(y = `Number of offenses`, group = `Offense Type`), size = 0.75) +
geom_line(aes(y = `Total number of offenses`, color = `Offense Type`), size = 0.82) +
facet_wrap(~forcats::fct_rev(reorder(`Offense Type`, `Total number of offenses`, sum)), scales = "free_y", ncol = 2) +
scale_x_yearmon(breaks = c(2010, 2016),
minor_breaks = NULL) +
scale_color_manual(values = c("#4c7093", my_green, my_orange, my_purple, my_red, "#d6a300", "#984807")) +
guides(color = FALSE) +
labs(title = paste0("Crime in Houston from July 2009 to",
"\nDecember 2016"),
subtitle = paste0("Monthly totals and six-month lagging averages",
"\n(black line) show a drop in burglary but a rise",
"\nin rape and murder.",
"\n",
"\n"),
caption = "\n ") +
theme_minimal() +
theme(text = element_text(family = "Open Sans", size = 16),
panel.grid.minor = element_blank())
# - Roses
# Rose function
rose_diagrams <- function(var, clr) {
p <- ggplot(filter(hou_cr_Hour_count, `Offense Type` == var)) +
geom_bar(aes(as.integer(Hour), n_Hour), fill = clr, color = "black",
stat = "identity") +
labs(title = paste0(var)) +
scale_y_continuous(name = NULL,
labels = NULL) +
scale_x_continuous(name = NULL,
breaks = c(0, 6, 12, 18),
labels = c("12:00 am", "6:00 am",
"12:00 pm", "6:00 pm")) +
coord_polar() +
theme_minimal() +
theme(text = element_text(family = "Open Sans", size = 11),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
plot.margin = unit(c(0, 0, 0, 0), "cm"))
pg <- ggplotGrob(p)
# Turn 'panel' clipping to 'off'
pg$layout$clip[[which(pg$layout$name == "panel")]] <- "off"
return(pg)
}
# Roses
rose_garden <- Map(rose_diagrams,
c("Theft", "Burglary", "Auto Theft", "Aggravated Assault", "Robbery", "Rape", "Murder"),
plt_colors)
grid.arrange(roses <- arrangeGrob(grobs = rose_garden, ncol = 2, padding = unit(-1, "lines"), top = ""))
# - Canvas
blanky <- ggplot() +
geom_blank() +
labs(title = paste0("When do crimes most often occur during",
"\nthe day?"),
subtitle = paste0("Violent crimes occur most often during the evening",
"\nand night hours when people are socially active at",
"\nbars and restaurants; while burglary occurs during",
"\nthe day when people are away from home.",
"\n"),
caption = "Source: City of Houston; Houston Police Department") +
theme_minimal() +
theme(text = element_text(family = "Open Sans", size = 16))
# Grobbify and add roses to bg's 'panel'
bg <- ggplotGrob(blanky)
widths <- bg
bg$grobs[[which(bg$layout$name == "panel")]] <- addGrob(bg$grobs[[which(bg$layout$name == "panel")]], roses)
# Assign and convert
hou_plots$roses <- bg
hou_plots$lines <- ggplotGrob(hou_plots$lines)
# Get and set heights
p_heights <- unit.pmax(hou_plots$lines$heights[2:5], hou_plots$roses$heights[2:5])
hou_plots$lines$heights[2:5] <- as.list(p_heights)
hou_plots$roses$heights[2:5] <- as.list(p_heights)
# - Print and save
png("Houston_Crime_Plot.png", width = 900, height = 751)
grid.arrange(hou_plots$lines, hou_plots$roses, ncol = 2)
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment