Last active
August 17, 2017 18:08
-
-
Save seasmith/3dfbd76cf2f12f185ef86ae8b4528874 to your computer and use it in GitHub Desktop.
Download and union all Houston crime report Excel files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A final version of the image created below | |
# can be found at: https://twitter.com/lksmth/status/892849692576329730 | |
# -- Load dependencies | |
library(tidyverse) | |
library(stringi) | |
library(readxl) | |
library(rvest) | |
# -- Prep for download | |
# url to get data from | |
url <- "http://www.houstontx.gov/police/cs/crime-stats-archives.htm" | |
# Get links from url | |
links <- url %>% | |
read_html() %>% | |
html_nodes(xpath = "//div/a") %>% | |
html_attr("href") %>% | |
.[grepl("\\.xls$", .)] | |
# Remove beginning section of links and replace with base url | |
links <- stri_replace(links, "", regex = "^/police/cs/") | |
base <- "http://www.houstontx.gov/police/cs/" | |
new_links <- paste0(base, links) | |
# Set destinations for Excel files | |
dest <- "hou_cr_data" # you may want to change this | |
dir.create(dest) | |
dest_paths <- paste0(dest, stri_replace(links, "", regex = "^xls")) | |
# -- Download the data | |
# !This will take a while | |
# Each file is about 2MB | |
# There are 91 files | |
hou_cr_dwnld <- new_links %>% | |
map2(dest_paths, function(x, y) { | |
download.file(x, y, mode = "wb") | |
}) | |
# -- Import into R | |
# Declare ranges on certain files to avoid fread() errors | |
hou_cr <- list.files(dest, full.names = TRUE) %>% | |
map(function(x) { | |
if (grepl("jul09\\.xls$", x)) { | |
read_xls(x, range = "A1:J12922") | |
} else if (grepl("nov09\\.xls$", x)) { | |
read_xls(x, range = "A1:J11903") | |
} else if (grepl("sep09\\.xls$", x)) { | |
read_xls(x, range = "A1:J12422") | |
} else if (grepl("sep10\\.xls$", x)) { | |
read_xls(x, range = "A1:J10921") | |
} else { | |
read_xls(x) | |
} | |
}) | |
# -- Clean up tibbles | |
# Tibbles have varying column names and number of columns | |
# Reorganize tibbles by number of columns | |
reorg <- function(dat) { | |
map(unique(lens <- lengths(dat)), function(i) { | |
dat[which(lens == i)] | |
}) | |
} | |
# Reorganize (returns nested list of tibbles) | |
hou_cr_reorg <- reorg(hou_cr) | |
# Find unique column name sets | |
nms <- hou_cr_reorg %>% | |
at_depth(2, colnames) %>% | |
at_depth(2, as.matrix) %>% | |
map(function(x) t(reduce(x, cbind))) %>% | |
map(as.data.frame) %>% | |
map(as_tibble) %>% | |
map(distinct) | |
# Select "proper" column names | |
nms_10 <- nms[[1]][1, ] %>% as_vector() %>% as.character() | |
nms_11 <- nms[[2]] %>% as_vector() %>% as.character() | |
nms_9 <- nms[[3]][1,] %>% as_vector() %>% as.character() | |
nms_9[[9]] <- nms_10[[10]] # need to rename `# Offenses` to `# Of Offenses` | |
# Set "proper" column names | |
hou_cr_reorg[1] <- hou_cr_reorg[[1]] %>% | |
map(function(x) setNames(x, nms_10)) %>% | |
list() | |
hou_cr_reorg[3] <- hou_cr_reorg[[3]] %>% | |
map(function(x) setNames(x, nms_9)) %>% | |
list() | |
# Remove "anomalous" column X__1 | |
hou_cr_reorg[2] <- hou_cr_reorg[[2]] %>% | |
map(function(x) select(x, -X__1)) %>% | |
list() | |
# Add column "Premise" to tibbles that have only 9 columns | |
hou_cr_reorg[3] <- hou_cr_reorg[[3]] %>% | |
map(function(x) add_column(x, Premise = NA_character_)) %>% | |
list() | |
# Unionize all tibbles in hou_cr | |
hou_cr <- hou_cr_reorg %>% | |
map(function(x) reduce(x, function(...) union_all(...))) %>% | |
reduce(function(...) union_all(...)) | |
# -- Plot | |
library(zoo) | |
library(forcats) | |
library(extrafont) | |
library(grid) | |
library(gridExtra) | |
loadfonts("win") | |
# Filtered data set | |
hou_cr_filtered <- hou_cr %>% | |
filter(as.yearmon(Date) >= 2009.5 & as.yearmon(Date) < 2017) %>% | |
filter(!is.na(as.yearmon(Date))) %>% | |
filter(`Offense Type` != "1") %>% | |
mutate(Hour = if_else(Hour == "24", "00", Hour)) %>% | |
mutate(Hour = stri_replace(Hour, "", regex = "'")) | |
# Summary data | |
hou_cr_summ <- hou_cr_filtered %>% | |
group_by(`Offense Type`, | |
Date = as.yearmon(Date)) %>% | |
summarize(`Total number of offenses` = sum(`# Of Offenses`, na.rm = TRUE)) %>% | |
ungroup() %>% | |
group_by(`Offense Type`) %>% | |
mutate(`Number of offenses` = rollmean(`Total number of offenses`, 6, fill = NA, align = "right")) %>% | |
ungroup() | |
my_colors <- c("#984807", "#e15c39", "#008348", "#4c7093", "#d6a300", "#ce1141", "#8E3694") | |
# - Lines | |
hou_plots$lines <- hou_cr_summ %>% | |
ggplot(aes(Date)) + | |
geom_line(aes(y = `Number of offenses`, group = `Offense Type`), size = 0.75) + | |
geom_line(aes(y = `Total number of offenses`, color = `Offense Type`), size = 0.82) + | |
facet_wrap(~forcats::fct_rev(reorder(`Offense Type`, `Total number of offenses`, sum)), scales = "free_y", ncol = 2) + | |
scale_x_yearmon(breaks = c(2010, 2016), | |
minor_breaks = NULL) + | |
scale_color_manual(values = c("#4c7093", my_green, my_orange, my_purple, my_red, "#d6a300", "#984807")) + | |
guides(color = FALSE) + | |
labs(title = paste0("Crime in Houston from July 2009 to", | |
"\nDecember 2016"), | |
subtitle = paste0("Monthly totals and six-month lagging averages", | |
"\n(black line) show a drop in burglary but a rise", | |
"\nin rape and murder.", | |
"\n", | |
"\n"), | |
caption = "\n ") + | |
theme_minimal() + | |
theme(text = element_text(family = "Open Sans", size = 16), | |
panel.grid.minor = element_blank()) | |
# - Roses | |
# Rose function | |
rose_diagrams <- function(var, clr) { | |
p <- ggplot(filter(hou_cr_Hour_count, `Offense Type` == var)) + | |
geom_bar(aes(as.integer(Hour), n_Hour), fill = clr, color = "black", | |
stat = "identity") + | |
labs(title = paste0(var)) + | |
scale_y_continuous(name = NULL, | |
labels = NULL) + | |
scale_x_continuous(name = NULL, | |
breaks = c(0, 6, 12, 18), | |
labels = c("12:00 am", "6:00 am", | |
"12:00 pm", "6:00 pm")) + | |
coord_polar() + | |
theme_minimal() + | |
theme(text = element_text(family = "Open Sans", size = 11), | |
panel.grid.major.y = element_blank(), | |
panel.grid.minor.y = element_blank(), | |
plot.margin = unit(c(0, 0, 0, 0), "cm")) | |
pg <- ggplotGrob(p) | |
# Turn 'panel' clipping to 'off' | |
pg$layout$clip[[which(pg$layout$name == "panel")]] <- "off" | |
return(pg) | |
} | |
# Roses | |
rose_garden <- Map(rose_diagrams, | |
c("Theft", "Burglary", "Auto Theft", "Aggravated Assault", "Robbery", "Rape", "Murder"), | |
plt_colors) | |
grid.arrange(roses <- arrangeGrob(grobs = rose_garden, ncol = 2, padding = unit(-1, "lines"), top = "")) | |
# - Canvas | |
blanky <- ggplot() + | |
geom_blank() + | |
labs(title = paste0("When do crimes most often occur during", | |
"\nthe day?"), | |
subtitle = paste0("Violent crimes occur most often during the evening", | |
"\nand night hours when people are socially active at", | |
"\nbars and restaurants; while burglary occurs during", | |
"\nthe day when people are away from home.", | |
"\n"), | |
caption = "Source: City of Houston; Houston Police Department") + | |
theme_minimal() + | |
theme(text = element_text(family = "Open Sans", size = 16)) | |
# Grobbify and add roses to bg's 'panel' | |
bg <- ggplotGrob(blanky) | |
widths <- bg | |
bg$grobs[[which(bg$layout$name == "panel")]] <- addGrob(bg$grobs[[which(bg$layout$name == "panel")]], roses) | |
# Assign and convert | |
hou_plots$roses <- bg | |
hou_plots$lines <- ggplotGrob(hou_plots$lines) | |
# Get and set heights | |
p_heights <- unit.pmax(hou_plots$lines$heights[2:5], hou_plots$roses$heights[2:5]) | |
hou_plots$lines$heights[2:5] <- as.list(p_heights) | |
hou_plots$roses$heights[2:5] <- as.list(p_heights) | |
# - Print and save | |
png("Houston_Crime_Plot.png", width = 900, height = 751) | |
grid.arrange(hou_plots$lines, hou_plots$roses, ncol = 2) | |
dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment