California data breach analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# California data breach analysis | |
# | |
# Author: Matthias Vallentin <vallentin@icir.org> | |
# Copyright (c) 2016 | |
# | |
# To reproduce, please contact me. | |
library(dplyr) | |
library(ggplot2) | |
library(lubridate) | |
library(rvest) | |
library(stringr) | |
library(tidyr) | |
# Helper function to extract a column from a HTML table. | |
column <- function(data, xpath) { | |
data %>% html_node(xpath = xpath) %>% html_text(trim = TRUE) | |
} | |
breach_html <- read_html("https://oag.ca.gov/ecrime/databreach/list") | |
breach_table <- breach_html %>% html_nodes("tbody tr") | |
# Institution names. | |
institution = breach_table %>% column("td[1]/a") | |
# Comma-separated list of breach dates. | |
breached <- breach_table %>% | |
column("td[2]") %>% | |
str_split(", ") | |
# Date when breach was reported. | |
reported = breach_table %>% column("td[3]") %>% mdy | |
# Repetition factor to account for multiple breaches per report. | |
inflate <- sapply(breached, length) | |
breaches <- data.frame( | |
institution = rep(institution, inflate), | |
breached = breached %>% unlist %>% mdy(quiet = TRUE), | |
reported = rep(reported, inflate) | |
) %>% tbl_df | |
# How many unknown breach dates? | |
unknown <- breaches %>% | |
transmute(na = is.na(breached)) %>% | |
filter(na == TRUE) %>% | |
summarize(n()) | |
message("Total reports: ", length(reported)) | |
message("Total breaches: ", nrow(breaches)) | |
message("Unkown breach dates: ", unknown, " (", | |
round(unknown / nrow(breaches) * 100), "%)") | |
# Cumulative breaches, by date reported. | |
plot_cum_rep <- breaches %>% | |
arrange(reported) %>% | |
mutate(cum = seq_along(reported)) %>% | |
ggplot(aes(x = reported, y = cum)) + | |
geom_line() + | |
xlab("Year") + ylab("Breaches") | |
breaches_by_year <- breaches %>% | |
transmute(Year = year(breached)) %>% | |
group_by(Year) %>% | |
summarize(Breaches = n()) | |
reports_by_year <- breaches %>% | |
transmute(Year = year(reported)) %>% | |
group_by(Year) %>% | |
summarize(Reports = n()) | |
# Number of breaches and reports by year (without unknown breaches above). | |
plot_by_year <- left_join(breaches_by_year, reports_by_year) %>% | |
gather(key, value, Breaches, Reports) %>% | |
ggplot(aes(x = Year, y = value, fill = key)) + | |
geom_bar(stat = "identity", position = "dodge") + | |
scale_fill_discrete(name = "") + | |
scale_x_continuous(breaks = 2007:2016, labels = str_pad(7:16, 2, pad = 0)) + | |
xlab("Year") + ylab("Count") | |
# Weekday of breaches/reports. | |
plot_wday <- breaches %>% | |
transmute(Breaches = wday(breached), Reports = wday(reported)) %>% | |
gather(key, value, Breaches, Reports) %>% | |
ggplot(aes(x = value, fill = key)) + | |
geom_bar(position = "dodge") + | |
scale_fill_discrete(name = "") + | |
scale_x_continuous(breaks = 1:7, | |
labels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")) + | |
xlab("Weekday") + ylab("Count") | |
# Top-10 most-breached institutions. | |
plot_top10_breached <- breaches %>% | |
group_by(institution) %>% | |
summarize(n = n()) %>% | |
arrange(desc(n)) %>% | |
head(10) %>% | |
ggplot(aes(x = reorder(institution, -n), y = n)) + | |
geom_bar(stat = "identity") + | |
xlab("Institution") + ylab("Breaches") + | |
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) | |
# Top-10 most-breached institutions reports). | |
plot_top10_reported <- breaches %>% | |
# Exclude AXP/Discover because they also have to report merchant data loss | |
filter(!grepl("AXP|Discover", institution)) %>% | |
group_by(institution) %>% | |
summarize(n = n_distinct(reported)) %>% | |
arrange(desc(n)) %>% | |
head(10) %>% | |
ggplot(aes(x = reorder(institution, -n), y = n)) + | |
geom_bar(stat = "identity") + | |
xlab("Institution") + ylab("Reports") + | |
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) | |
responsiveness <- breaches %>% | |
mutate(days = (breached %--% reported) / ddays(1), year = year(reported)) | |
# Distribution of time from breach until report. | |
plot_resp_ecdf <- responsiveness %>% | |
ggplot(aes(x = days)) + | |
stat_ecdf() + | |
scale_x_log10(minor_breaks = NULL, | |
breaks = c(1, 7, 14, 30, 60, 90, 150, 365, 2 * 365, 3 * 365, 5 * 365), | |
labels = c("1d", "1w", "2w", "1m", "2m", "3m", "5m", "1y", "2y", "3y", "5y")) + | |
xlab("Responsiveness") + | |
ylab("ECDF") | |
# Same as above, but one ECDF per year. | |
plot_resp_ecdf_by_year <- responsiveness %>% | |
mutate(Year = factor(year)) %>% | |
ggplot(aes(x = days, group = Year, color = Year)) + | |
stat_ecdf() + | |
scale_x_log10(minor_breaks = NULL, | |
breaks = c(1, 7, 14, 30, 60, 90, 150, 365, 2 * 365, 3 * 365, 5 * 365), | |
labels = c("1d", "1w", "2w", "1m", "2m", "3m", "5m", "1y", "2y", "3y", "5y")) + | |
xlab("Responsiveness") + | |
ylab("ECDF") | |
# Did the industry get quicker at reporting? | |
plot_resp_bar_by_year <- responsiveness %>% | |
group_by(year) %>% | |
summarize(median = median(days, na.rm = TRUE)) %>% | |
ggplot(aes(x = year, y = median)) + | |
geom_bar(stat = "identity") + | |
xlab("Median responsiveness") + | |
ylab("Days") | |
# Top-10 best responsiveness. | |
plot_resp_best <- responsiveness %>% | |
group_by(institution) %>% | |
summarize(median = median(days, na.rm = TRUE)) %>% | |
arrange(median) %>% | |
head(10) %>% | |
ggplot(aes(x = reorder(institution, median), y = median)) + | |
geom_bar(stat = "identity") + | |
xlab("Institution") + ylab("Median Response Time (Days)") + | |
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) | |
# Top-10 worst responsiveness. | |
plot_resp_worst <- responsiveness %>% | |
mutate(years = days / 365) %>% | |
group_by(institution) %>% | |
summarize(median = median(years, na.rm = TRUE)) %>% | |
arrange(desc(median)) %>% | |
head(10) %>% | |
ggplot(aes(x = reorder(institution, -median), y = median)) + | |
geom_bar(stat = "identity") + | |
xlab("Institution") + ylab("Median Response Time (Years)") + | |
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) | |
ggsave("breaches-cumulative-reported.png", plot_cum_rep) | |
ggsave("breaches-by-year.png", plot_by_year) | |
ggsave("breaches-by-weekday.png", plot_wday) | |
ggsave("breaches-top10-most-breached.png", plot_top10_breached) | |
ggsave("breaches-top10-most-reported.png", plot_top10_reported) | |
ggsave("breaches-responsiveness-ecdf.png", plot_resp_ecdf) | |
ggsave("breaches-responsiveness-ecdf-by-year.png", plot_resp_ecdf_by_year) | |
ggsave("breaches-responsiveness-by-year-median.png", plot_resp_bar_by_year) | |
ggsave("breaches-responsiveness-best.png", plot_resp_best) | |
ggsave("breaches-responsiveness-worst.png", plot_resp_worst) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment