Skip to content

Instantly share code, notes, and snippets.

@mavam
Last active April 26, 2016 19:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mavam/f1467d48d5cf8a460b57 to your computer and use it in GitHub Desktop.
Save mavam/f1467d48d5cf8a460b57 to your computer and use it in GitHub Desktop.
California data breach analysis
# California data breach analysis
#
# Author: Matthias Vallentin <vallentin@icir.org>
# Copyright (c) 2016
#
# To reproduce, please contact me.
library(dplyr)
library(ggplot2)
library(lubridate)
library(rvest)
library(stringr)
library(tidyr)
# Helper function to extract a column from a HTML table.
column <- function(data, xpath) {
data %>% html_node(xpath = xpath) %>% html_text(trim = TRUE)
}
breach_html <- read_html("https://oag.ca.gov/ecrime/databreach/list")
breach_table <- breach_html %>% html_nodes("tbody tr")
# Institution names.
institution = breach_table %>% column("td[1]/a")
# Comma-separated list of breach dates.
breached <- breach_table %>%
column("td[2]") %>%
str_split(", ")
# Date when breach was reported.
reported = breach_table %>% column("td[3]") %>% mdy
# Repetition factor to account for multiple breaches per report.
inflate <- sapply(breached, length)
breaches <- data.frame(
institution = rep(institution, inflate),
breached = breached %>% unlist %>% mdy(quiet = TRUE),
reported = rep(reported, inflate)
) %>% tbl_df
# How many unknown breach dates?
unknown <- breaches %>%
transmute(na = is.na(breached)) %>%
filter(na == TRUE) %>%
summarize(n())
message("Total reports: ", length(reported))
message("Total breaches: ", nrow(breaches))
message("Unkown breach dates: ", unknown, " (",
round(unknown / nrow(breaches) * 100), "%)")
# Cumulative breaches, by date reported.
plot_cum_rep <- breaches %>%
arrange(reported) %>%
mutate(cum = seq_along(reported)) %>%
ggplot(aes(x = reported, y = cum)) +
geom_line() +
xlab("Year") + ylab("Breaches")
breaches_by_year <- breaches %>%
transmute(Year = year(breached)) %>%
group_by(Year) %>%
summarize(Breaches = n())
reports_by_year <- breaches %>%
transmute(Year = year(reported)) %>%
group_by(Year) %>%
summarize(Reports = n())
# Number of breaches and reports by year (without unknown breaches above).
plot_by_year <- left_join(breaches_by_year, reports_by_year) %>%
gather(key, value, Breaches, Reports) %>%
ggplot(aes(x = Year, y = value, fill = key)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_discrete(name = "") +
scale_x_continuous(breaks = 2007:2016, labels = str_pad(7:16, 2, pad = 0)) +
xlab("Year") + ylab("Count")
# Weekday of breaches/reports.
plot_wday <- breaches %>%
transmute(Breaches = wday(breached), Reports = wday(reported)) %>%
gather(key, value, Breaches, Reports) %>%
ggplot(aes(x = value, fill = key)) +
geom_bar(position = "dodge") +
scale_fill_discrete(name = "") +
scale_x_continuous(breaks = 1:7,
labels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")) +
xlab("Weekday") + ylab("Count")
# Top-10 most-breached institutions.
plot_top10_breached <- breaches %>%
group_by(institution) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
head(10) %>%
ggplot(aes(x = reorder(institution, -n), y = n)) +
geom_bar(stat = "identity") +
xlab("Institution") + ylab("Breaches") +
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))
# Top-10 most-breached institutions reports).
plot_top10_reported <- breaches %>%
# Exclude AXP/Discover because they also have to report merchant data loss
filter(!grepl("AXP|Discover", institution)) %>%
group_by(institution) %>%
summarize(n = n_distinct(reported)) %>%
arrange(desc(n)) %>%
head(10) %>%
ggplot(aes(x = reorder(institution, -n), y = n)) +
geom_bar(stat = "identity") +
xlab("Institution") + ylab("Reports") +
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))
responsiveness <- breaches %>%
mutate(days = (breached %--% reported) / ddays(1), year = year(reported))
# Distribution of time from breach until report.
plot_resp_ecdf <- responsiveness %>%
ggplot(aes(x = days)) +
stat_ecdf() +
scale_x_log10(minor_breaks = NULL,
breaks = c(1, 7, 14, 30, 60, 90, 150, 365, 2 * 365, 3 * 365, 5 * 365),
labels = c("1d", "1w", "2w", "1m", "2m", "3m", "5m", "1y", "2y", "3y", "5y")) +
xlab("Responsiveness") +
ylab("ECDF")
# Same as above, but one ECDF per year.
plot_resp_ecdf_by_year <- responsiveness %>%
mutate(Year = factor(year)) %>%
ggplot(aes(x = days, group = Year, color = Year)) +
stat_ecdf() +
scale_x_log10(minor_breaks = NULL,
breaks = c(1, 7, 14, 30, 60, 90, 150, 365, 2 * 365, 3 * 365, 5 * 365),
labels = c("1d", "1w", "2w", "1m", "2m", "3m", "5m", "1y", "2y", "3y", "5y")) +
xlab("Responsiveness") +
ylab("ECDF")
# Did the industry get quicker at reporting?
plot_resp_bar_by_year <- responsiveness %>%
group_by(year) %>%
summarize(median = median(days, na.rm = TRUE)) %>%
ggplot(aes(x = year, y = median)) +
geom_bar(stat = "identity") +
xlab("Median responsiveness") +
ylab("Days")
# Top-10 best responsiveness.
plot_resp_best <- responsiveness %>%
group_by(institution) %>%
summarize(median = median(days, na.rm = TRUE)) %>%
arrange(median) %>%
head(10) %>%
ggplot(aes(x = reorder(institution, median), y = median)) +
geom_bar(stat = "identity") +
xlab("Institution") + ylab("Median Response Time (Days)") +
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))
# Top-10 worst responsiveness.
plot_resp_worst <- responsiveness %>%
mutate(years = days / 365) %>%
group_by(institution) %>%
summarize(median = median(years, na.rm = TRUE)) %>%
arrange(desc(median)) %>%
head(10) %>%
ggplot(aes(x = reorder(institution, -median), y = median)) +
geom_bar(stat = "identity") +
xlab("Institution") + ylab("Median Response Time (Years)") +
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))
ggsave("breaches-cumulative-reported.png", plot_cum_rep)
ggsave("breaches-by-year.png", plot_by_year)
ggsave("breaches-by-weekday.png", plot_wday)
ggsave("breaches-top10-most-breached.png", plot_top10_breached)
ggsave("breaches-top10-most-reported.png", plot_top10_reported)
ggsave("breaches-responsiveness-ecdf.png", plot_resp_ecdf)
ggsave("breaches-responsiveness-ecdf-by-year.png", plot_resp_ecdf_by_year)
ggsave("breaches-responsiveness-by-year-median.png", plot_resp_bar_by_year)
ggsave("breaches-responsiveness-best.png", plot_resp_best)
ggsave("breaches-responsiveness-worst.png", plot_resp_worst)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment