Last active
May 4, 2024 08:11
-
-
Save hrbrmstr/e73c4130902e22bbbf863c692bc2461b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
airline | voluntary_denied | involuntary_denied | enplaned_ct | involuntary_db_per_10k | year | |
---|---|---|---|---|---|---|
Hawaiian Airlines | 326 | 49 | 10824495 | 0.05 | 2016 | |
Delta Air Lines | 129825 | 1238 | 129281098 | 0.1 | 2016 | |
Virgin America | 2375 | 94 | 7945329 | 0.12 | 2016 | |
Alaska Airlines | 6806 | 931 | 23390900 | 0.4 | 2016 | |
United Airlines | 62895 | 3765 | 86836527 | 0.43 | 2016 | |
Spirit Airlines | 10444 | 1117 | 19418650 | 0.58 | 2016 | |
Frontier Airlines | 2096 | 851 | 14666332 | 0.58 | 2016 | |
American Airlines | 54259 | 8312 | 130894653 | 0.64 | 2016 | |
Jetblue Airways | 1705 | 3176 | 34710003 | 0.92 | 2016 | |
Skywest Airlines | 41476 | 2935 | 29986918 | 0.98 | 2016 | |
Southwest Airlines | 88628 | 14979 | 150655354 | 0.99 | 2016 | |
Expressjet Airlines | 33590 | 3182 | 21139038 | 1.51 | 2016 | |
Jetblue Airways | 1841 | 73 | 31949251 | 0.02 | 2015 | |
Hawaiian Airlines | 358 | 29 | 10462344 | 0.03 | 2015 | |
Virgin America | 1722 | 80 | 6928805 | 0.12 | 2015 | |
Delta Air Lines | 145406 | 1938 | 125044855 | 0.16 | 2015 | |
Spirit Airlines | 6589 | 496 | 16010164 | 0.31 | 2015 | |
Alaska Airlines | 5412 | 740 | 22095126 | 0.33 | 2015 | |
United Airlines | 81390 | 6317 | 82081914 | 0.77 | 2015 | |
American Airlines | 50317 | 7504 | 97091951 | 0.77 | 2015 | |
Frontier Airlines | 2744 | 1232 | 12343540 | 1 | 2015 | |
Southwest Airlines | 96513 | 15608 | 143932752 | 1.08 | 2015 | |
Skywest Airlines | 51829 | 5079 | 28562760 | 1.78 | 2015 | |
Expressjet Airlines | 42933 | 4608 | 24736601 | 1.86 | 2015 | |
Envoy Air | 18125 | 2792 | 11901028 | 2.35 | 2015 | |
Virgin America | 910 | 57 | 6438023 | 0.09 | 2014 | |
Hawaiian Airlines | 366 | 116 | 10084811 | 0.12 | 2014 | |
Jetblue Airways | 2006 | 650 | 29264332 | 0.22 | 2014 | |
Delta Air Lines | 107706 | 4052 | 115737180 | 0.35 | 2014 | |
American Airlines | 60924 | 7471 | 135748581 | 0.55 | 2014 | |
Alaska Airlines | 4176 | 864 | 19838878 | 0.44 | 2014 | |
Southwest Airlines | 88921 | 13899 | 125381374 | 1.11 | 2014 | |
United Airlines | 64968 | 9078 | 77317281 | 1.17 | 2014 | |
Frontier Airlines | 3864 | 1616 | 11787602 | 1.37 | 2014 | |
Envoy Air | 18615 | 2501 | 15441723 | 1.62 | 2014 | |
Expressjet Airlines | 55525 | 7961 | 29344974 | 2.71 | 2014 | |
Skywest Airlines | 42446 | 7170 | 26420593 | 2.71 | 2014 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' ---
#' title: "DisembaRking"
#' author: "@hrbrmstr"
#' date: ""
#' output:
#'   html_document:
#'     code_download: true
#' ---
#+ include=FALSE
# Chunk-wide defaults: keep package messages and warnings out of the
# rendered document.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
#+ letsgo | |
library(rvest) | |
library(stringi) | |
library(pdftools) | |
library(hrbrthemes) | |
library(tidyverse) | |
#' some URLs generate infinite redirection loops so be safe out there
# Wrapped reader: a failing fetch does not abort the script; callers inspect
# the `result` element of the returned value instead.
safe_read_html <- purrr::safely(read_html)
#' grab the individual page URLs for each month available in each year
main_urls <- c(
  "https://www.transportation.gov/airconsumer/air-travel-consumer-reports-2017",
  "https://www.transportation.gov/airconsumer/air-travel-consumer-reports-2016",
  "https://www.transportation.gov/airconsumer/air-travel-consumer-reports-2015"
) %>%
  # collect the href of every link on each yearly index page that mentions
  # a consumer report
  map(
    ~ read_html(.x) %>%
      html_nodes("a[href*='air-travel-consumer-report']") %>%
      html_attr("href")
  ) %>%
  flatten_chr() %>%
  # filter out URLs we don't need
  discard(stri_detect_regex, "feedback|/air-travel-consumer-reports") %>%
  # make them useful: prefix the site origin onto each href
  sprintf("https://www.transportation.gov%s", .)
#' now, read in all the individual pages.
#' do this separate from URL grabbing above and the PDF URL extraction
#' below just to be even safer.
pages <- map(main_urls, safe_read_html)
#' URLs that generate said redirection loops will not have a valid
#' result so ignore them and find the URLs for the monthly reports
pdf_urls <- pages %>%
  # drop fetches that produced no parsed document
  discard(~ is.null(.x$result)) %>%
  map("result") %>%
  # per page: hrefs of links containing 'pdf', restricted to those whose
  # href contains "ATCR"
  map(function(page) {
    page %>%
      html_nodes("a[href*='pdf']") %>%
      html_attr("href") %>%
      keep(stri_detect_fixed, "ATCR")
  }) %>%
  flatten_chr()
#' download them, being kind to the DoT server and not re-downloading
#' anything we've successfully downloaded already. I really wish this
#' was built-in functionality to download.file()
# showWarnings = FALSE: the script is meant to be re-run, so an already
# existing directory should not produce a warning.
dir.create("atcr_pdfs", showWarnings = FALSE)
walk(pdf_urls, function(pdf_url) {
  # Local target is the URL's file name inside atcr_pdfs/.
  dest <- file.path("atcr_pdfs", basename(pdf_url))
  if (!file.exists(dest)) {
    # PDFs are binary: mode = "wb" prevents the text-mode transfer that
    # corrupts binary files on Windows.
    download.file(pdf_url, dest, mode = "wb")
  }
})
#' read in each PDF; find the pages with the tables we need to scrape;
#' enable the text table to be read with read.table() and save the
#' results
denied <- file.path(
  "atcr_pdfs",
  c("2017MarchATCR.pdf", "2016MarchATCR_2.pdf", "2015MarchATCR_1.pdf")
) %>%
  map(pdf_text) %>%
  # of the pages containing the denied-boarding heading, take the second
  map(function(pdf_pages) {
    keep(pdf_pages, stri_detect_fixed, "PASSENGERS DENIED BOARDING")[[2]]
  }) %>%
  map(stri_split_lines) %>%
  map(flatten_chr) %>%
  map(function(page_lines) {
    # the first two lines matching Rank/RANK/TOTAL bound the table body
    marks <- which(stri_detect_regex(page_lines, "Rank|RANK|TOTAL"))
    table_span <- page_lines[marks[1]:marks[2]]
    # data rows are the lines that open with spaces followed by a digit
    grep("^\ +[[:digit:]]", table_span, value = TRUE) %>%
      stri_trim() %>%
      # drop asterisks that trail a letter
      stri_replace_all_regex("([[:alpha:]])\\*+", "$1") %>%
      # join a space-separated word onto the previous token with "_"
      # (presumably so multi-word names stay a single read.table() field)
      stri_replace_all_regex(" ([[:alpha:]])", "_$1") %>%
      paste0(collapse = "\n") %>%
      read.table(text = ., header = FALSE, stringsAsFactors = FALSE)
  })
denied
# Stack the per-report tables into one tibble, tagging each with a year.
# Years 2016:2014 are paired positionally with the tables in `denied`
# (NOTE(review): presumably each March report covers the previous calendar
# year -- confirm against the PDFs).
denied <- map2_df(2016:2014, denied, function(report_year, raw_tbl) {
  # Keep the first six parsed columns by position and attach `year` by name,
  # rather than assuming the appended year column lands at index 11 (which
  # only holds when the parsed table has exactly 10 columns).
  yr_tbl <- set_names(
    raw_tbl[, 1:6],
    c("rank", "airline", "voluntary_denied", "involuntary_denied",
      "enplaned_ct", "involuntary_db_per_10k")
  )
  yr_tbl$year <- report_year
  yr_tbl %>%
    # undo the "_" word joins and normalise names to title case
    mutate(
      airline = stri_trans_totitle(
        stri_trim(stri_replace_all_fixed(airline, "_", " "))
      )
    ) %>%
    # re-guess the types of columns that were read in as character
    readr::type_convert() %>%
    # as_tibble() replaces the deprecated tbl_df()
    as_tibble()
}) %>%
  select(-rank)
glimpse(denied)
denied
# Plot data: only airlines that have three rows (one per year) are kept.
plot_df <- denied %>%
  select(airline, year, involuntary_db_per_10k) %>%
  group_by(airline) %>%
  filter(n() == 3) %>%
  ungroup() %>%
  # year becomes a factor with one extra level (max year + 1) appended
  # after the real years, ordered ascending
  mutate(year = factor(year, rev(c(max(year) + 1, unique(year)))))
str(plot_df$year)
# Text geoms use the font_rc family at size 3.
update_geom_font_defaults(font_rc, size = 3)
#+ fig.width=7.5, fig.height=11
ggplot() +
  # one line per airline across the year levels
  geom_line(
    data = plot_df,
    mapping = aes(year, involuntary_db_per_10k, group = airline, colour = airline)
  ) +
  # labels "<airline> (<rate>)" on the 2016 points; the per-label nudge_y
  # values are hand-tuned offsets, one per labelled row
  geom_text(
    data = filter(plot_df, year == "2016") %>%
      mutate(lbl = sprintf("%s (%s)", airline, involuntary_db_per_10k)),
    mapping = aes(x = year, y = involuntary_db_per_10k, label = lbl, colour = airline),
    hjust = 0,
    nudge_x = 0.015,
    nudge_y = c(0, 0, 0, 0, 0, 0, 0, 0, -0.0005, 0.03, 0)
  ) +
  # drop = FALSE keeps the unused extra year level on the axis; it is
  # labelled with an empty string
  scale_x_discrete(expand = c(0, 0), labels = c(2014:2016, ""), drop = FALSE) +
  scale_y_continuous(trans = "log1p") +
  ggthemes::scale_color_tableau() +
  labs(
    x = NULL,
    y = NULL,
    title = "Involuntary Disembark Rate Per 10K Passengers",
    subtitle = "Y-axis log scale; Only included airlines with 3-year span data",
    caption = "Source: U.S. DoT Air Travel Consumer Reports <https://www.transportation.gov/airconsumer/air-travel-consumer-reports>"
  ) +
  theme_ipsum_rc(grid = "X") +
  theme(
    plot.caption = element_text(hjust = 0),
    legend.position = "none"
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment