Skip to content

Instantly share code, notes, and snippets.

@smc-dta
Forked from hrbrmstr/denied.csv
Created April 16, 2017 05:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save smc-dta/38a7d33e79543212780a99e48afb91fc to your computer and use it in GitHub Desktop.
Save smc-dta/38a7d33e79543212780a99e48afb91fc to your computer and use it in GitHub Desktop.
airline voluntary_denied involuntary_denied enplaned_ct involuntary_db_per_10k year
Hawaiian Airlines 326 49 10824495 0.05 2016
Delta Air Lines 129825 1238 129281098 0.1 2016
Virgin America 2375 94 7945329 0.12 2016
Alaska Airlines 6806 931 23390900 0.4 2016
United Airlines 62895 3765 86836527 0.43 2016
Spirit Airlines 10444 1117 19418650 0.58 2016
Frontier Airlines 2096 851 14666332 0.58 2016
American Airlines 54259 8312 130894653 0.64 2016
Jetblue Airways 1705 3176 34710003 0.92 2016
Skywest Airlines 41476 2935 29986918 0.98 2016
Southwest Airlines 88628 14979 150655354 0.99 2016
Expressjet Airlines 33590 3182 21139038 1.51 2016
Jetblue Airways 1841 73 31949251 0.02 2015
Hawaiian Airlines 358 29 10462344 0.03 2015
Virgin America 1722 80 6928805 0.12 2015
Delta Air Lines 145406 1938 125044855 0.16 2015
Spirit Airlines 6589 496 16010164 0.31 2015
Alaska Airlines 5412 740 22095126 0.33 2015
United Airlines 81390 6317 82081914 0.77 2015
American Airlines 50317 7504 97091951 0.77 2015
Frontier Airlines 2744 1232 12343540 1 2015
Southwest Airlines 96513 15608 143932752 1.08 2015
Skywest Airlines 51829 5079 28562760 1.78 2015
Expressjet Airlines 42933 4608 24736601 1.86 2015
Envoy Air 18125 2792 11901028 2.35 2015
Virgin America 910 57 6438023 0.09 2014
Hawaiian Airlines 366 116 10084811 0.12 2014
Jetblue Airways 2006 650 29264332 0.22 2014
Delta Air Lines 107706 4052 115737180 0.35 2014
American Airlines 60924 7471 135748581 0.55 2014
Alaska Airlines 4176 864 19838878 0.44 2014
Southwest Airlines 88921 13899 125381374 1.11 2014
United Airlines 64968 9078 77317281 1.17 2014
Frontier Airlines 3864 1616 11787602 1.37 2014
Envoy Air 18615 2501 15441723 1.62 2014
Expressjet Airlines 55525 7961 29344974 2.71 2014
Skywest Airlines 42446 7170 26420593 2.71 2014
#' ---
#' title: "DisembaRking"
#' author: "@hrbrmstr"
#' date: ""
#' output:
#'   html_document:
#'     code_download: true
#' ---
#+ include=FALSE
# Turn off message and warning output for every chunk in the rendered report.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
#+ letsgo
library(rvest)
library(stringi)
library(pdftools)
library(hrbrthemes)
library(tidyverse)
#' Some URLs generate infinite redirection loops, so wrap `read_html()` with
#' `safely()`; callers inspect the `result` element of what it returns.
safe_read_html <- purrr::safely(read_html, otherwise = NULL, quiet = TRUE)
#' Visit each yearly index page and collect the links to the individual
#' monthly report pages it lists.
main_urls <- c(
  "https://www.transportation.gov/airconsumer/air-travel-consumer-reports-2017",
  "https://www.transportation.gov/airconsumer/air-travel-consumer-reports-2016",
  "https://www.transportation.gov/airconsumer/air-travel-consumer-reports-2015"
) %>%
  map(~{
    index_page <- read_html(.x)
    report_links <- html_nodes(index_page, "a[href*='air-travel-consumer-report']")
    html_attr(report_links, 'href')
  }) %>%
  flatten_chr() %>%
  # drop links matching "feedback" or the plural "/air-travel-consumer-reports" path
  discard(~stri_detect_regex(.x, "feedback|/air-travel-consumer-reports")) %>%
  # prefix the host (the hrefs are presumably site-relative -- confirm)
  sprintf("https://www.transportation.gov%s", .)
#' Fetch every individual monthly page through the safe reader. This is
#' deliberately its own step, separate from the URL harvesting before it and
#' the PDF link extraction after it, to be extra careful.
pages <- main_urls %>%
  map(safe_read_html)
#' URLs that hit a redirection loop come back without a valid `result`, so
#' ignore them, then pull the monthly report PDF links out of the rest.
pdf_urls <- pages %>%
  keep(~!is.null(.x$result)) %>%
  map("result") %>%
  map(function(doc) {
    hrefs <- html_attr(html_nodes(doc, "a[href*='pdf']"), 'href')
    # only links whose href contains "ATCR" are kept
    keep(hrefs, stri_detect_fixed, "ATCR")
  }) %>%
  flatten_chr()
#' Download the PDFs, being kind to the DoT server by skipping any file that
#' already exists locally. (It would be nice if download.file() did this
#' itself.) PDFs are binary, so `mode = "wb"` is required: without it Windows
#' performs a text-mode transfer and rewrites line endings, corrupting the
#' file. `showWarnings = FALSE` keeps re-runs quiet once the directory exists.
dir.create("atcr_pdfs", showWarnings = FALSE)
walk(pdf_urls, function(pdf_url) {
  dest <- file.path("atcr_pdfs", basename(pdf_url))
  if (!file.exists(dest)) {
    download.file(pdf_url, dest, mode = "wb")
  }
})
#' For each of the three March reports: read the PDF text, locate the page
#' holding the denied-boarding table, massage its rows so that read.table()
#' can parse them, and keep the resulting data frames in a list.
denied <- c("2017MarchATCR.pdf", "2016MarchATCR_2.pdf", "2015MarchATCR_1.pdf") %>%
  file.path("atcr_pdfs", .) %>%
  map(function(pdf_path) {
    page_text <- pdf_text(pdf_path)
    # take the second page that contains the table heading text
    table_page <- keep(page_text, stri_detect_fixed, "PASSENGERS DENIED BOARDING")[[2]]
    page_lines <- stri_split_lines(table_page)[[1]]
    # the table body is searched for between the first two marker lines
    marks <- which(stri_detect_regex(page_lines, "Rank|RANK|TOTAL"))
    # data rows start with leading spaces followed by a digit
    table_rows <- grep("^\ +[[:digit:]]", page_lines[marks[1]:marks[2]], value=TRUE)
    table_rows <- stri_trim(table_rows)
    # drop asterisks that trail a letter
    table_rows <- stri_replace_all_regex(table_rows, "([[:alpha:]])\\*+", "$1")
    # glue multi-word names together with "_" so each is a single field
    table_rows <- stri_replace_all_regex(table_rows, " ([[:alpha:]])", "_$1")
    read.table(text=paste0(table_rows, collapse="\n"), header=FALSE, stringsAsFactors=FALSE)
  })
denied
# Combine the per-report tables into one tidy tibble.
# Each table is tagged with its year (2016, 2015, 2014, in the same order as
# the list of scraped tables), reduced to the first six columns plus `year`,
# given readable column names, and has the "_" placeholders in airline names
# turned back into spaces before title-casing.
# `year` is selected by name rather than by a hard-coded position, and
# as_tibble() replaces the deprecated tbl_df().
denied <- map2_df(2016:2014, denied, function(yr, tbl) {
  tbl$year <- yr
  tbl[, c(names(tbl)[1:6], "year")] %>%
    set_names(c("rank", "airline", "voluntary_denied", "involuntary_denied",
                "enplaned_ct", "involuntary_db_per_10k", "year")) %>%
    mutate(airline = stri_trans_totitle(stri_trim(stri_replace_all_fixed(airline, "_", " ")))) %>%
    # re-guess column types for any columns read.table() left as character
    readr::type_convert() %>%
    as_tibble()
}) %>%
  select(-rank)
glimpse(denied)
denied
# Build the plotting frame: keep only airlines that have a row in all three
# years, then turn `year` into a factor ordered oldest to newest with one
# extra trailing level (max year + 1) -- presumably to leave room on the
# right of the x axis for the text labels.
plot_df <- denied %>%
  select(airline, year, involuntary_db_per_10k) %>%
  group_by(airline) %>%
  filter(n() == 3) %>%
  ungroup() %>%
  mutate(year = factor(year, levels = rev(c(max(year) + 1, unique(year)))))
str(plot_df$year)
update_geom_font_defaults(font_rc, size = 3)
#+ fig.width=7.5, fig.height=11
# One line per airline across the years, with a text label placed at each
# airline's 2016 value.
ggplot() +
  geom_line(
    data = plot_df,
    mapping = aes(year, involuntary_db_per_10k, group = airline, colour = airline)
  ) +
  geom_text(
    data = mutate(
      filter(plot_df, year == '2016'),
      lbl = sprintf("%s (%s)", airline, involuntary_db_per_10k)
    ),
    mapping = aes(x = year, y = involuntary_db_per_10k, label = lbl, colour = airline),
    hjust = 0,
    nudge_x = 0.015,
    # hand-tuned per-label vertical offsets (11 values).
    # NOTE(review): confirm this matches the number of 2016 label rows.
    nudge_y = c(0, 0, 0, 0, 0, 0, 0, 0, -0.0005, 0.03, 0)
  ) +
  # drop = FALSE keeps the unused trailing factor level; it gets a blank label
  scale_x_discrete(expand = c(0, 0), labels = c(2014:2016, ""), drop = FALSE) +
  scale_y_continuous(trans = "log1p") +
  ggthemes::scale_color_tableau() +
  labs(
    x = NULL,
    y = NULL,
    title = "Involuntary Disembark Rate Per 10K Passengers",
    subtitle = "Y-axis log scale; Only included airlines with 3-year span data",
    caption = "Source: U.S. DoT Air Travel Consumer Reports <https://www.transportation.gov/airconsumer/air-travel-consumer-reports>"
  ) +
  theme_ipsum_rc(grid = "X") +
  theme(
    plot.caption = element_text(hjust = 0),
    legend.position = "none"
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment