Skip to content

Instantly share code, notes, and snippets.

@dggoldst
Last active September 30, 2015 21:47
Show Gist options
  • Save dggoldst/a31f6a64426a0ccb5ec3 to your computer and use it in GitHub Desktop.
Save dggoldst/a31f6a64426a0ccb5ec3 to your computer and use it in GitHub Desktop.
# Data from http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time
# Request the following fields (the name shown in the download UI is in parentheses):
# "DAY_OF_WEEK" (DayOfWeek)
# "FL_DATE" (FlightDate)
# "CARRIER" (Carrier)
# "ORIGIN_CITY_MARKET_ID" (OriginCityMarketID)
# "ORIGIN" (Origin)
# "CRS_DEP_TIME" (CRSDepTime)
# "DEP_DELAY" (DepDelay)
# "ARR_DELAY" (ArrDelay)
# Save the downloaded files with names of the form 01_2013.csv.gz, 02_2013.csv.gz, etc.,
# and put them in a subdirectory called "database".
library(plyr)
library(tidyr)
library(lubridate)
library(ggplot2)
library(dplyr)
setwd("C:/Dropbox/Projects/20141105_OnTimeDeparture/")

# Load the 2013 flight records into `df`.
# If the cached .Rdata file is missing, build it from the CSV files in the
# "database" subdirectory (this used to be a manual, commented-out first-run
# step). When the cache already exists the behaviour is unchanged: it is
# simply loaded.
if (!file.exists("2013flights.Rdata")) {
  paths <- dir("database", pattern = "_2013.csv", full.names = TRUE)
  if (length(paths) == 0) {
    stop("No *_2013.csv files found in the 'database' subdirectory",
         call. = FALSE)
  }
  names(paths) <- basename(paths)
  # takes about 4 mins
  df <- ldply(paths, read.csv)
  # .id is the per-file label ldply adds from names(paths).
  # X is presumably an empty trailing column created by read.csv -- confirm.
  df$.id <- NULL
  df$X <- NULL
  # The renaming below is positional, so fail loudly if the layout differs.
  stopifnot(ncol(df) == 8)
  names(df) <- c("day_of_week", "date", "carrier", "origin_market_id",
                 "airport", "departure_hour", "delay", "arr_delay")
  save(df, file = "2013flights.Rdata")
} else {
  # takes only a few secs
  load(file = "2013flights.Rdata")
}
str(df)
# Clean the raw flight table:
#   * label day_of_week codes 1-7 as Monday..Sunday and 9 as "Unknown"
#   * split month and day-of-month out of the date string
#     (assumes dates look like YYYY-MM-DD -- confirm against the raw files)
#   * reduce the scheduled departure time to its hour of day
#   * treat early departures/arrivals (negative delay) as zero delay
#   * keep flights scheduled to depart from hour 6 through hour 23
df <- df %>%
  mutate(
    day_of_week = factor(
      day_of_week,
      levels = c(1:7, 9),
      labels = c(
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
        "Unknown"
      )
    ),
    month = substr(date, 6, 7),
    datenum = substr(date, 9, 10),
    # The value is divided by 100 to get the hour, i.e. it is treated as HHMM.
    # Integer division keeps the hour part only. The previous round(x / 100)
    # moved departures at about 50 minutes past the hour or later into the
    # next hour (and 23:5x into hour 24, which the filter below then dropped).
    departure_hour = departure_hour %/% 100,
    delay = pmax(delay, 0),
    arr_delay = pmax(arr_delay, 0)
  ) %>%
  filter(departure_hour > 5 & departure_hour < 24)
### Arrival and departure delays as a function of departure time
# Stack the columns from delay through arr_delay into long form, then compute
# the mean, the standard error of the mean and the non-missing count for each
# (departure hour, delay type) pair.
plot_data <- df %>%
  gather(delay_type, newdelay, delay:arr_delay) %>%
  mutate(
    delay_type = ifelse(delay_type == "delay",
                        "Departure Delay",
                        "Arrival Delay")
  ) %>%
  group_by(departure_hour, delay_type) %>%
  dplyr::summarise(
    mu = mean(newdelay, na.rm = TRUE),
    se = sqrt(var(newdelay, na.rm = TRUE) / sum(!is.na(newdelay))),
    obs = sum(!is.na(newdelay))
  )
#MS: W00t!
# Mean delay by departure hour, one line per delay type, with error bars
# spanning mu - se to mu + se. Saved as both PDF and PNG.
p <- ggplot(
  plot_data,
  aes(
    x = departure_hour,
    y = mu,
    ymin = mu - se,
    ymax = mu + se,
    group = delay_type,
    colour = delay_type
  )
) +
  geom_line() +
  geom_point() +
  geom_errorbar(width = .33) +
  scale_x_continuous(breaks = 6:23) +
  labs(
    x = "Hour of Day",
    y = "Average Delay (Minutes)",
    title = "Flight Delays by Departure Time"
  ) +
  theme(legend.position = "bottom") +
  scale_colour_discrete(name = "Delay Type")
p
ggsave(filename = "Flight_Delays_By_Hour_DelayType.pdf", plot = p,
       width = 6, height = 4)
ggsave(filename = "Flight_Delays_By_Hour_DelayType.png", plot = p,
       width = 6, height = 4)
#### For every day of the year
# Median departure delay per (month, day-of-month).
# Note: se is the standard error of the *mean* delay, not of the median.
plot_data <- df %>%
  group_by(month, datenum) %>%
  dplyr::summarise(
    mu = median(delay, na.rm = TRUE),
    se = sqrt(var(delay, na.rm = TRUE) / sum(!is.na(delay))),
    obs = sum(!is.na(delay))
  )
# Median departure delay by day of month, one facet row per month.
# Saved as both PDF and PNG.
p <- ggplot(
  plot_data,
  aes(x = datenum, y = mu, ymin = mu - se, ymax = mu + se, group = month)
) +
  geom_line() +
  geom_point() +
  scale_y_continuous(breaks = c(0, 10)) +
  # Fix the visible y range without dropping any data points.
  coord_cartesian(ylim = c(-4, 16)) +
  labs(
    x = "Day of month",
    y = "Median Departure Delay (Minutes)",
    title = "Median Flight Delays by Departure Date"
  ) +
  facet_grid(month ~ .) +
  # theme_bw() is a complete theme: adding it after theme() would discard the
  # legend.position setting, so it has to come first.
  theme_bw() +
  theme(legend.position = "bottom")
p
ggsave(filename = "Flight_Delays_By_Departure_Date.pdf", plot = p,
       width = 8, height = 6)
ggsave(filename = "Flight_Delays_By_Departure_Date.png", plot = p,
       width = 8, height = 6)
### Just the 10 busiest airports
# en.wikipedia.org/wiki/List_of_the_busiest_airports_in_the_United_States
# Busiest_US_airports_by_total_passenger_boardings
# The order of this vector matters: it defines the factor levels of `airport`,
# which the plots below split into levels 1-5 and levels 6-10.
busiest_airports <- c(
  "ATL",
  "LAX",
  "ORD",
  "DFW",
  "DEN",
  "JFK",
  "SFO",
  "CLT",
  "LAS",
  "PHX"
)
# Mean departure delay, its standard error and the non-missing count for each
# (departure hour, airport) pair.
plot_data <- df %>%
  filter(airport %in% busiest_airports) %>%
  group_by(departure_hour, airport) %>%
  dplyr::summarise(
    mu = mean(delay, na.rm = TRUE),
    se = sqrt(var(delay, na.rm = TRUE) / sum(!is.na(delay))),
    obs = sum(!is.na(delay))
  ) %>%
  mutate(
    # Blank out means that are essentially zero. NA_real_ keeps the column
    # numeric even when every value in a group is blanked.
    mu = ifelse(mu < .001, NA_real_, mu),
    airport = factor(airport, levels = busiest_airports)
  )
# Mean departure delay by hour for the airports whose factor level is 1-5.
p <- ggplot(
  filter(plot_data, as.integer(airport) <= 5),
  aes(
    x = departure_hour,
    y = mu,
    ymin = mu - se,
    ymax = mu + se,
    group = airport,
    colour = airport,
    shape = airport
  )
) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 5:23) +
  labs(
    x = "Hour of Day",
    y = "Average Departure Delay (Minutes)",
    title = "Top Five Most Popular Airports"
  ) +
  theme(legend.position = "bottom") +
  # Same name on both scales so colour and shape share one legend title.
  scale_colour_discrete(name = "Airport") +
  scale_shape_discrete(name = "Airport")
p
ggsave(filename = "Flight_Delays_By_Hour_Airport_Top5.png", plot = p,
       width = 6, height = 4)
# Mean departure delay by hour for the airports whose factor level is 6-10.
p <- ggplot(
  filter(plot_data, as.integer(airport) > 5),
  aes(
    x = departure_hour,
    y = mu,
    ymin = mu - se,
    ymax = mu + se,
    group = airport,
    colour = airport,
    shape = airport
  )
) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 5:23) +
  labs(
    x = "Hour of Day",
    y = "Average Departure Delay (Minutes)",
    title = "Airports Six through Ten"
  ) +
  theme(legend.position = "bottom") +
  # Same name on both scales so colour and shape share one legend title.
  scale_colour_discrete(name = "Airport") +
  scale_shape_discrete(name = "Airport")
p
ggsave(filename = "Flight_Delays_By_Hour_Airport_6to10.png", plot = p,
       width = 6, height = 4)
### Just the 95% and 75% quantiles
# Per departure hour: the 95th and 75th percentiles of departure delay and
# the number of non-missing delays.
plot_data <- df %>%
  group_by(departure_hour) %>%
  dplyr::summarise(
    Quantile_95 = quantile(delay, probs = .95, na.rm = TRUE),
    Quantile_75 = quantile(delay, probs = .75, na.rm = TRUE),
    obs = sum(!is.na(delay))
  )
# Long form for plotting; the level order puts the 95th percentile first.
plot_data2 <- plot_data %>%
  gather(variable, value, Quantile_75:Quantile_95) %>%
  mutate(
    variable = factor(variable, levels = c("Quantile_95", "Quantile_75"))
  )
# One line per quantile across departure hours. Saved as both PDF and PNG.
p <- ggplot(
  plot_data2,
  aes(
    x = departure_hour,
    y = value,
    group = variable,
    colour = variable
  )
) +
  geom_line() +
  scale_x_continuous(breaks = 5:23) +
  labs(
    x = "Hour of Day",
    y = "Departure Delay (Minutes)",
    title = "95th and 75th Percentiles of Departure Delays"
  ) +
  scale_colour_discrete(name = "Quantile") +
  theme(legend.position = "bottom")
p
ggsave(filename = "Flight_Delays_By_Hour_95th.pdf", plot = p,
       width = 6, height = 4)
ggsave(filename = "Flight_Delays_By_Hour_95th.png", plot = p,
       width = 6, height = 4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment