Skip to content

Instantly share code, notes, and snippets.

View anirudhjayaraman's full-sized avatar
🏠
Working from home

Anirudh Jayaraman anirudhjayaraman

🏠
Working from home
View GitHub Profile
# Combine group_by with mutate-----
# First, discard flights whose arrival delay equals NA. Next, create a by-carrier
# summary with a single variable: p_delay, the proportion of flights which are
# delayed at arrival. Next, create a new variable rank in the summary which is a
# rank according to p_delay. Finally, arrange the observations by this new rank
hflights %>%
filter(!is.na(ArrDelay)) %>%
group_by(UniqueCarrier) %>%
summarise(p_delay = sum(ArrDelay >0)/n()) %>%
@anirudhjayaraman
anirudhjayaraman / group_by.R
Created December 19, 2015 09:34
group_by illustrative examples
# group_by() -------------------------------------------------------------------------
# Generate a per-carrier summary of hflights with the following variables: n_flights,
# the number of flights flown by the carrier; n_canc, the number of cancelled flights;
# p_canc, the percentage of cancelled flights; avg_delay, the average arrival delay of
# flights whose delay does not equal NA. Next, order the carriers in the summary from
# low to high by their average arrival delay. Use percentage of flights cancelled to
# break any ties. Which airline scores best based on these statistics?
hflights %>%
@anirudhjayaraman
anirudhjayaraman / pipingOperator.r
Created December 19, 2015 07:47
Piping Operator %>% in dplyr
# %>% OPERATOR ----------------------------------------------------------------------
# with %>% operator
hflights %>%
mutate(diff = TaxiOut - TaxiIn) %>%
filter(!is.na(diff)) %>%
summarise(avg = mean(diff))
# without %>% operator
# arguments get further and further apart
select(arrange(filter(hflights, DepDelay > 360), TaxiIn + TaxiOut), TailNum)
# Generate summarizing statistics for hflights
summarise(hflights, n_obs = n(), n_carrier = n_distinct(UniqueCarrier),
n_dest = n_distinct(Dest), dest100 = nth(Dest, 100))
# Filter hflights to keep all American Airline flights: aa
aa <- filter(hflights, UniqueCarrier == "American")
# Generate summarizing statistics for aa
summarise(aa, n_flights = n(), n_canc = sum(Cancelled),
p_canc = 100*(n_canc/n_flights), avg_delay = mean(ArrDelay, na.rm = TRUE))
# Print out a summary with variables min_dist and max_dist
summarize(hflights, min_dist = min(Distance), max_dist = max(Distance))
# Print out a summary with variable max_div
summarize(filter(hflights, Diverted == 1), max_div = max(Distance))
# Remove rows that have NA ArrDelay: temp1
temp1 <- filter(hflights, !is.na(ArrDelay))
# Generate summary about ArrDelay column of temp1
# Definition of dtc
dtc <- filter(hflights, Cancelled == 1, !is.na(DepDelay))
# Arrange dtc by departure delays
arrange(dtc, DepDelay)
# Arrange dtc so that cancellation reasons are grouped
arrange(dtc, CancellationCode)
# Arrange dtc according to carrier and departure delays
# Summarizing Exercise
# Select the flights that had JFK as their destination: c1
c1 <- filter(hflights, Dest == 'JFK')
# Combine the Year, Month and DayofMonth variables to create a Date column: c2
c2 <- mutate(c1, Date = paste(Year, Month, DayofMonth, sep = "-"))
# Print out a selection of columns of c2
select(c2, Date, DepTime, ArrTime, TailNum)
# Combining tests using boolean operators
# All flights that departed before 5am or arrived after 10pm
filter(hflights, DepTime < 500 | ArrTime > 2200 )
# All flights that departed late but arrived ahead of schedule
filter(hflights, DepDelay > 0 & ArrDelay < 0)
# All cancelled weekend flights
filter(hflights, DayOfWeek %in% c(6,7) & Cancelled == 1)
# Print out all flights in hflights that traveled 3000 or more miles
filter(hflights, Distance > 3000)
# All flights flown by one of JetBlue, Southwest, or Delta
filter(hflights, UniqueCarrier %in% c('JetBlue', 'Southwest', 'Delta'))
# All flights where taxiing took longer than flying
filter(hflights, TaxiIn + TaxiOut > AirTime)