# anirudhjayaraman/group_by_miscellaneous.R

## group_by_miscellaneous.R
# Combine group_by with mutate-----

# First, discard flights whose arrival delay equals NA. Next, create a by-carrier
# summary with a single variable: p_delay, the proportion of flights which are
# delayed at arrival. Next, create a new variable rank in the summary which is a
# rank according to p_delay. Finally, arrange the observations by this new rank
# Per-carrier share of flights that arrived late, ranked from the smallest
# share to the largest. Flights without a recorded arrival delay are excluded
# before the share is computed. local() keeps the intermediate tables out of
# the global environment; its value is still auto-printed at top level.
local({
  arrived <- filter(hflights, !is.na(ArrDelay))
  late_share <- summarise(
    group_by(arrived, UniqueCarrier),
    p_delay = sum(ArrDelay > 0) / n()
  )
  ranked <- mutate(late_share, rank = rank(p_delay))
  arrange(ranked, rank)
})

# In a similar fashion, keep flights that are delayed (ArrDelay > 0 and not NA).
# Next, create a by-carrier summary with a single variable: avg, the average delay
# of the delayed flights. Again add a new variable rank to the summary according to
# avg. Finally, arrange by this rank variable.
# Mean arrival delay per carrier, computed over late arrivals only
# (ArrDelay present and strictly positive), ranked from the smallest mean
# to the largest. local() keeps the intermediate tables out of the global
# environment; its value is still auto-printed at top level.
local({
  late <- filter(hflights, !is.na(ArrDelay) & ArrDelay > 0)
  mean_delay <- summarise(
    group_by(late, UniqueCarrier),
    avg = mean(ArrDelay)
  )
  ranked <- mutate(mean_delay, rank = rank(avg))
  arrange(ranked, rank)
})

# Advanced group_by exercises-------------------------------------------------------

# Which plane (by tail number) flew out of Houston the most times? How many times?
# Name the column with this frequency n. Assign the result to adv1. To answer this
# question precisely, you will have to filter() as a final step to end up with only
# a single observation in adv1.
# Which plane (by tail number) flew out of Houston the most times? How many times? adv1
# adv1: the tail number(s) with the highest number of flights, together with
# that count in column n. Rows tied for the maximum are all kept.
adv1 <- local({
  flights_per_plane <- summarise(group_by(hflights, TailNum), n = n())
  # The per-plane table is no longer grouped, so max(n) is the overall maximum.
  filter(flights_per_plane, n == max(n))
})

# How many airplanes only flew to one destination from Houston? adv2
# How many airplanes only flew to one destination from Houston?
# Save the resulting dataset in adv2, that contains only a single column,
# named nplanes and a single row.
# adv2: a one-row, one-column table (nplanes) counting the tail numbers that
# appear with exactly one distinct destination.
adv2 <- local({
  dests_per_plane <- summarise(
    group_by(hflights, TailNum),
    n_dest = n_distinct(Dest)
  )
  single_dest <- filter(dests_per_plane, n_dest == 1)
  summarise(single_dest, nplanes = n())
})

# Find the most visited destination for each carrier and save your solution to adv3.
# Your solution should contain four columns:
# UniqueCarrier and Dest,
# n, how often a carrier visited a particular destination,
# rank, how each destination ranks per carrier. rank should be 1 for every row,
# as you want to find the most visited destination for each carrier.

# adv3: the most visited destination for each carrier, with columns
# UniqueCarrier, Dest, n (number of flights) and rank (always 1).
#
# summarise() removes only the innermost grouping level (Dest), so the counts
# remain grouped by UniqueCarrier and the ranking is computed within each
# carrier. The result is still grouped by UniqueCarrier.
#
# min_rank() is used rather than base rank(): rank() averages tied positions,
# so two destinations tied for first place would each receive 1.5 and the
# carrier would silently disappear from the result. min_rank() assigns rank 1
# to every tied leader, so such a carrier keeps all of its top destinations.
adv3 <- hflights %>%
  group_by(UniqueCarrier, Dest) %>%
  summarise(n = n()) %>%
  mutate(rank = min_rank(desc(n))) %>%
  filter(rank == 1)

# Find the carrier that travels to each destination the most: adv4
# For each destination, find the carrier that travels to that destination the most.
# Store the result in adv4. Again, your solution should contain 4 columns:
# Dest, UniqueCarrier, n and rank.

# adv4: for each destination, the carrier with the most flights to it, with
# columns Dest, UniqueCarrier, n (number of flights) and rank (always 1).
#
# summarise() removes only the innermost grouping level (UniqueCarrier), so the
# counts remain grouped by Dest and the ranking is computed within each
# destination. The result is still grouped by Dest.
#
# min_rank() is used rather than base rank(): rank() averages tied positions,
# so two carriers tied for first place would each receive 1.5 and the
# destination would silently disappear from the result. min_rank() assigns
# rank 1 to every tied leader, so such a destination keeps all top carriers.
adv4 <- hflights %>%
  group_by(Dest, UniqueCarrier) %>%
  summarise(n = n()) %>%
  mutate(rank = min_rank(desc(n))) %>%
  filter(rank == 1)
	# Combine group_by with mutate-----

	# First, discard flights whose arrival delay equals NA. Next, create a by-carrier
	# summary with a single variable: p_delay, the proportion of flights which are
	# delayed at arrival. Next, create a new variable rank in the summary which is a
	# rank according to p_delay. Finally, arrange the observations by this new rank
	# Proportion of late arrivals per carrier (flights lacking an arrival delay
	# are dropped first), ordered by ascending rank of that proportion.
	# local() scopes the intermediate tables; the value still auto-prints.
	local({
	  with_delay <- filter(hflights, !is.na(ArrDelay))
	  by_carrier <- group_by(with_delay, UniqueCarrier)
	  shares <- summarise(by_carrier, p_delay = sum(ArrDelay > 0) / n())
	  arrange(mutate(shares, rank = rank(p_delay)), rank)
	})

	# In a similar fashion, keep flights that are delayed (ArrDelay > 0 and not NA).
	# Next, create a by-carrier summary with a single variable: avg, the average delay
	# of the delayed flights. Again add a new variable rank to the summary according to
	# avg. Finally, arrange by this rank variable.
	# Average arrival delay per carrier over delayed flights only (ArrDelay
	# present and > 0), ordered by ascending rank of that average.
	# local() scopes the intermediate tables; the value still auto-prints.
	local({
	  delayed <- filter(hflights, !is.na(ArrDelay) & ArrDelay > 0)
	  by_carrier <- group_by(delayed, UniqueCarrier)
	  averages <- summarise(by_carrier, avg = mean(ArrDelay))
	  arrange(mutate(averages, rank = rank(avg)), rank)
	})

	# Advanced group_by exercises-------------------------------------------------------

	# Which plane (by tail number) flew out of Houston the most times? How many times?
	# Name the column with this frequency n. Assign the result to adv1. To answer this
	# question precisely, you will have to filter() as a final step to end up with only
	# a single observation in adv1.
	# Which plane (by tail number) flew out of Houston the most times? How many times? adv1
	# adv1: tail number(s) with the largest flight count; the count is stored
	# in column n. Any rows tied for the maximum are all retained.
	adv1 <- local({
	  counts <- summarise(group_by(hflights, TailNum), n = n())
	  # counts is ungrouped here, so max(n) spans every tail number.
	  filter(counts, n == max(n))
	})

	# How many airplanes only flew to one destination from Houston? adv2
	# How many airplanes only flew to one destination from Houston?
	# Save the resulting dataset in adv2, that contains only a single column,
	# named nplanes and a single row.
	# adv2: single-row table whose only column, nplanes, counts the tail
	# numbers seen with exactly one distinct destination.
	adv2 <- local({
	  by_plane <- group_by(hflights, TailNum)
	  dest_counts <- summarise(by_plane, n_dest = n_distinct(Dest))
	  summarise(filter(dest_counts, n_dest == 1), nplanes = n())
	})

	# Find the most visited destination for each carrier and save your solution to adv3.
	# Your solution should contain four columns:
	# UniqueCarrier and Dest,
	# n, how often a carrier visited a particular destination,
	# rank, how each destination ranks per carrier. rank should be 1 for every row,
	# as you want to find the most visited destination for each carrier.

	# adv3: the most visited destination for each carrier, with columns
	# UniqueCarrier, Dest, n (number of flights) and rank (always 1).
	#
	# summarise() removes only the innermost grouping level (Dest), so the
	# counts remain grouped by UniqueCarrier and the ranking is computed within
	# each carrier. The result is still grouped by UniqueCarrier.
	#
	# min_rank() replaces base rank(): rank() averages tied positions, so two
	# destinations tied for first would each get 1.5 and the carrier would
	# silently drop out of the result. min_rank() gives every tied leader 1.
	adv3 <- hflights %>%
	  group_by(UniqueCarrier, Dest) %>%
	  summarise(n = n()) %>%
	  mutate(rank = min_rank(desc(n))) %>%
	  filter(rank == 1)

	# Find the carrier that travels to each destination the most: adv4
	# For each destination, find the carrier that travels to that destination the most.
	# Store the result in adv4. Again, your solution should contain 4 columns:
	# Dest, UniqueCarrier, n and rank.

	# adv4: for each destination, the carrier with the most flights to it, with
	# columns Dest, UniqueCarrier, n (number of flights) and rank (always 1).
	#
	# summarise() removes only the innermost grouping level (UniqueCarrier), so
	# the counts remain grouped by Dest and the ranking is computed within each
	# destination. The result is still grouped by Dest.
	#
	# min_rank() replaces base rank(): rank() averages tied positions, so two
	# carriers tied for first would each get 1.5 and the destination would
	# silently drop out of the result. min_rank() gives every tied leader 1.
	adv4 <- hflights %>%
	  group_by(Dest, UniqueCarrier) %>%
	  summarise(n = n()) %>%
	  mutate(rank = min_rank(desc(n))) %>%
	  filter(rank == 1)