tejseth/gist:947f468661d91ca8d69e2e8d7f1daea5

## gistfile1.txt
#install.packages("tidyverse")
#install.packages("forcats")
#install.packages("hrbrthemes")
#install.packages("viridis")
#install.packages("ggrepel")

#Take out the '#' when installing the packages. I only included that in case you've already installed them.
#Tidyverse is the main one you need and the others are just for design.

library(tidyverse)
library(forcats)
library(hrbrthemes)
library(viridis)
library(ggrepel)
#You have to load the packages every time you fire up R. It takes a couple seconds

#After importing the data set by clicking on "Environment" and then "Import Dataset", pick "From Text (base)" and set headers to yes
#Make sure to rename the data set to "mogo_full_trip_list" if you want to follow along!
# Trip count plus mean/median duration for every (start id, end id) station pair.
start_end_stats <- mogo_full_trip_list %>%
  group_by(Start_Station_Id, End_Station_Id) %>% # one group per station pair
  summarize(
    count = n(), # number of trips (rows) in the pair
    avg_trip_duration_sec = mean(Duration, na.rm = TRUE),
    median_trip_duration_sec = median(Duration, na.rm = TRUE)
  ) %>%
  # summarize() only removes the last grouping variable; drop the remaining one
  # so later verbs do not silently run once per start station.
  ungroup() %>%
  # No filter(!is.na(count)) step: n() never returns NA, so it removed nothing.
  mutate(
    avg_trip_min = avg_trip_duration_sec / 60, # seconds -> minutes
    median_trip_min = median_trip_duration_sec / 60
  )

# Same per-pair summary as start_end_stats, but keyed by station names, with an
# extra "<start> to <end>" label column (e.g. "Ford Field to MGM").
start_end_names <- mogo_full_trip_list %>%
  group_by(Start_Station_Name, End_Station_Name) %>%
  summarize(
    count = n(), # number of trips (rows) in the pair
    avg_trip_duration_sec = mean(Duration, na.rm = TRUE),
    median_trip_duration_sec = median(Duration, na.rm = TRUE)
  ) %>%
  # summarize() only removes the last grouping variable; drop the remaining one
  # so later verbs do not silently run once per start station.
  ungroup() %>%
  # No filter(!is.na(count)) step: n() never returns NA, so it removed nothing.
  mutate(
    avg_trip_min = avg_trip_duration_sec / 60, # seconds -> minutes
    median_trip_min = median_trip_duration_sec / 60
  ) %>%
  # remove = FALSE keeps the two original name columns next to the new label.
  unite(
    "Start.and.End.of.Trip",
    Start_Station_Name:End_Station_Name,
    sep = " to ",
    remove = FALSE
  )

# Data for the bar chart: only start/end pairs with more than 1500 rides.
bar_chart <- filter(start_end_names, count > 1500)

# Turn the trip label into a factor whose levels are sorted by the median of
# count per label, so the bars are drawn in count order instead of alphabetically.
bar_chart[["Start.and.End.of.Trip"]] <- reorder(
  bar_chart[["Start.and.End.of.Trip"]],
  bar_chart[["count"]],
  FUN = median
)

# Horizontal bar chart of ride counts for the start/end pairs kept in bar_chart.
# NOTE(review): the title says "Top 25", but bar_chart is built with a
# count > 1500 cutoff rather than a top-n selection - confirm the two agree.
popular_locations_plot <-
  ggplot(bar_chart, aes(x = Start.and.End.of.Trip, y = count)) +
  # Fill by count itself. Mapping desc(count) fills by -count, which titles the
  # legend "desc(count)" and fills it with negative ride numbers.
  geom_col(aes(fill = count)) +
  # ggplot2's default gradient with its two ends swapped, so the busiest routes
  # keep the darkest bars.
  scale_fill_gradient(
    low = "#56B1F7", high = "#132B43", name = "Amount of Rides"
  ) +
  coord_flip() + # put the trip labels on the vertical axis
  theme_bw() +
  labs(
    y = "Amount of Rides",
    x = "Start and End Location of Each Trip",
    title = "The Top 25 Most Popular Locations and Their Amount of Rides",
    caption = "Data from Stirista | Graph by Tej Seth"
  ) +
  theme(
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(size = 14, hjust = 0.5),
    plot.caption = element_text(size = 12)
  ) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) # ~10 tidy breaks
popular_locations_plot # display the chart
# Hand the plot to ggsave() explicitly instead of relying on last_plot().
ggsave("popular_locations.png", plot = popular_locations_plot, dpi = 300)

# Label every trip by whether it ends at the station it started from and add
# its duration in minutes. Trips whose label is NA (the id comparison could not
# be evaluated) are dropped.
mogo_full_trip_list <- mogo_full_trip_list %>%
  mutate(
    ends_same = if_else(
      Start_Station_Id == End_Station_Id,
      "Starts and Ends At Same Location",
      "Starts and Ends At Different Location"
    ),
    minutes = Duration / 60
  ) %>%
  filter(!is.na(ends_same))

# Treat rides of 50 minutes or more as outliers and leave them out.
filtered_trip_list <- filter(mogo_full_trip_list, minutes < 50)

# Boxplot of ride length in minutes, one box per ends_same category.
p <- ggplot(
  filtered_trip_list,
  aes(x = minutes, y = ends_same, fill = ends_same)
) +
  geom_boxplot() +
  scale_fill_viridis(alpha = 0.6, discrete = TRUE) +
  theme_ipsum() +
  labs(
    title = "Rides That End at the Same Location Have More Volatility in Minutes",
    x = "Minutes Per Ride",
    y = "",
    caption = "Data from Stirista | Graph by Tej Seth"
  ) +
  theme(
    plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(size = 14, hjust = 0.5),
    plot.caption = element_text(size = 10),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 14)
  )
# fill and y show the same variable, so the legend adds nothing: hide it in the
# displayed plot (p itself keeps its legend).
p + theme(legend.position = "none")
# No plot argument: ggsave() writes the most recent plot, i.e. the one above.
ggsave("tide_type_minutes.png", dpi = 300)

# Ride count and mean ride length for each pass type / ends_same combination,
# keeping only combinations with more than 300 rides.
ends_same_stats <- filter(filtered_trip_list, !is.na(Product_Name)) %>%
  group_by(Product_Name, ends_same) %>%
  summarize(count = n(), avg_min = mean(minutes)) %>%
  filter(count > 300)

# Stacked bar chart: rides per ends_same category, stacked by pass type.
ggplot(ends_same_stats, aes(x = ends_same, y = count, fill = Product_Name)) +
  geom_col(position = "stack") + # same as geom_bar(stat = "identity")
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  scale_fill_viridis(discrete = TRUE) +
  theme_ipsum() +
  labs(
    x = "",
    y = "Amount of Rides",
    title = "Which Passes Are Being Used and Where?",
    caption = "Data from Stirista | Graph by Tej Seth"
  ) +
  theme(
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 14),
    plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(size = 14, hjust = 0.5),
    plot.caption = element_text(size = 10)
  ) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10))
# No plot argument: ggsave() writes the most recent plot, i.e. the one above.
ggsave("mogo_stack_bar.png", dpi = 300)

# Ride count and mean ride length for each pass type at each start station,
# listed from the most-used combination down.
passes_at_locations <- filter(filtered_trip_list, !is.na(Product_Name)) %>%
  group_by(Product_Name, Start_Station_Name) %>%
  summarize(count = n(), avg_min = mean(minutes)) %>%
  arrange(-count)

# Per start station: ride count and mean ride length for "Annual Pass" rides.
annual_passes <- filtered_trip_list %>%
  filter(Product_Name %in% "Annual Pass") %>%
  group_by(Start_Station_Name) %>%
  summarise(annual_count = n(), annual_min = mean(minutes))

# Per start station: ride count and mean ride length for "Daily Pass" rides.
daily_pass <- filtered_trip_list %>%
  filter(Product_Name %in% "Daily Pass") %>%
  group_by(Start_Station_Name) %>%
  summarise(daily_count = n(), daily_min = mean(minutes))

# Put both summaries side by side. merge() keeps only the stations that occur
# in both tables.
annual_daily <- merge(
  x = annual_passes,
  y = daily_pass,
  by = "Start_Station_Name"
)
	#install.packages("tidyverse")
	#install.packages("forcats")
	#install.packages("hrbrthemes")
	#install.packages("viridis")
	#install.packages("ggrepel")

	#Take out the '#' when installing the packages. I only included that in case you've already installed them.
	#Tidyverse is the main one you need and the others are just for design.

	library(tidyverse)
	library(forcats)
	library(hrbrthemes)
	library(viridis)
	library(ggrepel)
	#You have to load the packages every time you fire up R. It takes a couple seconds

	#After importing the data set by clicking on "Environment" and then "Import Dataset", pick "From Text (base)" and set headers to yes
	#Make sure to rename the data set to "mogo_full_trip_list" if you want to follow along!
	# Trip count plus mean/median duration for every (start id, end id) station pair.
	start_end_stats <- mogo_full_trip_list %>%
	  group_by(Start_Station_Id, End_Station_Id) %>% # one group per station pair
	  summarize(
	    count = n(), # number of trips (rows) in the pair
	    avg_trip_duration_sec = mean(Duration, na.rm = TRUE),
	    median_trip_duration_sec = median(Duration, na.rm = TRUE)
	  ) %>%
	  # summarize() only removes the last grouping variable; drop the remaining one
	  # so later verbs do not silently run once per start station.
	  ungroup() %>%
	  # No filter(!is.na(count)) step: n() never returns NA, so it removed nothing.
	  mutate(
	    avg_trip_min = avg_trip_duration_sec / 60, # seconds -> minutes
	    median_trip_min = median_trip_duration_sec / 60
	  )

	# Same per-pair summary as start_end_stats, but keyed by station names, with an
	# extra "<start> to <end>" label column (e.g. "Ford Field to MGM").
	start_end_names <- mogo_full_trip_list %>%
	  group_by(Start_Station_Name, End_Station_Name) %>%
	  summarize(
	    count = n(), # number of trips (rows) in the pair
	    avg_trip_duration_sec = mean(Duration, na.rm = TRUE),
	    median_trip_duration_sec = median(Duration, na.rm = TRUE)
	  ) %>%
	  # summarize() only removes the last grouping variable; drop the remaining one
	  # so later verbs do not silently run once per start station.
	  ungroup() %>%
	  # No filter(!is.na(count)) step: n() never returns NA, so it removed nothing.
	  mutate(
	    avg_trip_min = avg_trip_duration_sec / 60, # seconds -> minutes
	    median_trip_min = median_trip_duration_sec / 60
	  ) %>%
	  # remove = FALSE keeps the two original name columns next to the new label.
	  unite(
	    "Start.and.End.of.Trip",
	    Start_Station_Name:End_Station_Name,
	    sep = " to ",
	    remove = FALSE
	  )

	# Data for the bar chart: only start/end pairs with more than 1500 rides.
	bar_chart <- filter(start_end_names, count > 1500)

	# Turn the trip label into a factor whose levels are sorted by the median of
	# count per label, so the bars are drawn in count order instead of alphabetically.
	bar_chart[["Start.and.End.of.Trip"]] <- reorder(
	  bar_chart[["Start.and.End.of.Trip"]],
	  bar_chart[["count"]],
	  FUN = median
	)

	# Horizontal bar chart of ride counts for the start/end pairs kept in bar_chart.
	# NOTE(review): the title says "Top 25", but bar_chart is built with a
	# count > 1500 cutoff rather than a top-n selection - confirm the two agree.
	popular_locations_plot <-
	  ggplot(bar_chart, aes(x = Start.and.End.of.Trip, y = count)) +
	  # Fill by count itself. Mapping desc(count) fills by -count, which titles the
	  # legend "desc(count)" and fills it with negative ride numbers.
	  geom_col(aes(fill = count)) +
	  # ggplot2's default gradient with its two ends swapped, so the busiest routes
	  # keep the darkest bars.
	  scale_fill_gradient(
	    low = "#56B1F7", high = "#132B43", name = "Amount of Rides"
	  ) +
	  coord_flip() + # put the trip labels on the vertical axis
	  theme_bw() +
	  labs(
	    y = "Amount of Rides",
	    x = "Start and End Location of Each Trip",
	    title = "The Top 25 Most Popular Locations and Their Amount of Rides",
	    # Plain "|": a backslash before it is not a valid escape in an R string
	    # and stops the script from parsing.
	    caption = "Data from Stirista | Graph by Tej Seth"
	  ) +
	  theme(
	    axis.title = element_text(size = 12),
	    axis.text = element_text(size = 10),
	    plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
	    plot.subtitle = element_text(size = 14, hjust = 0.5),
	    plot.caption = element_text(size = 12)
	  ) +
	  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) # ~10 tidy breaks
	popular_locations_plot # display the chart
	# Hand the plot to ggsave() explicitly instead of relying on last_plot().
	ggsave("popular_locations.png", plot = popular_locations_plot, dpi = 300)

	# Label every trip by whether it ends at the station it started from and add
	# its duration in minutes. Trips whose label is NA (the id comparison could not
	# be evaluated) are dropped.
	mogo_full_trip_list <- mogo_full_trip_list %>%
	  mutate(
	    ends_same = if_else(
	      Start_Station_Id == End_Station_Id,
	      "Starts and Ends At Same Location",
	      "Starts and Ends At Different Location"
	    ),
	    minutes = Duration / 60
	  ) %>%
	  filter(!is.na(ends_same))

	# Treat rides of 50 minutes or more as outliers and leave them out.
	filtered_trip_list <- filter(mogo_full_trip_list, minutes < 50)

	# Boxplot of ride length in minutes, one box per ends_same category.
	p <- ggplot(
	  filtered_trip_list,
	  aes(x = minutes, y = ends_same, fill = ends_same)
	) +
	  geom_boxplot() +
	  scale_fill_viridis(alpha = 0.6, discrete = TRUE) +
	  theme_ipsum() +
	  labs(
	    title = "Rides That End at the Same Location Have More Volatility in Minutes",
	    x = "Minutes Per Ride",
	    y = "",
	    # Plain "|": a backslash before it is not a valid escape in an R string
	    # and stops the script from parsing.
	    caption = "Data from Stirista | Graph by Tej Seth"
	  ) +
	  theme(
	    plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
	    plot.subtitle = element_text(size = 14, hjust = 0.5),
	    plot.caption = element_text(size = 10),
	    axis.title = element_text(size = 14),
	    axis.text = element_text(size = 14)
	  )
	# fill and y show the same variable, so the legend adds nothing: hide it in the
	# displayed plot (p itself keeps its legend).
	p + theme(legend.position = "none")
	# No plot argument: ggsave() writes the most recent plot, i.e. the one above.
	ggsave("tide_type_minutes.png", dpi = 300)

	# Ride count and mean ride length for each pass type / ends_same combination,
	# keeping only combinations with more than 300 rides.
	ends_same_stats <- filter(filtered_trip_list, !is.na(Product_Name)) %>%
	  group_by(Product_Name, ends_same) %>%
	  summarize(count = n(), avg_min = mean(minutes)) %>%
	  filter(count > 300)

	# Stacked bar chart: rides per ends_same category, stacked by pass type.
	ggplot(ends_same_stats, aes(x = ends_same, y = count, fill = Product_Name)) +
	  geom_col(position = "stack") + # same as geom_bar(stat = "identity")
	  # TRUE rather than T: T is an ordinary variable that can be reassigned.
	  scale_fill_viridis(discrete = TRUE) +
	  theme_ipsum() +
	  labs(
	    x = "",
	    y = "Amount of Rides",
	    title = "Which Passes Are Being Used and Where?",
	    # Plain "|": a backslash before it is not a valid escape in an R string
	    # and stops the script from parsing.
	    caption = "Data from Stirista | Graph by Tej Seth"
	  ) +
	  theme(
	    axis.title = element_text(size = 14),
	    axis.text = element_text(size = 14),
	    plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
	    plot.subtitle = element_text(size = 14, hjust = 0.5),
	    plot.caption = element_text(size = 10)
	  ) +
	  scale_y_continuous(breaks = scales::pretty_breaks(n = 10))
	# No plot argument: ggsave() writes the most recent plot, i.e. the one above.
	ggsave("mogo_stack_bar.png", dpi = 300)

	# Ride count and mean ride length for each pass type at each start station,
	# listed from the most-used combination down.
	passes_at_locations <- filter(filtered_trip_list, !is.na(Product_Name)) %>%
	  group_by(Product_Name, Start_Station_Name) %>%
	  summarize(count = n(), avg_min = mean(minutes)) %>%
	  arrange(-count)

	# Per start station: ride count and mean ride length for "Annual Pass" rides.
	annual_passes <- filtered_trip_list %>%
	  filter(Product_Name %in% "Annual Pass") %>%
	  group_by(Start_Station_Name) %>%
	  summarise(annual_count = n(), annual_min = mean(minutes))

	# Per start station: ride count and mean ride length for "Daily Pass" rides.
	daily_pass <- filtered_trip_list %>%
	  filter(Product_Name %in% "Daily Pass") %>%
	  group_by(Start_Station_Name) %>%
	  summarise(daily_count = n(), daily_min = mean(minutes))

	# Put both summaries side by side. merge() keeps only the stations that occur
	# in both tables.
	annual_daily <- merge(
	  x = annual_passes,
	  y = daily_pass,
	  by = "Start_Station_Name"
	)