Skip to content

Instantly share code, notes, and snippets.

@tejseth
Created October 21, 2020 03:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tejseth/947f468661d91ca8d69e2e8d7f1daea5 to your computer and use it in GitHub Desktop.
Save tejseth/947f468661d91ca8d69e2e8d7f1daea5 to your computer and use it in GitHub Desktop.
#install.packages("tidyverse")
#install.packages("forcats")
#install.packages("hrbrthemes")
#install.packages("viridis")
#install.packages("ggrepel")
#Remove the leading '#' from the lines above to install the packages. They are commented out in case you've already installed them.
#Tidyverse is the main one you need; the others are just for design.
library(tidyverse)
library(forcats)
library(hrbrthemes)
library(viridis)
library(ggrepel)
#You have to load the packages every time you fire up R. It takes a couple seconds
#Import the data set by clicking on "Environment", then "Import Dataset", then pick "From Text (base)" and set headers to yes
#Make sure to rename the data set to "mogo_full_trip_list" if you want to follow along!
# Ride statistics for every start/end station pair, keyed by station id.
# Requires the imported `mogo_full_trip_list` data frame.
# Assumes `Duration` is recorded in seconds, as the `_sec` column names
# suggest -- TODO confirm against the data source.
start_end_stats <- mogo_full_trip_list %>%
  # one group per start/end station combination
  group_by(Start_Station_Id, End_Station_Id) %>%
  summarize(
    # number of rides for this pair; n() is never NA, so no NA filter is needed
    count = n(),
    # mean and median ride duration, ignoring missing durations
    avg_trip_duration_sec = mean(Duration, na.rm = TRUE),
    median_trip_duration_sec = median(Duration, na.rm = TRUE),
    # same regrouping summarize() applies by default (the result stays grouped
    # by Start_Station_Id), stated explicitly so no message is printed
    .groups = "drop_last"
  ) %>%
  # mutate() adds new columns: the same durations divided by 60
  mutate(
    avg_trip_min = avg_trip_duration_sec / 60,
    median_trip_min = median_trip_duration_sec / 60
  )
# Ride statistics for every start/end station pair, keyed by station NAME
# instead of station id so the results are easier to read.
# Requires the imported `mogo_full_trip_list` data frame.
# Assumes `Duration` is recorded in seconds, as the `_sec` column names
# suggest -- TODO confirm against the data source.
start_end_names <- mogo_full_trip_list %>%
  group_by(Start_Station_Name, End_Station_Name) %>%
  summarize(
    # number of rides for this pair; n() is never NA, so no NA filter is needed
    count = n(),
    avg_trip_duration_sec = mean(Duration, na.rm = TRUE),
    median_trip_duration_sec = median(Duration, na.rm = TRUE),
    # same regrouping summarize() applies by default (the result stays grouped
    # by Start_Station_Name), stated explicitly so no message is printed
    .groups = "drop_last"
  ) %>%
  mutate(
    avg_trip_min = avg_trip_duration_sec / 60,
    median_trip_min = median_trip_duration_sec / 60
  ) %>%
  # unite() pastes the start and end station names together with " to ".
  # For example, start "Ford Field" and end "MGM" become "Ford Field to MGM".
  # remove = FALSE keeps the two original name columns as well.
  unite(
    "Start.and.End.of.Trip",
    Start_Station_Name:End_Station_Name,
    remove = FALSE,
    sep = " to "
  )
# Data for the bar chart: only the station pairs with more than 1500 rides.
# NOTE(review): the plot title below says "Top 25", but nothing here enforces
# 25 rows -- the number of bars depends on how many pairs pass this threshold.
bar_chart <- start_end_names %>%
  filter(count > 1500)

# Turn the trip label into a factor ordered by ride count so the bars come
# out sorted by count instead of alphabetically.
bar_chart$Start.and.End.of.Trip <- with(
  bar_chart,
  reorder(Start.and.End.of.Trip, count, median)
)

# And now we can make our first bar chart!
# The plot is stored in a variable so the exact object that is displayed is
# also the one handed to ggsave(), instead of relying on last_plot().
popular_locations_plot <- ggplot(
  bar_chart, # "bar_chart" is the data set we just made
  aes(x = Start.and.End.of.Trip, y = count)
) +
  # geom_col() draws bars whose height is the y value itself
  # (equivalent to geom_bar(stat = "identity"))
  geom_col(aes(fill = desc(count))) +
  coord_flip() + # coord_flip flips the bars onto the y-axis
  theme_bw() +
  labs( # labs stands for 'labels'
    y = "Amount of Rides",
    x = "Start and End Location of Each Trip",
    title = "The Top 25 Most Popular Locations and Their Amount of Rides",
    caption = "Data from Stirista | Graph by Tej Seth"
  ) +
  theme(
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    # face = "bold" makes the title font bold
    plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(size = 14, hjust = 0.5),
    plot.caption = element_text(size = 12)
  ) +
  # make your number labels pretty!
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10))

print(popular_locations_plot) # show the chart

# saves the image to your computer
ggsave("popular_locations.png", plot = popular_locations_plot, dpi = 300)
# Label every ride by whether it ends at the station it started from, add the
# ride length in minutes (Duration / 60), and drop rides whose label is NA
# (i.e. the station ids could not be compared).
mogo_full_trip_list <- mogo_full_trip_list %>%
  mutate(
    ends_same = if_else(
      Start_Station_Id == End_Station_Id,
      "Starts and Ends At Same Location",
      "Starts and Ends At Different Location"
    ),
    minutes = Duration / 60
  ) %>%
  filter(!is.na(ends_same))

# Treat rides of 50 minutes or more as outliers and leave them out.
filtered_trip_list <- filter(mogo_full_trip_list, minutes < 50)
# Making a boxplot!
# Horizontal boxplots of ride length in minutes, one box per `ends_same`
# category, built from `filtered_trip_list`.
p <- filtered_trip_list %>%
  ggplot(aes(y = ends_same, x = minutes, fill = ends_same)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  theme_ipsum() +
  labs(
    x = "Minutes Per Ride",
    y = "",
    title = "Rides That End at the Same Location Have More Volatility in Minutes",
    caption = "Data from Stirista | Graph by Tej Seth"
  ) +
  theme(
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 14),
    plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(size = 14, hjust = 0.5),
    plot.caption = element_text(size = 10)
  )

# Hide the legend: the fill colours repeat what the y-axis labels already say.
# The legend-free version is stored so that the plot shown on screen and the
# plot written to disk are explicitly the same object (previously ggsave()
# picked it up implicitly through last_plot()).
p_no_legend <- p + theme(legend.position = "none")
print(p_no_legend)

# NOTE(review): "tide_type_minutes.png" looks like a typo for "ride_type";
# the file name is left unchanged so existing references to it keep working.
ggsave("tide_type_minutes.png", plot = p_no_legend, dpi = 300)
# Ride count and mean ride length for each pass type (Product_Name), split by
# whether the ride ended where it started. Rides without a pass type are
# excluded, and only combinations with more than 300 rides are kept.
ends_same_stats <- filtered_trip_list %>%
  filter(!is.na(Product_Name)) %>%
  group_by(Product_Name, ends_same) %>%
  summarize(
    count = n(),
    avg_min = mean(minutes),
    # same regrouping summarize() applies by default (the result stays grouped
    # by Product_Name), stated explicitly so no message is printed
    .groups = "drop_last"
  ) %>%
  filter(count > 300)

# Making a stacked bar chart!
# One bar per `ends_same` category, stacked by pass type. The plot is stored
# in a variable so the exact object that is displayed is also the one handed
# to ggsave(), instead of relying on last_plot().
pass_usage_plot <- ggplot(
  ends_same_stats,
  aes(fill = Product_Name, y = count, x = ends_same)
) +
  # geom_col() uses the y value as the bar height
  # (equivalent to geom_bar(stat = "identity"))
  geom_col(position = "stack") +
  scale_fill_viridis(discrete = TRUE) +
  theme_ipsum() +
  labs(
    x = "",
    y = "Amount of Rides",
    title = "Which Passes Are Being Used and Where?",
    caption = "Data from Stirista | Graph by Tej Seth"
  ) +
  theme(
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 14),
    plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(size = 14, hjust = 0.5),
    plot.caption = element_text(size = 10)
  ) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10))

print(pass_usage_plot) # show the chart

ggsave("mogo_stack_bar.png", plot = pass_usage_plot, dpi = 300)
# Ride count and mean ride length for each pass type at each start station,
# sorted from the most rides to the fewest. Rides without a pass type are
# left out.
passes_at_locations <- filtered_trip_list %>%
  filter(!is.na(Product_Name)) %>%
  group_by(Product_Name, Start_Station_Name) %>%
  summarize(count = n(), avg_min = mean(minutes)) %>%
  arrange(desc(count))

# Per-start-station ride count and mean minutes, "Annual Pass" rides only.
annual_passes <- filtered_trip_list %>%
  filter(Product_Name == "Annual Pass") %>%
  group_by(Start_Station_Name) %>%
  summarise(annual_count = n(), annual_min = mean(minutes))

# Per-start-station ride count and mean minutes, "Daily Pass" rides only.
daily_pass <- filtered_trip_list %>%
  filter(Product_Name == "Daily Pass") %>%
  group_by(Start_Station_Name) %>%
  summarise(daily_count = n(), daily_min = mean(minutes))

# Put the annual and daily columns side by side, matching rows on the start
# station name the two tables share.
annual_daily <- merge(
  x = annual_passes,
  y = daily_pass,
  by = "Start_Station_Name"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment