Created
October 21, 2020 03:45
-
-
Save tejseth/947f468661d91ca8d69e2e8d7f1daea5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#install.packages("tidyverse") | |
#install.packages("forcats") | |
#install.packages("hrbrthemes") | |
#install.packages("viridis") | |
#install.packages("ggrepel") | |
# Remove the leading '#' from the lines above to install the packages. They are commented out in case you've already installed them.
# Tidyverse is the main one you need; the others are just for design.
library(tidyverse) | |
library(forcats) | |
library(hrbrthemes) | |
library(viridis) | |
library(ggrepel) | |
# You have to load the packages every time you start a new R session. It only takes a couple of seconds.
# To import the data set, click on "Environment", then "Import Dataset", pick "From Text (base)", and set headers to yes.
# Make sure to name the data set "mogo_full_trip_list" if you want to follow along!
# Ride count and trip-duration summaries for every (start station, end station)
# ID pair. `Duration` is assumed to be in seconds, as the *_sec column names
# imply; the *_min columns are those values divided by 60.
# n() always returns a count, so `count` never needs an NA filter.
# NOTE(review): if trips with a missing station ID should be excluded, filter
# on the ID columns before grouping.
start_end_stats <- mogo_full_trip_list %>%
  group_by(Start_Station_Id, End_Station_Id) %>%
  summarize(
    count = n(), # number of rides for this station pair
    avg_trip_duration_sec = mean(Duration, na.rm = TRUE),
    median_trip_duration_sec = median(Duration, na.rm = TRUE)
  ) %>%
  # summarize() only peels off the last grouping level; drop the remaining
  # grouping so later steps do not silently run once per start station.
  ungroup() %>%
  mutate(
    avg_trip_min = avg_trip_duration_sec / 60,
    median_trip_min = median_trip_duration_sec / 60
  )
# Same summary as the station-ID version, but keyed by station *names* so the
# results are easier to read. `Duration` is assumed to be in seconds, as the
# *_sec column names imply; the *_min columns are those values divided by 60.
# n() always returns a count, so `count` never needs an NA filter.
# NOTE(review): if trips with a missing station name should be excluded,
# filter on the name columns before grouping.
start_end_names <- mogo_full_trip_list %>%
  group_by(Start_Station_Name, End_Station_Name) %>%
  summarize(
    count = n(), # number of rides for this station pair
    avg_trip_duration_sec = mean(Duration, na.rm = TRUE),
    median_trip_duration_sec = median(Duration, na.rm = TRUE)
  ) %>%
  # summarize() only peels off the last grouping level; drop the remaining
  # grouping so later steps do not silently run once per start station.
  ungroup() %>%
  mutate(
    avg_trip_min = avg_trip_duration_sec / 60,
    median_trip_min = median_trip_duration_sec / 60
  )
# Build a readable route label by joining the start and end station names with
# " to ", e.g. "Ford Field" and "MGM" become "Ford Field to MGM".
# remove = FALSE keeps the original name columns next to the new label.
start_end_names <- unite(
  start_end_names,
  col = "Start.and.End.of.Trip",
  Start_Station_Name:End_Station_Name,
  sep = " to ",
  remove = FALSE
)
# Data for the bar chart: only routes with more than 1500 rides.
# NOTE(review): the chart title says "Top 25"; the 1500 cutoff presumably
# yields 25 routes for this data set - confirm.
bar_chart <- filter(start_end_names, count > 1500)
# Convert the route label to a factor whose levels are sorted by ride count, so
# the bars are drawn in count order instead of alphabetically. The column is
# assigned directly (not via mutate) so the ordering is computed over the whole
# table even if it is still grouped.
bar_chart$Start.and.End.of.Trip <- reorder(
  bar_chart$Start.and.End.of.Trip,
  bar_chart$count,
  FUN = median
)
# First bar chart: ride counts for the most popular routes, drawn as
# horizontal bars. The plot is stored in a variable so that exactly this
# figure is written to disk, instead of relying on ggsave()'s implicit
# "last plot" default.
popular_locations_plot <- ggplot(
  bar_chart, # the filtered route data built for this chart
  aes(x = Start.and.End.of.Trip, y = count)
) +
  # geom_col() takes bar lengths straight from `count`
  # (equivalent to geom_bar(stat = "identity")).
  # NOTE(review): fill uses desc(count), i.e. the negated count, so the legend
  # is titled "desc(count)" and shows negative values - consider hiding or
  # relabelling it.
  geom_col(aes(fill = desc(count))) +
  coord_flip() + # flip so the route labels read horizontally on the y-axis
  theme_bw() +
  labs(
    y = "Amount of Rides",
    x = "Start and End Location of Each Trip",
    title = "The Top 25 Most Popular Locations and Their Amount of Rides",
    caption = "Data from Stirista | Graph by Tej Seth"
  ) +
  theme(
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(size = 14, hjust = 0.5),
    plot.caption = element_text(size = 12)
  ) +
  # ask for roughly 10 evenly spaced, round-numbered axis breaks
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10))
popular_locations_plot # display the chart
# save the image to the working directory
ggsave("popular_locations.png", plot = popular_locations_plot, dpi = 300)
# Label every trip by whether it returns to the station it started from, then
# add the trip length in minutes (Duration / 60).
mogo_full_trip_list <- mogo_full_trip_list %>%
  mutate(
    ends_same = if_else(
      Start_Station_Id == End_Station_Id,
      "Starts and Ends At Same Location",
      "Starts and Ends At Different Location"
    )
  ) %>%
  # the comparison is NA when either station ID is missing; drop those trips
  filter(!is.na(ends_same)) %>%
  mutate(minutes = Duration / 60)
# Remove outliers: keep only rides shorter than 50 minutes. Rows with a missing
# `minutes` value are dropped as well, because filter() discards NA conditions.
filtered_trip_list <- filter(mogo_full_trip_list, minutes < 50)
# Boxplot of ride length in minutes, split by whether the ride ends at the
# station where it started.
p <- ggplot(
  filtered_trip_list,
  aes(x = minutes, y = ends_same, fill = ends_same)
) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  theme_ipsum() +
  labs(
    x = "Minutes Per Ride",
    y = "",
    title = "Rides That End at the Same Location Have More Volatility in Minutes",
    caption = "Data from Stirista | Graph by Tej Seth"
  ) +
  theme(
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 14),
    plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(size = 14, hjust = 0.5),
    plot.caption = element_text(size = 10)
  )
# The fill colour repeats what the y-axis already shows, so hide the legend.
ride_boxplot <- p + theme(legend.position = "none")
ride_boxplot # display the chart
# Pass the plot explicitly so exactly this figure is saved, instead of relying
# on ggsave()'s implicit "last plot" default.
# NOTE(review): "tide_type" looks like a typo for "ride_type"; the file name is
# left unchanged in case something else already refers to it.
ggsave("tide_type_minutes.png", plot = ride_boxplot, dpi = 300)
# Ride count and average ride length (minutes) for each combination of pass
# type (Product_Name) and same/different end location. Trips without a pass
# type are excluded, and only combinations with more than 300 rides are kept.
ends_same_stats <- filtered_trip_list %>%
  filter(!is.na(Product_Name)) %>%
  group_by(Product_Name, ends_same) %>%
  summarize(
    count = n(),
    avg_min = mean(minutes)
  ) %>%
  # summarize() only peels off the last grouping level; drop the remaining
  # Product_Name grouping so the result is a plain table.
  ungroup() %>%
  filter(count > 300)
# Stacked bar chart: number of rides per same/different end location, with
# each bar split by pass type (Product_Name). The plot is stored in a variable
# so that exactly this figure is written to disk, instead of relying on
# ggsave()'s implicit "last plot" default.
mogo_stack_bar_plot <- ggplot(
  ends_same_stats,
  aes(x = ends_same, y = count, fill = Product_Name)
) +
  # geom_col() takes bar heights straight from `count`
  # (equivalent to geom_bar(stat = "identity")).
  geom_col(position = "stack") +
  scale_fill_viridis(discrete = TRUE) +
  theme_ipsum() +
  labs(
    x = "",
    y = "Amount of Rides",
    title = "Which Passes Are Being Used and Where?",
    caption = "Data from Stirista | Graph by Tej Seth"
  ) +
  theme(
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 14),
    plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(size = 14, hjust = 0.5),
    plot.caption = element_text(size = 10)
  ) +
  # ask for roughly 10 evenly spaced, round-numbered axis breaks
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10))
mogo_stack_bar_plot # display the chart
ggsave("mogo_stack_bar.png", plot = mogo_stack_bar_plot, dpi = 300)
# Ride count and average ride length (minutes) for each pass type
# (Product_Name) at each start station, sorted from most to fewest rides.
# Trips without a pass type are excluded.
passes_at_locations <- filtered_trip_list %>%
  filter(!is.na(Product_Name)) %>%
  group_by(Product_Name, Start_Station_Name) %>%
  summarize(
    count = n(),
    avg_min = mean(minutes)
  ) %>%
  # summarize() only peels off the last grouping level; drop the remaining
  # Product_Name grouping so the result is a plain table.
  ungroup() %>%
  arrange(desc(count))
# Annual-pass rides only: ride count and average ride length (minutes) per
# start station.
annual_passes <- filtered_trip_list %>%
  filter(Product_Name == "Annual Pass") %>%
  group_by(Start_Station_Name) %>%
  summarise(
    annual_count = n(),
    annual_min = mean(minutes)
  )
# Daily-pass rides only: ride count and average ride length (minutes) per
# start station.
daily_pass <- filtered_trip_list %>%
  filter(Product_Name == "Daily Pass") %>%
  group_by(Start_Station_Name) %>%
  summarise(
    daily_count = n(),
    daily_min = mean(minutes)
  )
# Combine the annual-pass and daily-pass summaries side by side, matching rows
# on the start station name. merge() defaults to an inner join, so only
# stations present in both tables are kept.
annual_daily <- merge(
  x = annual_passes,
  y = daily_pass,
  by = "Start_Station_Name"
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment