Skip to content

Instantly share code, notes, and snippets.

@ashomah
Last active January 21, 2019 19:00
Show Gist options
  • Save ashomah/a4d041e1965c4b41e783228a9c43fc06 to your computer and use it in GitHub Desktop.
Save ashomah/a4d041e1965c4b41e783228a9c43fc06 to your computer and use it in GitHub Desktop.
### We have selected a dataset about statistics of the Tour de France results, from 1903 to 2016.
### The dataset has been selected from this list: [Tour de France Dataset](http://www.makeovermonday.co.uk/data/data-sets-2017/).
### All documents of this project can be found on GitHub: [Assignment 01 - GitHub](https://github.com/ashomah/Data-Vizualization-Course/tree/master/Assignment%2001%20-%20R).
### The code can be found on GitHub as a Gist: [Assignment 01 - Gist](https://gist.github.com/ashomah/a4d041e1965c4b41e783228a9c43fc06).
# INSTALL AND LOAD PACKAGES ----
# Packages attached by this script.
packages_list <- c(
  'ggplot2',
  'ggalt',
  'gridExtra',
  'scales',
  'grid',
  'lattice',
  'ggthemes',
  'extrafont',
  'plotly',
  'plyr',
  'leaflet',
  'maps'
)

# Install whatever is missing, then attach every package in the list.
for (pkg in packages_list) {
  # Compare against package names only (the row names of the matrix returned
  # by installed.packages()). Matching against the whole matrix could also hit
  # an unrelated cell, e.g. another package's dependency field.
  if (!(pkg %in% rownames(installed.packages()))) {
    install.packages(pkg, dependencies = TRUE, repos = "http://cran.us.r-project.org")
    library(pkg, character.only = TRUE)
    print(paste0(pkg, ' has been installed'))
  } else {
    print(paste0(pkg, ' is already installed'))
    library(pkg, character.only = TRUE)
  }
}
# READ DATASET FROM GIST AND ADD VARIABLES ----
# Read the comma-separated dataset (with a header row) from its raw Gist URL.
tour_de_france <- read.csv(
  "https://gist.githubusercontent.com/ashomah/e7c6f1e6c519b5eb301b8b51c00071f0/raw/3e4347bd5ab5ee3536870fc87ff498d97b546fc9/Tour_de_France_Dataset",
  sep = ',',
  header = TRUE
)

# Add Duration: parse both dates (day/month/year) and take their difference
tour_de_france$Start.Date <- as.Date(tour_de_france$Start.Date, format = '%d/%m/%Y')
tour_de_france$End.Date <- as.Date(tour_de_france$End.Date, format = '%d/%m/%Y')
tour_de_france$Duration <- tour_de_france$End.Date - tour_de_france$Start.Date

# Add Distance per Stage
tour_de_france$Distance_per_Stage <- tour_de_france$Total.distance..km. / tour_de_france$Number.of.stages

# Add Withdrawal
tour_de_france$Withdrawal <- tour_de_france$Entrants - tour_de_france$Finishers

# Add Withdrawal Rate
tour_de_france$Withdrawal_Rate <- tour_de_france$Withdrawal / tour_de_france$Entrants

# Add Variable - "group": name of the map icon colour, derived from the
# "Total distance km" column: 'yellow' for (0, 4000] km, 'red' above 4000 km.
# The upper break is open-ended so that no distance above 4000 km is left
# without a group (NA), in line with the "> 4,000 Kms" legend of the map.
tour_de_france$group <- cut(
  as.numeric(tour_de_france$Total.distance..km.),
  breaks = c(0, 4000, Inf),
  labels = c('yellow', 'red')
)

# Create new data-frame with the frequency ('freq') of each race starting point
# (used in maps). plyr::count is called explicitly so that the result does not
# depend on which attached package currently provides `count`.
dist <- plyr::count(
  tour_de_france[, c("Starting.city.Longitude", "Starting.city.Latitude", "Starting.city")]
)
# COLOR PALETTE AND FONTS ----
# The charts reuse the colours of the Tour de France logo.
# Palette 1
color1 <- 'black'
color2 <- 'white'
color3 <- 'gold1'
color4 <- 'darkorchid3'

# Fonts
font1 <- 'Impact'
font2 <- 'Helvetica'

# Numeric colour scale running from color3 to color4 over the start-city
# frequencies stored in 'dist' (used in the frequency map)
pal <- colorNumeric(palette = c(color3, color4), domain = dist$freq)

# Map icons: one 20 x 20 icon per colour, built from its image URL
tdfIcons <- do.call(
  iconList,
  lapply(
    c(
      red = "https://github.com/ashomah/Data-Vizualization-Course/blob/master/Assignment%2001%20-%20R/Map%20Icons/red.png?raw=true",
      yellow = "https://github.com/ashomah/Data-Vizualization-Course/blob/master/Assignment%2001%20-%20R/Map%20Icons/yellow.png?raw=true",
      green = "https://github.com/ashomah/Data-Vizualization-Course/blob/master/Assignment%2001%20-%20R/Map%20Icons/green.png?raw=true",
      blue = "https://github.com/ashomah/Data-Vizualization-Course/blob/master/Assignment%2001%20-%20R/Map%20Icons/blue.png?raw=true"
    ),
    function(icon_url) makeIcon(icon_url, iconWidth = 20, iconHeight = 20)
  )
)
# 1. MAPS ----
# These maps visualise the starting coordinates of each Tour de France race on the European map.

# Add to `map` one marker per row of `data` whose `group` equals `group_value`.
#   map         leaflet map the marker layer is added to
#   data        data frame holding the columns read below
#   group_value value of data$group selecting the rows ('red' or 'yellow')
#   icon        icon used for every marker of the layer
#   layer_name  name of the layer group (referenced by addLayersControl)
# Returns the map with the layer added, or the map unchanged when no row matches.
# Change `title` in markerOptions() to change what is shown on hover, and
# `popup` to change the stats shown on click.
add_tdf_markers <- function(map, data, group_value, icon, layer_name) {
  # which() ignores NA comparisons, so rows without a group are skipped
  # instead of being turned into all-NA rows by the logical subsetting.
  rows <- data[which(data$group == group_value), , drop = FALSE]
  if (nrow(rows) == 0) {
    return(map)
  }

  popup_html <- paste(
    paste0("Start City = ", rows[["Starting.city"]]),
    paste0("Start Date = ", rows[["Start.Date"]]),
    paste0("End Date = ", rows[["End.Date"]]),
    paste0("Total Kms = ", rows[["Total.distance..km."]]),
    paste0("Winner = ", rows[["Winner"]],
           " (", rows[["Winner.s.Team"]],
           " | ", rows[["Winner.s.Nationality"]], ")"),
    sep = "<br/>"
  )

  addMarkers(
    map,
    lng = rows[["Starting.city.Longitude"]],
    lat = rows[["Starting.city.Latitude"]],
    popup = popup_html,
    options = markerOptions(interactive = TRUE, title = rows[["Year"]], riseOnHover = TRUE),
    icon = icon,
    group = layer_name
  )
}

## Map A : location and stats of each TDF event, one marker layer per distance group
mapA <- leaflet(tour_de_france) %>%
  addProviderTiles(providers$CartoDB.DarkMatter) %>%
  add_tdf_markers(tour_de_france, "red", tdfIcons$red, "Red Icons") %>%
  add_tdf_markers(tour_de_france, "yellow", tdfIcons$yellow, "Yellow Icons") %>%
  addLegend(title = "Starting points of Tour de France events (basis Total distance of the event)",
            position = "bottomleft",
            labels = c("Total Distance < 4,000 Kms", "Total Distance > 4,000 Kms"),
            colors = c("Yellow", "red")) %>%
  addScaleBar(position = "topleft") %>%
  addLayersControl(overlayGroups = c("Red Icons", "Yellow Icons"),
                   position = "bottomright",
                   options = layersControlOptions(collapsed = FALSE))
mapA
## Map B : how often each city has been the starting point
# Base map on dark tiles
mapB <- addProviderTiles(leaflet(dist), providers$CartoDB.DarkMatter)

# One circle per start city: fill and stroke colour come from the frequency
# palette; opacity and stroke weight are computed from the frequency.
mapB <- addCircleMarkers(
  mapB,
  lng = dist[["Starting.city.Longitude"]],
  lat = dist[["Starting.city.Latitude"]],
  radius = 8,
  fillColor = ~ pal(freq),
  fillOpacity = (20 * dist$freq / 100),
  stroke = TRUE,
  color = ~ pal(freq),
  weight = 2 * log(dist$freq),
  popup = paste0("Start City : ", dist$Starting.city, "<br/>", "Freq : ", dist$freq)
)

# Legend built from the same palette
mapB <- addLegend(
  mapB,
  title = "Density of starting points for TDF events",
  position = "bottomleft",
  pal = pal,
  labels = c(1, 40),
  bins = 5,
  values = ~ as.numeric(dist$freq)
)
mapB
# 2. BASIC CHARTS - TIME SERIES ----
# These first charts aim to show the evolution of the race settings over time.
# The Total Distance has decreased, year after year, while the number of stages has increased, leading the Average Distance per Stage to decrease even more.
# This increased the energy the riders can deploy during a stage, improving the overall race speed.

# Build one bare line chart of `column` against Year, with a text label placed
# below the line near the right-hand side of the panel.
#   data        data frame with a `Year` column and the column named in `column`
#   column      name (string) of the column drawn on the y axis
#   label       text written inside the panel
#   line_color  colour of the line and of the label
#   background  fill and outline colour of the plot background
#   show_years  when TRUE, keep the x axis text and draw the year axis on top
#   label_font, axis_color, axis_font
#               styling of the label and of the year axis; they default to the
#               script-level font1 / color2 / font2
# Returns the ggplot object.
tdf_trend_plot <- function(data, column, label, line_color, background,
                           show_years = FALSE,
                           label_font = font1, axis_color = color2, axis_font = font2) {
  plot_data <- data.frame(Year = data[["Year"]], value = data[[column]])
  value_range <- range(plot_data$value, na.rm = TRUE)
  value_span <- value_range[2] - value_range[1]
  year_range <- range(plot_data$Year, na.rm = TRUE)

  # Only the chart carrying the year axis keeps its x axis text
  axis_text_theme <- if (show_years) {
    theme(axis.text.y = element_blank(),
          axis.text.x = element_text(color = axis_color, family = axis_font))
  } else {
    theme(axis.text = element_blank())
  }

  trend <- ggplot(plot_data, aes(x = Year, y = value)) +
    geom_line(color = line_color) +
    theme_minimal() +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          axis.title = element_blank(),
          axis.ticks = element_blank(),
          plot.background = element_rect(fill = background, color = background),
          legend.position = 'none') +
    axis_text_theme +
    # Extend the y axis by 2/10 of the value range below the minimum to make room for the label
    expand_limits(y = value_range[1] - value_span * 2 / 10) +
    # Label sits 1/10 of the value range below the minimum, 1/8 of the year range left of the last year
    annotate('text',
             label = label,
             family = label_font,
             color = line_color,
             x = year_range[2] - (year_range[2] - year_range[1]) / 8,
             y = value_range[1] - value_span / 10,
             size = 4)

  if (show_years) {
    trend <- trend +
      scale_x_continuous(breaks = c(1903, 1920, 1940, 1960, 1980, 2000, 2016), position = 'top')
  }
  trend
}

# Total Distance per Year (carries the year axis for the whole stack)
plot_total_distance <- tdf_trend_plot(tour_de_france, 'Total.distance..km.', 'Total Distance',
                                      line_color = color3, background = color1,
                                      show_years = TRUE)

# Number of Stages per Year
plot_stages <- tdf_trend_plot(tour_de_france, 'Number.of.stages', 'Number of Stages',
                              line_color = color3, background = color1)

# Average Distance per Stage per Year
plot_distance_per_stage <- tdf_trend_plot(tour_de_france, 'Distance_per_Stage', 'Distance per Stage',
                                          line_color = color3, background = color1)

# Winner's Average Speed per Year (inverted colours)
plot_winner_avg_speed <- tdf_trend_plot(tour_de_france, 'Winner.s.avg.speed', 'Winner\'s Average Speed',
                                        line_color = color1, background = color3)

# Plot: stack the four charts in a single column
grid.arrange(plot_total_distance,
             plot_stages,
             plot_distance_per_stage,
             plot_winner_avg_speed,
             nrow = 4,
             ncol = 1)
# 3. DUMBBELL CHART ----
# This chart compares the number of Entrants and the number of Finishers over time.
# Dumbbell Chart
# Columns are referenced by bare name inside aes() so they are looked up in
# the plot data rather than read from the global data frame with `$`.
ggplot(tour_de_france, aes(x = Finishers,
                           xend = Entrants,
                           y = Year,
                           group = Year)) +
  geom_dumbbell(colour = color2, colour_x = color2, size = 0.2, colour_xend = color3, size_xend = 1, dot_guide = FALSE, size_x = 1) +
  labs(x = NULL, y = NULL) +
  theme_tufte() +
  theme(axis.text.y = element_text(colour = color2, size = 8, family = font2),
        axis.text.x = element_text(colour = color2, size = 8, family = font2),
        axis.ticks = element_blank(),
        plot.title = element_text(color = color3, size = 14),
        plot.background = element_rect(fill = color1)
  ) +
  # Years are mapped to y and then flipped onto the horizontal axis
  scale_y_continuous(breaks = c(1903, 1920, 1940, 1960, 1980, 2000, 2016)) +
  coord_flip()

# Titles
# Number of padding spaces; read by the title of the waffle chart further down
spacing <- 10
# Each label uses the colour of the matching dumbbell end
# (Finishers = x end, Entrants = xend end)
grid.text(label = "Finishers",
          x = unit(0.8, 'npc'), y = unit(0.165, "npc"),
          check.overlap = TRUE, just = "left",
          gp = gpar(col = color2, fontsize = 16, fontface = "bold", fontfamily = font1))
grid.text(label = "Entrants",
          x = unit(0.8, 'npc'), y = unit(0.2, "npc"),
          check.overlap = TRUE, just = "left",
          gp = gpar(col = color3, fontsize = 16, fontface = "bold", fontfamily = font1))
# 4. WAFFLE ----
# This chart shows the proportion of wins for the top 3 countries. Other countries have been grouped under the label *Others*.
# Waffle Data Preparation
winners_nationality <- as.character(tour_de_france$Winner.s.Nationality)
winners_nationality[!(winners_nationality %in% c('France', 'Belgium', 'Spain'))] <- 'Others'

# The waffle is a nrows x nrows grid of tiles
nrows <- 10
df <- expand.grid(y = 1:nrows, x = 1:nrows)

# Share the tiles between the categories in proportion to their wins.
# Rounding each share on its own does not always add up to the number of tiles
# (which makes the assignment to df$category fail), so the shares are rounded
# down and the remaining tiles go to the largest fractional parts.
categ_table <- table(winners_nationality)
tile_total <- nrows * nrows
exact_share <- as.numeric(categ_table) * tile_total / length(winners_nationality)
tile_count <- floor(exact_share)
shortfall <- tile_total - sum(tile_count)
if (shortfall > 0) {
  top_up <- order(exact_share - tile_count, decreasing = TRUE)[seq_len(shortfall)]
  tile_count[top_up] <- tile_count[top_up] + 1
}
categ_table[] <- tile_count

# Order in which the categories fill the grid (selected by name, not by position)
categ_table <- categ_table[intersect(c('France', 'Belgium', 'Spain', 'Others'), names(categ_table))]
df$category <- factor(rep(names(categ_table), categ_table))

# Plot
# NOTE(review): `values` is unnamed, so which colour each country receives is
# decided by ggplot2's matching rules, which may differ between ggplot2
# versions - confirm the intended colour per country and name the vector.
ggplot(df, aes(x = x, y = y, fill = category)) +
  geom_tile(color = "black", size = 0.5) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), trans = 'reverse') +
  scale_fill_manual(values = c('orange', 'gold1', 'darkorange4', 'darkorange3'),
                    breaks = c('France', 'Belgium', 'Spain', 'Others'),
                    labels = c('France', 'Belgium', 'Spain', 'Others')) +
  theme(title = element_text(),
        legend.position = 'right',
        legend.background = element_rect(fill = 'black'),
        legend.key = element_rect(fill = 'black', color = 'black'),
        legend.box.background = element_rect(fill = 'black', color = 'black'),
        legend.title = element_blank(),
        legend.text = element_text(margin = margin(r = 10), color = 'white', family = font2),
        legend.spacing.x = unit(5, 'pt'),
        axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.background = element_rect(fill = 'black', color = 'black'),
        plot.background = element_rect(fill = 'black', color = 'black'),
        # Larger bottom margin leaves room for the title drawn below
        plot.margin = unit(c(5.5, 5.5, 50, 5.5), 'point'))

# Title, left-padded with `spacing` spaces
spacing <- 10
grid.text(label = paste(paste(rep(" ", spacing), collapse = ''), "Wins per Country"),
          x = unit(0.68, 'npc'), y = unit(0.05, "npc"),
          check.overlap = TRUE, just = "left",
          gp = gpar(col = color3, fontsize = 16, fontface = "bold", fontfamily = font1))
# 5. SMALL MULTIPLE WITH TUFTE THEME ----
# One panel per winner's nationality, with a bar on every year that nationality won.

# Background data: every year, without the nationality column used for faceting
tdf_no_nationality <- tour_de_france[c('Year', 'Winner.s.avg.speed')]

# Look of the panels: Tufte theme on the dark background, no y axis text
small_multiple_theme <- theme_tufte(ticks = FALSE, base_size = 15) +
  theme(
    axis.text.y = element_blank(),
    axis.text.x = element_text(color = color2, family = font2, size = 6),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title = element_blank(),
    plot.background = element_rect(fill = color1),
    strip.text = element_text(color = color2, family = font2, size = 8),
    panel.spacing = unit(2, 'lines')
  )

# Plot: faint bars for all years first, then the wins of the panel's nationality on top
ggplot(tour_de_france, aes(x = Year, y = 1)) +
  geom_bar(data = tdf_no_nationality, stat = 'identity', alpha = 0.1, fill = color2, width = 1) +
  geom_bar(stat = 'identity', fill = color3, width = 1) +
  facet_wrap(~ Winner.s.Nationality, scales = 'free') +
  scale_x_continuous(breaks = c(1903, 2016)) +
  small_multiple_theme

# Title, right-aligned and followed by `spacing` padding spaces
spacing <- 10
grid.text(
  label = paste("Wins by Nationality", strrep(" ", spacing)),
  x = unit(1, 'npc'),
  y = unit(0.1, "npc"),
  just = "right",
  check.overlap = TRUE,
  gp = gpar(col = color3, fontsize = 16, fontface = "bold", fontfamily = font1)
)
# 6. TUFTE CHART - BOXPLOTS ----
# Distribution of the Winner's Average Speed for each Nationality,
# with nationalities ordered by the median of the negated speed.
ggplot(
  tour_de_france,
  aes(
    x = reorder(factor(Winner.s.Nationality), -Winner.s.avg.speed, FUN = median),
    y = Winner.s.avg.speed
  )
) +
  geom_tufteboxplot(
    outlier.colour = color3,
    color = color3,
    size = 1.5,
    median.type = 'line',
    whisker.type = 'line',
    hoffset = 0,
    width = 3
  ) +
  theme_tufte(base_size = 5, ticks = FALSE) +
  theme(
    plot.margin = unit(rep(10, 4), 'pt'),
    axis.title = element_blank(),
    axis.text = element_text(colour = color2, family = font2, size = 10),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    plot.background = element_rect(fill = color1)
  ) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 44), breaks = c(0, 20, 40)) +
  # Chart title written inside the panel
  annotate(
    'text',
    label = "Winner's Average Speed",
    family = font1,
    color = color3,
    x = 12,
    y = 3,
    size = 5
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment