Skip to content

Instantly share code, notes, and snippets.

@shuozhang1985
Created August 22, 2016 00:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save shuozhang1985/da03c78e983b56f305650c4ac628b719 to your computer and use it in GitHub Desktop.
Save shuozhang1985/da03c78e983b56f305650c4ac628b719 to your computer and use it in GitHub Desktop.
# Setup: working directory, packages, and the per-event table.
# NOTE(review): the hard-coded working directory ties this script to one
# machine; every read.csv() below resolves its relative path against it.
setwd("~/Desktop/web scraping")
library(dplyr)
library(ggplot2)
library(dygraphs)
library(plotly)

# Event table; the code below reads its `Event` and `Sport` columns.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
event <- read.csv(
  "gender2.txt",
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = ","
)
nrow(event)
# View() needs a GUI data viewer, so only open it in an interactive session.
if (interactive()) {
  View(event)
}
# Separate "Mixed" events from the rest, then split every remaining event name
# into the text before its apostrophe (`gender`) and the text after it
# (`event`), keeping the matching `Sport` value alongside.
#length(event$Event)
lst2 <- filter(event, grepl("Mixed", Event))
#length(lst2$Event)
lst1 <- filter(event, !grepl("Mixed", Event))

# Split at the FIRST apostrophe only. Reshaping strsplit() output into a
# two-column matrix silently shifts every following row as soon as one name
# has zero or several apostrophes; locating the split point per row cannot
# misalign.
apostrophe <- regexpr("'", lst1$Event, fixed = TRUE)
has_apostrophe <- !is.na(apostrophe) & apostrophe > 0
if (!all(has_apostrophe)) {
  warning(
    sum(!has_apostrophe),
    " event name(s) contain no apostrophe; their gender is set to NA",
    call. = FALSE
  )
}
sport <- data.frame(
  gender = ifelse(
    has_apostrophe,
    substr(lst1$Event, 1, apostrophe - 1),
    NA_character_
  ),
  # Without an apostrophe the start position is 0, i.e. the whole name is kept.
  event = substr(lst1$Event, apostrophe + 1, nchar(lst1$Event)),
  sport = lst1$Sport,
  stringsAsFactors = FALSE
)
#View(sport)

# Number of non-mixed events per gender prefix (printed).
sport %>%
  dplyr::group_by(gender) %>%
  dplyr::summarise(n = n())

# Number of rows sharing each stripped event name.
# NOTE(review): the grouping ignores `sport`, so identical names in different
# sports are counted together -- confirm this is intended.
sport2 <- sport %>%
  dplyr::group_by(event) %>%
  dplyr::summarise(n = n())
common <- dplyr::filter(sport2, n >= 2)
uncommon <- dplyr::filter(sport2, n == 1)

# Re-attach gender and sport to the names that occur exactly once.
uncommon1 <- merge(x = uncommon, y = sport, by = "event", all.x = TRUE)
#View(uncommon1)
uncommon1 %>%
  dplyr::group_by(gender) %>%
  dplyr::summarise(n = n())
# Hand-entered summary table of event counts and its stacked bar chart.
# NOTE(review): the five counts are literals; they are not recomputed from the
# tables built above, so they must be updated by hand if the input changes.
Event <- c("men", "men", "women", "women", "mixed")
Category <- c(
  "only to men", "equal to men and women", "only to women",
  "equal to men and women", "equal to men and women"
)
number <- c(42, 120, 15, 117, 8)

# Build the frame in one call. as.data.frame() has no `colnames` argument, so
# the previous `colnames = c('event')` was silently swallowed by `...` and the
# column name came from the variable name instead.
df <- data.frame(
  Event = Event,
  Category = Category,
  Number = number,
  stringsAsFactors = FALSE
)
# Share of all rows in `event`.
df$Percent <- df$Number / nrow(event)

ggplot(data = df, aes(x = reorder(Event, Percent), y = Percent, fill = Category)) +
  geom_bar(stat = "identity") +
  xlab("Event") +
  ylab("Percent") +
  ggtitle("Gender distribution of events in 2012 Olympics") +
  theme_bw() +
  theme(legend.position = "bottom")
# Count the rows of `uncommon1` for every (gender, sport) pair and draw them
# as bars stacked by gender, with sports ordered by their count.
uncommonbysport <- uncommon1 %>%
  group_by(gender, sport) %>%
  dplyr::summarise(n = n())

ggplot(
  data = uncommonbysport,
  aes(x = reorder(sport, n), y = n, fill = gender)
) +
  geom_bar(stat = "identity") +
  labs(
    x = "Sport",
    y = "Number of inequal events",
    title = "Events of inequal gender in 2012 Olympics"
  ) +
  theme_bw() +
  # Tilt the sport names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# competitor
# Load the table with `Year`, `Men` and `Women` columns and plot both counts
# over time.
total <- read.csv(
  "event.txt",
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = ","
)
# Remove commas (presumably thousands separators) before converting to numbers.
total$Women <- as.numeric(gsub(",", "", total$Women))
total$Men <- as.numeric(gsub(",", "", total$Men))
total$Year <- as.numeric(total$Year)
summary(total)

# Share of women / men within each row. Dividing by sum(Men + Women) used the
# grand total over all rows as the denominator, which is not a per-year share
# and turns every value into NA as soon as one count is missing.
total <- mutate(
  total,
  womenpercent = Women / (Men + Women),
  menpercent = Men / (Men + Women)
)

plot(
  total$Year, total$Men,
  type = "l", col = "red",
  xlab = "Year", ylab = "Number of athelets",
  main = "Athletes vs Gender"
)
lines(total$Year, total$Women, col = "blue")
legend(
  "topleft",
  inset = .05,
  c("Women", "Men"),
  lty = c(1, 1),
  lwd = c(2.5, 2.5),
  col = c("blue", "red"),
  horiz = FALSE
)
# Per-athlete table: count athletes by gender and by medal column, then chart
# the resulting numbers as bars stacked by medal type.
people <- read.csv(
  "gender.txt",
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = ","
)
summary(people)

# Athletes per gender.
people1 <- people %>%
  group_by(Gender) %>%
  dplyr::summarise(n = n())
# Athletes per (gender, medal-count) combination, one table per medal column.
df1 <- people %>%
  group_by(Gender, Gold) %>%
  dplyr::summarise(n = n())
df2 <- people %>%
  group_by(Gender, Silver) %>%
  dplyr::summarise(n = n())
df3 <- people %>%
  group_by(Gender, Bronze) %>%
  dplyr::summarise(n = n())

Gender <- c(
  "Female", "Female", "Female", "Female",
  "Male", "Male", "Male", "Male"
)
Metal <- c(
  "Gold", "Silver", "Bronze", "No metal",
  "Gold", "Silver", "Bronze", "No metal"
)

# Athletes holding at least one medal of each type, computed once per
# gender/type instead of repeating every filter() expression twice.
female_gold <- sum(filter(df1, Gender == "Female", Gold >= 1)$n)
female_silver <- sum(filter(df2, Gender == "Female", Silver >= 1)$n)
female_bronze <- sum(filter(df3, Gender == "Female", Bronze >= 1)$n)
male_gold <- sum(filter(df1, Gender == "Male", Gold >= 1)$n)
male_silver <- sum(filter(df2, Gender == "Male", Silver >= 1)$n)
male_bronze <- sum(filter(df3, Gender == "Male", Bronze >= 1)$n)

# NOTE(review): the remainder subtracts the three medal counts separately, so
# an athlete holding two medal types is subtracted twice -- confirm intended.
female_none <- filter(people1, Gender == "Female")$n -
  female_gold - female_silver - female_bronze
male_none <- filter(people1, Gender == "Male")$n -
  male_gold - male_silver - male_bronze

Number <- c(
  female_gold, female_silver, female_bronze, female_none,
  male_gold, male_silver, male_bronze, male_none
)

# Build the frame in one call. as.data.frame() has no `colnames` argument, so
# the previous `colnames = c('Gender')` was silently ignored.
df4 <- data.frame(
  Gender = Gender,
  Metal = Metal,
  Number = Number,
  stringsAsFactors = FALSE
)
df4 <- mutate(df4, Percent = Number / sum(Number))

ggplot(data = df4, aes(x = Gender, y = Number, fill = Metal)) +
  geom_bar(stat = "identity") +
  xlab("Gender") +
  ylab("Number of metal") +
  ggtitle("Metal distribution by gender in 2012 Olympics") +
  theme_bw()
# age
# Stack the ages of all athletes with the ages of gold, silver and bronze
# winners (labelled through `Type`) and draw one density curve per label.
totalage <- select(people, Age)
totalage$Type <- rep("All", nrow(totalage))

age1 <- people %>%
  filter(Gold >= 1) %>%
  select(Age, Metal = Gold)
age1 <- mutate(age1, Type = rep("Gold", nrow(age1)))
summary(age1)

age2 <- people %>%
  filter(Silver >= 1) %>%
  select(Age, Metal = Silver)
# Printed before `Type` is attached, so this summary shows Age and Metal only.
summary(age2)
age2 <- mutate(age2, Type = rep("Silver", nrow(age2)))

age3 <- people %>%
  filter(Bronze >= 1) %>%
  select(Age, Metal = Bronze)
age3 <- mutate(age3, Type = rep("Bronze", nrow(age3)))

# Only Age and Type are shared by all four pieces.
age <- rbind(
  totalage,
  select(age1, Age, Type),
  select(age2, Age, Type),
  select(age3, Age, Type)
)

ggplot(data = age, aes(Age, color = Type)) +
  geom_density(alpha = 0.2) +
  labs(
    x = "Age",
    y = "Density",
    title = "Athletes age distribution in 2012 Olympics"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
# Replace every NA in `people` with 0, add a per-athlete medal total, and
# chart how many athletes with at least one medal fall on each age, by sport.
people[is.na(people)] <- 0
people <- mutate(people, Total = Gold + Silver + Bronze)
summary(people)

peoplebysport <- people %>%
  filter(Total >= 1) %>%
  group_by(Age, Sport) %>%
  dplyr::summarise(n = n())

# Both charts below share every layer; only the rows passed to ggplot() differ.
medal_age_layers <- list(
  geom_bar(stat = "identity"),
  xlab("Age"),
  ylab("Number of Medals"),
  ggtitle("Medal winners age distribution by sport in 2012 Olympics"),
  theme_bw(),
  theme(legend.position = "bottom")
)

# All sports.
ggplot(peoplebysport, aes(x = Age, y = n, fill = Sport)) +
  medal_age_layers

# The same chart restricted to two sports.
ggplot(
  filter(peoplebysport, Sport %in% c("Equestrianism", "Gymnastics")),
  aes(x = Age, y = n, fill = Sport)
) +
  medal_age_layers
# sport
# For each medal column: keep athletes with at least one such medal, count
# them per (Sport, medal-count) group, keep Sport and the count, and tag the
# rows with the medal type. The three tables are then stacked and charted.

# The pipeline must end at select(): a trailing `%>%` here piped the result
# into the following `peoplebysport1$Type = ...` line and made the whole
# statement an invalid assignment target.
peoplebysport1 <- filter(people, Gold >= 1) %>%
  group_by(Sport, Gold) %>%
  dplyr::summarise(n = n()) %>%
  select(Sport, n)
peoplebysport1$Type <- rep("Gold", nrow(peoplebysport1))

peoplebysport2 <- filter(people, Silver >= 1) %>%
  group_by(Sport, Silver) %>%
  dplyr::summarise(n = n()) %>%
  select(Sport, n)
peoplebysport2$Type <- rep("Silver", nrow(peoplebysport2))

peoplebysport3 <- filter(people, Bronze >= 1) %>%
  group_by(Sport, Bronze) %>%
  dplyr::summarise(n = n()) %>%
  select(Sport, n)
peoplebysport3$Type <- rep("Bronze", nrow(peoplebysport3))

# Note: this replaces the age-by-sport table previously stored under the same
# name.
peoplebysport <- rbind(peoplebysport1, peoplebysport2, peoplebysport3)

ggplot(peoplebysport, aes(x = reorder(Sport, n), y = n, fill = Type)) +
  geom_bar(stat = "identity") +
  xlab("Sport") +
  ylab("Number of medal") +
  ggtitle("Medal distribution by sport in 2012 Olympics") +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment