Skip to content

Instantly share code, notes, and snippets.

@shuozhang1985
Created August 22, 2016 00:23
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shuozhang1985/a646354e92dadd010f08cc63f0d70a78 to your computer and use it in GitHub Desktop.
Save shuozhang1985/a646354e92dadd010f08cc63f0d70a78 to your computer and use it in GitHub Desktop.
setwd("~/Desktop/web scraping")
library(dplyr)
library(dygraphs)
library(reshape2)
library(ggplot2)
library(plotly)
library(car)
library(TSA)
library(forecast)
library(xts)
library(Rmisc)
library(NbClust)
library(cluster)
library(factoextra)
library(ggrepel)
library(corrplot)
library(PerformanceAnalytics)
output=read.csv('output.txt', header = T, stringsAsFactors = F)
output2012=filter(output, year==2012)
#summary(output)
#sapply(output[,c('Gold', 'Silver', 'Bronze', 'Total')], sd)
# pick the top countries by gold and total
output_bycountry=output%>%
dplyr::group_by(Country)%>%
summarise(totoalgold=sum(Gold), avggold=mean(Gold))%>%
arrange(desc(avggold))
head(output_bycountry,20)
# Country totoalgold avggold
# <chr> <int> <dbl>
# 1 Unified Team 45 45.000000
# 2 Soviet Union 395 43.888889
# 3 United States 990 36.666667 (1)
# 4 East Germany 153 30.600000
# 5 China 201 25.125000 (2)
# 6 Russia 134 19.142857 (3)
# 7 Germany 213 11.210526 (4)
# 8 West Germany 56 11.200000
# 9 Great Britain 245 8.750000 (5)
# 10 Italy 211 8.115385 (6)
# 11 France 222 7.928571 (7)
# 12 Ukraine 33 6.600000
# 13 Hungary 170 6.538462 (8)
# 14 Japan 130 6.500000 (9)
# 15 Sweden 144 5.538462 (10)
# 16 South Korea 81 5.400000 (11)
# 17 Australia 138 5.307692 (12)
# 18 Cuba 71 5.071429
# 19 Romania 88 4.888889
# 20 Finland 106 4.240000
output_bycountry2=output%>%
dplyr::group_by(Country)%>%
summarise(totoalmetal=sum(Total), avgmetal=mean(Total))%>%
arrange(desc(avgmetal))
head(output_bycountry2,20)
# Country totoalmetal avgmetal
# <chr> <int> <dbl>
# 1 Soviet Union 1010 112.22222
# 2 Unified Team 112 112.00000
# 3 United States 2425 89.81481 (1)
# 4 East Germany 409 81.80000
# 5 China 473 59.12500 (2)
# 6 Russia 405 57.85714 (3)
# 7 West Germany 204 40.80000
# 8 Germany 728 38.31579 (4)
# 9 Great Britain 811 28.96429 (5)
# 10 France 727 25.96429 (6)
# 11 Ukraine 115 23.00000
# 12 Italy 577 22.19231 (7)
# 13 Japan 400 20.00000 (8)
# 14 Sweden 498 19.15385 (9)
# 15 Hungary 490 18.84615 (10)
# 16 Australia 475 18.26923 (11)
# 17 Romania 301 16.72222
# 18 South Korea 243 16.20000 (12)
# 19 Belarus 75 15.00000
# 20 Cuba 202 14.42857
# pick the top countries
top10=dplyr::filter(output, Country %in% c('United States','China', 'Great Britain',
'Russia', 'South Korea', 'Germany',
'France', 'Italy', 'Hungary',
'Japan', 'Australia', 'Brazil', 'Sweden'))
# plot dygraphs
df2=dcast(top10[,c('Country', 'Total', 'year')], year~ Country,value.var="Total")
color=c('#dddfd4', '#fae596', '#3fb0ac', '#173e43', '#98dafc', '#e62739', '#9068be',
'#dbc3d0', '#6a5750', '#e05915', '#bccbde', '#300032', '#6534ff')
dygraph(df2) %>%
dyOptions(colors = color) %>%
dyRangeSelector()
# ggplot of boxplot
ggplot(data=top10, aes(x=reorder(Country,Total), y=Total, color=Country))+
geom_boxplot()+
xlab('Country')+
ylab('Number of total medals')+
ggtitle('Medal distribution by top leading countries of the 28 summer Olympics')+
theme_bw()+
theme(legend.position='none', axis.text.x = element_text(angle = 45, hjust = 1))
# population
population=read.csv('population_both_gender.csv', header = T, stringsAsFactors = F)
summary(population)
population1_2012=population %>%
dplyr::filter(Place %in% output2012$Country)%>%
dplyr::select(Country=Place, Population=X2012)
population2_2012=population %>%
dplyr::filter(Place %in% c('China, Hong Kong SAR',
'United Kingdom',
'Iran (Islamic Republic of)',
'Republic of Moldova',
"Dem. People's Republic of Korea",
'Russian Federation',
'Republic of Korea',
'United States of America',
'Venezuela (Bolivarian Republic of)')) %>%
dplyr::select(Country=Place, Population=X2012)
population2_2012$Country=c('Hong Kong','North Korea','South Korea',
'Iran', 'Moldova',
'Russia', 'Great Britain',
'Venezuela', 'United States')
population_2012=rbind(population1_2012, population2_2012)
taibei=cbind(Country=c('Chinese Taipei'), Population=as.numeric(23234.940))
population_2012=rbind(population_2012, taibei)
output2012_bypopulation=merge(x=output2012, y=population_2012, by='Country', all.x=T)
#GDP
GDP=read.csv('GDP.csv', header = T, stringsAsFactors = F)
GDP1_2012=GDP %>%
dplyr::filter(Country.Name %in% output2012$Country)%>%
dplyr::select(Country=Country.Name, GDP=X2012)
merge(x=output2012, y=GDP1_2012, by='Country', all.x=T)
addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
'Iran, Islamic Rep.', 'Russian Federation',
'Slovak Republic', 'Korea, Rep.', 'Venezuela, RB')
GDP2_2012=GDP %>%
dplyr::filter(Country.Name %in% addition)%>%
dplyr::select(Country=Country.Name, GDP=X2012)
GDP2_2012$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
'Iran', 'South Korea', 'Russia', 'Slovakia', 'Venezuela')
GDP_2012=rbind(GDP1_2012, GDP2_2012)
taipei=cbind(Country=c('Chinese Taipei', 'North Korea'), GDP=as.numeric(42201.09, 1800))
GDP_2012=rbind(GDP_2012, taipei)
output2012_bypopulationGDP=merge(x=output2012_bypopulation, y=GDP_2012, by='Country', all.x=T)
output2012_bypopulationGDP['Population']=as.numeric(output2012_bypopulationGDP$Population)
output2012_bypopulationGDP['GDP']=as.numeric(output2012_bypopulationGDP$GDP)
#GDP growth
GDPgrowth=read.csv('GDPgrowth.csv', header = T, stringsAsFactors = F)
GDPgrowth1_2012=GDPgrowth %>%
dplyr::filter(Country.Name %in% output2012$Country)%>%
dplyr::select(Country=Country.Name, GDPgrowth=X2012)
addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
'Iran, Islamic Rep.', 'Russian Federation',
'Slovak Republic', 'Korea, Rep.', 'Venezuela, RB')
GDPgrowth2_2012=GDPgrowth %>%
dplyr::filter(Country.Name %in% addition)%>%
dplyr::select(Country=Country.Name, GDPgrowth=X2012)
GDPgrowth2_2012$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
'Iran','South Korea', 'Russia', 'Slovakia', 'Venezuela')
GDPgrowth_2012=rbind(GDPgrowth1_2012, GDPgrowth2_2012)
taipei1=cbind(Country=c('Chinese Taipei', 'North Korea'), GDPgrowth=as.numeric(2.06, 1.3))
GDPgrowth_2012=rbind(GDPgrowth_2012, taipei1)
output2012_bypopulationGDP1=merge(x=output2012_bypopulationGDP, y=GDPgrowth_2012, by='Country', all.x=T)
output2012_bypopulationGDP1$GDPgrowth=as.numeric(output2012_bypopulationGDP1$GDPgrowth)
#life expectency
life=read.csv('life.csv', header = T, stringsAsFactors = F)
trim.leading <- function (x) sub("^\\s+", "", x)
life$Country=trim.leading(life$Country)
life_2012=life %>%
dplyr::filter(Country %in% output2012$Country)%>%
dplyr::select(Country, Life)
merge(x=output2012_bypopulationGDP1,y=life_2012, by='Country', all.x=T)%>%
dplyr::select(Country, Life)
addition=c('The Bahamas', 'United Kingdom')
life1_2012=life %>%
dplyr::filter(Country %in% addition)%>%
dplyr::select(Country, Life)
life1_2012$Country=c('Great Britain', 'Bahamas')
Taibei=cbind(Country=c('Chinese Taipei', 'Montenegro'),
Life=as.numeric(79.5, 74.65))
life2_2012=rbind(life_2012,life1_2012,Taibei)
output2012_bypopulationGDP2=merge(x=output2012_bypopulationGDP1,
y=life2_2012, by='Country', all.x=T)
output2012_bypopulationGDP2$Life=as.numeric(output2012_bypopulationGDP2$Life)
summary(output2012_bypopulationGDP2)
#kmeans model
df=dplyr::select(output2012_bypopulationGDP2,Country,Total, Population,GDP, GDPgrowth, Life)
chart.Correlation(df[,-1], histogram=TRUE, pch=19)
df$Population=df$Population/1000
df$GDP=df$GDP/1000
ddf=scale(df[,-1])
#d = dist(as.data.frame(scale(df[,-1])))
#fit.average = hclust(d, method = "complete")
# par(mfrow = c(1, 1))
# plot(fit.average, hang = -1, main = "Dendrogram of Average Linkage\n5 Clusters")
# rect.hclust(fit.average, k = 5)
set.seed(1234)
fviz_nbclust(ddf, kmeans, method = "silhouette")
# nc <- NbClust(ddf, min.nc=2, max.nc=15, method="kmeans")
# table(nc$Best.n[1,])
# barplot(table(nc$Best.n[1,]),
# xlab="Numer of Clusters", ylab="Number of Criteria")
set.seed(1234)
fit.km <- kmeans(ddf, 5, nstart=25)
fviz_cluster(fit.km, data = ddf, geom = "point",
stand = FALSE, frame.type = "norm")
# fit.km$size
# fit.km$centers
# aggregate(df[,-1], by=list(cluster=fit.km$cluster), mean)
df$cluster <- as.factor(fit.km$cluster)
df1=df%>%
dplyr::filter(cluster==1)%>%
arrange(desc(Total))
df2=df%>%
dplyr::filter(cluster==2)%>%
arrange(desc(Total))
df3=df%>%
dplyr::filter(cluster==3)%>%
arrange(desc(Total))
df4=df%>%
dplyr::filter(cluster==4)%>%
arrange(desc(Total))
df5=df%>%
dplyr::filter(cluster==5)%>%
arrange(desc(Total))
df8=rbind(df1, head(df4,5), head(df3, 5),
head(df2, 5),
df5)
g1=ggplot(df8, aes(Total,Population, color =cluster)) +
geom_point()+
geom_text_repel(aes(label=Country))+
theme_classic()+
theme_bw()
g2=ggplot(df8, aes(Total,GDP, color =cluster)) +
geom_point()+
geom_text_repel(aes(label=Country))+
theme_classic()+
theme_bw()
g7=ggplot(df8, aes(Total,GDPgrowth, color =cluster)) +
geom_point()+
geom_text_repel(aes(label=Country))+
theme_classic()+
theme_bw()
g9=ggplot(df8, aes(Total,Life, color =cluster)) +
geom_point()+
geom_text_repel(aes(label=Country))+
theme_classic()+
theme_bw()
multiplot(g1,g2, g7, g9,cols = 2)
g3=ggplot(df, aes(x=reorder(cluster, Total), y=Total, color=cluster))+
geom_point()+
geom_boxplot()+
xlab('Cluster')+
theme_bw()
g4=ggplot(df, aes(x=reorder(cluster, Population), y=Population, color=cluster))+
geom_point()+
geom_boxplot()+
xlab('Cluster')+
theme_bw()
g5=ggplot(df, aes(x=reorder(cluster, GDP), y=GDP, color=cluster))+
geom_point()+
geom_boxplot()+
xlab('Cluster')+
theme_bw()
g8=ggplot(df, aes(x=reorder(cluster, GDPgrowth), y=GDPgrowth, color=cluster))+
geom_point()+
geom_boxplot()+
xlab('Cluster')+
theme_bw()
g10=ggplot(df, aes(x=reorder(cluster, Life), y=Life, color=cluster))+
geom_point()+
geom_boxplot()+
xlab('Cluster')+
theme_bw()
multiplot(g3,g4,g5,g8,g10, cols=2)
# 2008
output=read.csv('output.txt', header = T, stringsAsFactors = F)
output2008=filter(output, year==2008)
# population
population=read.csv('population_both_gender.csv', header = T, stringsAsFactors = F)
population1_2008=population %>%
dplyr::filter(Place %in% output2008$Country)%>%
select(Country=Place, Population=X2008)
population2_2008=population %>%
dplyr::filter(Place %in% c('China, Hong Kong SAR',
'United Kingdom',
'Iran (Islamic Republic of)',
'Republic of Moldova',
"Dem. People's Republic of Korea",
'Russian Federation',
'Republic of Korea',
'United States of America',
'Venezuela (Bolivarian Republic of)')) %>%
select(Country=Place, Population=X2008)
population2_2008$Country=c('Hong Kong','North Korea','South Korea',
'Iran', 'Moldova',
'Russia', 'Great Britain',
'Venezuela', 'United States')
population_2008=rbind(population1_2008, population2_2008)
taibei=cbind(Country=c('Chinese Taipei', 'Vietnam'), Population=as.numeric(23040, 85120))
population_2008=rbind(population_2008, taibei)
output2008_bypopulation=merge(x=output2008, y=population_2008, by='Country', all.x=T)
#GDP
GDP=read.csv('GDP.csv', header = T, stringsAsFactors = F)
GDP1_2008=GDP %>%
dplyr::filter(Country.Name %in% output2008$Country)%>%
select(Country=Country.Name, GDP=X2008)
addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
'Iran, Islamic Rep.',
'Venezuela, RB')
GDP2_2008=GDP %>%
dplyr::filter(Country.Name %in% addition)%>%
select(Country=Country.Name, GDP=X2008)
GDP2_2008$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
'Iran', 'Venezuela')
GDP_2008=rbind(GDP1_2008, GDP2_2008)
taipei=cbind(Country=c('Chinese Taipei', 'Kyrgyzstan', 'Russia', 'Slovakia', 'South Korea',
'North Korea'),
GDP=as.numeric(31900, 966.39, 11699.68, 18558.88, 20474.83, 1800))
GDP_2008=rbind(GDP_2008, taipei)
output2008_bypopulationGDP=merge(x=output2008_bypopulation, y=GDP_2008, by='Country', all.x=T)
output2008_bypopulationGDP['Population']=as.numeric(output2008_bypopulationGDP$Population)
output2008_bypopulationGDP['GDP']=as.numeric(output2008_bypopulationGDP$GDP)
#GDP growth
GDPgrowth=read.csv('GDPgrowth.csv', header = T, stringsAsFactors = F)
GDPgrowth1_2008=GDPgrowth %>%
dplyr::filter(Country.Name %in% output2008$Country)%>%
select(Country=Country.Name, GDPgrowth=X2008)
addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
'Iran, Islamic Rep.',
'Venezuela, RB')
GDPgrowth2_2008=GDPgrowth %>%
dplyr::filter(Country.Name %in% addition)%>%
select(Country=Country.Name, GDPgrowth=X2008)
GDPgrowth2_2008$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
'Iran', 'Venezuela')
GDPgrowth_2008=rbind(GDPgrowth1_2008, GDPgrowth2_2008)
taipei1=cbind(Country=c('Chinese Taipei', 'Russia', 'Kyrgyzstan','Slovakia', 'South Korea', 'North Korea'),
GDPgrowth=as.numeric(0.7, 8.4, 5.2, 5.4, 2.8, 3.1))
GDPgrowth_2008=rbind(GDPgrowth_2008, taipei1)
output2008_bypopulationGDP1=merge(x=output2008_bypopulationGDP, y=GDPgrowth_2008, by='Country', all.x=T)
output2008_bypopulationGDP1$GDPgrowth=as.numeric(output2008_bypopulationGDP1$GDPgrowth)
#life expectency
life=read.csv('life1.csv', header = T, stringsAsFactors = F)
trim.leading <- function (x) sub("^\\s+", "", x)
life$Country=trim.leading(life$Country)
life_2008=life %>%
dplyr::filter(Country %in% output2008$Country)%>%
select(Country, Life)
merge(x=output2008_bypopulationGDP1,y=life_2008, by='Country', all.x=T)%>%
select(Country, Life)
addition=c('United Kingdom')
life1_2008=life %>%
dplyr::filter(Country %in% addition)%>%
select(Country, Life)
life1_2008$Country=c('Great Britain')
Taibei=cbind(Country=c('Chinese Taipei', 'Bahamas', 'Netherlands',
'North Korea', 'South Korea'),
Life=as.numeric(77.76, 74.25, 80.25, 68.49, 79.83))
life2_2008=rbind(life_2008,life1_2008,Taibei)
output2008_bypopulationGDP2=merge(x=output2008_bypopulationGDP1,
y=life2_2008, by='Country', all.x=T)
output2008_bypopulationGDP2$Life=as.numeric(output2008_bypopulationGDP2$Life)
#summary(output2008_bypopulationGDP2)
df=select(output2008_bypopulationGDP2,Country, Total, Population,GDP, GDPgrowth, Life)
chart.Correlation(df[,-1], histogram=TRUE, pch=19)
# kmeans model
df$Population=df$Population/1000
df$GDP=df$GDP/1000
df=df[!df$Country=='Zimbabwe',]
ddf=scale(df[,-1])
set.seed(1234)
fviz_nbclust(ddf, kmeans, method = "silhouette")
set.seed(1234)
fit.km <- kmeans(ddf, 5, nstart=25)
fviz_cluster(fit.km, data = ddf, geom = "point",
stand = FALSE, frame.type = "norm")
df$cluster <- as.factor(fit.km$cluster)
ddf6=merge(x=df, y=select(output2008_bypopulationGDP2[!output2008_bypopulationGDP2$Country=='Zimbabwe',],
Country, Total), by='Country',
all = T)
g3=ggplot(data=ddf6, aes(x=reorder(cluster, Total), y=Total, color=cluster))+
geom_boxplot()+
xlab('cluster')+
ylab('total medals of each country')+
ggtitle('cluster vs total medals')+
theme_bw()+
theme(legend.position='none')
g4=ggplot(ddf6, aes(x=reorder(cluster, Population), y=Population, color=cluster))+
geom_point()+
geom_boxplot()+
xlab('Cluster')+
theme_bw()+
theme(legend.position='none')
g5=ggplot(ddf6, aes(x=reorder(cluster, GDP), y=GDP, color=cluster))+
geom_point()+
geom_boxplot()+
xlab('Cluster')+
theme_bw()+
theme(legend.position='none')
g8=ggplot(ddf6, aes(x=reorder(cluster, GDPgrowth), y=GDPgrowth, color=cluster))+
geom_point()+
geom_boxplot()+
xlab('Cluster')+
theme_bw()+
theme(legend.position='none')
g10=ggplot(ddf6, aes(x=reorder(cluster, Life), y=Life, color=cluster))+
geom_point()+
geom_boxplot()+
xlab('Cluster')+
theme_bw()+
theme(legend.position='none')
multiplot(g3, g4,g5,g8,g10, cols=2)
ddf8=filter(df, Country %in% df8$Country)
g1=ggplot(ddf8, aes(Total,Population, color =cluster)) +
geom_point()+
geom_text_repel(aes(label=Country))+
theme_classic()+
theme_bw()
g2=ggplot(ddf8, aes(Total,GDP, color =cluster)) +
geom_point()+
geom_text_repel(aes(label=Country))+
theme_classic()+
theme_bw()
g7=ggplot(ddf8, aes(Total,GDPgrowth, color =cluster)) +
geom_point()+
geom_text_repel(aes(label=Country))+
theme_classic()+
theme_bw()
g9=ggplot(ddf8, aes(Total,Life, color =cluster)) +
geom_point()+
geom_text_repel(aes(label=Country))+
theme_classic()+
theme_bw()
multiplot(g1,g2, g7, g9,cols = 2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment