shuozhang1985/analysis of medal-winning

## analysis of medal-winning
setwd("~/Desktop/web scraping")
library(dplyr)
library(dygraphs)
library(reshape2)
library(ggplot2)
library(plotly)
library(car)
library(TSA)
library(forecast)
library(xts)
library(Rmisc)
library(NbClust)
library(cluster)
library(factoextra)
library(ggrepel)
library(corrplot)
library(PerformanceAnalytics)
output=read.csv('output.txt', header = T, stringsAsFactors = F)
output2012=filter(output, year==2012)
#summary(output)
#sapply(output[,c('Gold', 'Silver', 'Bronze', 'Total')], sd)

# pick the top countries by gold and total
output_bycountry=output%>%
  dplyr::group_by(Country)%>%
  summarise(totoalgold=sum(Gold), avggold=mean(Gold))%>%
  arrange(desc(avggold))
head(output_bycountry,20)
# Country totoalgold   avggold
# <chr>      <int>     <dbl>
#   1   Unified Team         45 45.000000
# 2   Soviet Union        395 43.888889
# 3  United States        990 36.666667 (1)
# 4   East Germany        153 30.600000
# 5          China        201 25.125000 (2)
# 6         Russia        134 19.142857 (3)
# 7        Germany        213 11.210526 (4)
# 8   West Germany         56 11.200000
# 9  Great Britain        245  8.750000 (5)
# 10         Italy        211  8.115385 (6)
# 11        France        222  7.928571 (7)
# 12       Ukraine         33  6.600000
# 13       Hungary        170  6.538462 (8)
# 14         Japan        130  6.500000 (9)
# 15        Sweden        144  5.538462 (10)
# 16   South Korea         81  5.400000 (11)
# 17     Australia        138  5.307692 (12)
# 18          Cuba         71  5.071429
# 19       Romania         88  4.888889
# 20       Finland        106  4.240000
output_bycountry2=output%>%
  dplyr::group_by(Country)%>%
  summarise(totoalmetal=sum(Total), avgmetal=mean(Total))%>%
  arrange(desc(avgmetal))
head(output_bycountry2,20)
# Country totoalmetal  avgmetal
# <chr>       <int>     <dbl>
#   1   Soviet Union        1010 112.22222
# 2   Unified Team         112 112.00000
# 3  United States        2425  89.81481 (1)
# 4   East Germany         409  81.80000
# 5          China         473  59.12500  (2)
# 6         Russia         405  57.85714 (3)
# 7   West Germany         204  40.80000
# 8        Germany         728  38.31579 (4)
# 9  Great Britain         811  28.96429 (5)
# 10        France         727  25.96429 (6)
# 11       Ukraine         115  23.00000
# 12         Italy         577  22.19231 (7)
# 13         Japan         400  20.00000 (8)
# 14        Sweden         498  19.15385 (9)
# 15       Hungary         490  18.84615 (10)
# 16     Australia         475  18.26923 (11)
# 17       Romania         301  16.72222
# 18   South Korea         243  16.20000 (12)
# 19       Belarus          75  15.00000
# 20          Cuba         202  14.42857

# pick the top countries
top10=dplyr::filter(output, Country %in% c('United States','China', 'Great Britain',
                                          'Russia', 'South Korea', 'Germany',
                                          'France', 'Italy', 'Hungary',
                                          'Japan', 'Australia', 'Brazil', 'Sweden'))

# plot dygraphs
df2=dcast(top10[,c('Country', 'Total', 'year')], year~ Country,value.var="Total")
color=c('#dddfd4', '#fae596', '#3fb0ac', '#173e43', '#98dafc', '#e62739', '#9068be',
        '#dbc3d0', '#6a5750', '#e05915', '#bccbde', '#300032', '#6534ff')
dygraph(df2) %>%
  dyOptions(colors = color) %>%
  dyRangeSelector()

# ggplot of  boxplot
ggplot(data=top10, aes(x=reorder(Country,Total), y=Total, color=Country))+
  geom_boxplot()+
  xlab('Country')+
  ylab('Number of total medals')+
  ggtitle('Medal distribution by top leading countries of the 28 summer Olympics')+
  theme_bw()+
  theme(legend.position='none', axis.text.x = element_text(angle = 45, hjust = 1))

# population
population=read.csv('population_both_gender.csv', header = T, stringsAsFactors = F)
summary(population)

population1_2012=population %>%
  dplyr::filter(Place %in% output2012$Country)%>%
  dplyr::select(Country=Place, Population=X2012)
population2_2012=population %>%
  dplyr::filter(Place %in%          c('China, Hong Kong SAR',
                                    'United Kingdom',
                                    'Iran (Islamic Republic of)',
                                    'Republic of Moldova',
                                    "Dem. People's Republic of Korea",
                                    'Russian Federation',
                                    'Republic of Korea',
                                    'United States of America',
                                    'Venezuela (Bolivarian Republic of)')) %>%
  dplyr::select(Country=Place, Population=X2012)
population2_2012$Country=c('Hong Kong','North Korea','South Korea',
                           'Iran', 'Moldova',
                           'Russia', 'Great Britain',
                           'Venezuela', 'United States')
population_2012=rbind(population1_2012, population2_2012)
taibei=cbind(Country=c('Chinese Taipei'), Population=as.numeric(23234.940))
population_2012=rbind(population_2012, taibei)
output2012_bypopulation=merge(x=output2012, y=population_2012, by='Country', all.x=T)


#GDP
GDP=read.csv('GDP.csv', header = T, stringsAsFactors = F)
GDP1_2012=GDP %>%
  dplyr::filter(Country.Name %in% output2012$Country)%>%
  dplyr::select(Country=Country.Name, GDP=X2012)

merge(x=output2012, y=GDP1_2012, by='Country', all.x=T)
addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
  'Iran, Islamic Rep.',  'Russian Federation',
  'Slovak Republic', 'Korea, Rep.', 'Venezuela, RB')
GDP2_2012=GDP %>%
  dplyr::filter(Country.Name %in% addition)%>%
  dplyr::select(Country=Country.Name, GDP=X2012)
GDP2_2012$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
  'Iran', 'South Korea', 'Russia', 'Slovakia', 'Venezuela')
GDP_2012=rbind(GDP1_2012, GDP2_2012)
taipei=cbind(Country=c('Chinese Taipei', 'North Korea'), GDP=as.numeric(42201.09, 1800))
GDP_2012=rbind(GDP_2012, taipei)
output2012_bypopulationGDP=merge(x=output2012_bypopulation, y=GDP_2012, by='Country', all.x=T)
output2012_bypopulationGDP['Population']=as.numeric(output2012_bypopulationGDP$Population)
output2012_bypopulationGDP['GDP']=as.numeric(output2012_bypopulationGDP$GDP)

#GDP growth
GDPgrowth=read.csv('GDPgrowth.csv', header = T, stringsAsFactors = F)
GDPgrowth1_2012=GDPgrowth %>%
  dplyr::filter(Country.Name %in% output2012$Country)%>%
  dplyr::select(Country=Country.Name, GDPgrowth=X2012)
addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
           'Iran, Islamic Rep.', 'Russian Federation',
           'Slovak Republic', 'Korea, Rep.', 'Venezuela, RB')
GDPgrowth2_2012=GDPgrowth %>%
  dplyr::filter(Country.Name %in% addition)%>%
  dplyr::select(Country=Country.Name, GDPgrowth=X2012)

GDPgrowth2_2012$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
                    'Iran','South Korea', 'Russia', 'Slovakia', 'Venezuela')
GDPgrowth_2012=rbind(GDPgrowth1_2012, GDPgrowth2_2012)
taipei1=cbind(Country=c('Chinese Taipei', 'North Korea'), GDPgrowth=as.numeric(2.06, 1.3))
GDPgrowth_2012=rbind(GDPgrowth_2012, taipei1)
output2012_bypopulationGDP1=merge(x=output2012_bypopulationGDP, y=GDPgrowth_2012, by='Country', all.x=T)
output2012_bypopulationGDP1$GDPgrowth=as.numeric(output2012_bypopulationGDP1$GDPgrowth)

#life expectency
life=read.csv('life.csv', header = T, stringsAsFactors = F)
trim.leading <- function (x)  sub("^\\s+", "", x)
life$Country=trim.leading(life$Country)
life_2012=life %>%
  dplyr::filter(Country %in% output2012$Country)%>%
  dplyr::select(Country, Life)
merge(x=output2012_bypopulationGDP1,y=life_2012, by='Country', all.x=T)%>%
  dplyr::select(Country, Life)
addition=c('The Bahamas', 'United Kingdom')
life1_2012=life %>%
  dplyr::filter(Country %in% addition)%>%
  dplyr::select(Country, Life)
life1_2012$Country=c('Great Britain', 'Bahamas')
Taibei=cbind(Country=c('Chinese Taipei', 'Montenegro'),
             Life=as.numeric(79.5, 74.65))
life2_2012=rbind(life_2012,life1_2012,Taibei)
output2012_bypopulationGDP2=merge(x=output2012_bypopulationGDP1,
                                  y=life2_2012, by='Country', all.x=T)
output2012_bypopulationGDP2$Life=as.numeric(output2012_bypopulationGDP2$Life)
summary(output2012_bypopulationGDP2)

#kmeans model
df=dplyr::select(output2012_bypopulationGDP2,Country,Total, Population,GDP, GDPgrowth, Life)
chart.Correlation(df[,-1], histogram=TRUE, pch=19)
df$Population=df$Population/1000
df$GDP=df$GDP/1000
ddf=scale(df[,-1])
#d = dist(as.data.frame(scale(df[,-1])))
#fit.average =  hclust(d, method = "complete")
# par(mfrow = c(1, 1))
# plot(fit.average, hang = -1, main = "Dendrogram of Average Linkage\n5 Clusters")
# rect.hclust(fit.average, k = 5)
set.seed(1234)
fviz_nbclust(ddf, kmeans, method = "silhouette")
# nc <- NbClust(ddf, min.nc=2, max.nc=15, method="kmeans")
# table(nc$Best.n[1,])
# barplot(table(nc$Best.n[1,]),
#         xlab="Numer of Clusters", ylab="Number of Criteria")
set.seed(1234)
fit.km <- kmeans(ddf, 5, nstart=25)
 fviz_cluster(fit.km, data = ddf, geom = "point",
             stand = FALSE, frame.type = "norm")
# fit.km$size
# fit.km$centers
# aggregate(df[,-1], by=list(cluster=fit.km$cluster), mean)

df$cluster <- as.factor(fit.km$cluster)


df1=df%>%
  dplyr::filter(cluster==1)%>%
  arrange(desc(Total))

df2=df%>%
  dplyr::filter(cluster==2)%>%
  arrange(desc(Total))

df3=df%>%
  dplyr::filter(cluster==3)%>%
  arrange(desc(Total))

df4=df%>%
  dplyr::filter(cluster==4)%>%
  arrange(desc(Total))

df5=df%>%
  dplyr::filter(cluster==5)%>%
  arrange(desc(Total))


df8=rbind(df1, head(df4,5), head(df3, 5),
          head(df2, 5),
          df5)

g1=ggplot(df8, aes(Total,Population, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()
g2=ggplot(df8, aes(Total,GDP, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()
g7=ggplot(df8, aes(Total,GDPgrowth, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()
g9=ggplot(df8, aes(Total,Life, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()

multiplot(g1,g2, g7, g9,cols = 2)

g3=ggplot(df, aes(x=reorder(cluster, Total), y=Total, color=cluster))+
   geom_point()+
   geom_boxplot()+
   xlab('Cluster')+
   theme_bw()
g4=ggplot(df, aes(x=reorder(cluster, Population), y=Population, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()
g5=ggplot(df, aes(x=reorder(cluster, GDP), y=GDP, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()
g8=ggplot(df, aes(x=reorder(cluster, GDPgrowth), y=GDPgrowth, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()
g10=ggplot(df, aes(x=reorder(cluster, Life), y=Life, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()
multiplot(g3,g4,g5,g8,g10, cols=2)

# 2008
output=read.csv('output.txt', header = T, stringsAsFactors = F)
output2008=filter(output, year==2008)

# population
population=read.csv('population_both_gender.csv', header = T, stringsAsFactors = F)

population1_2008=population %>%
  dplyr::filter(Place %in% output2008$Country)%>%
  select(Country=Place, Population=X2008)

population2_2008=population %>%
  dplyr::filter(Place %in%          c('China, Hong Kong SAR',
                                      'United Kingdom',
                                      'Iran (Islamic Republic of)',
                                      'Republic of Moldova',
                                      "Dem. People's Republic of Korea",
                                      'Russian Federation',
                                      'Republic of Korea',
                                      'United States of America',
                                      'Venezuela (Bolivarian Republic of)')) %>%
  select(Country=Place, Population=X2008)
population2_2008$Country=c('Hong Kong','North Korea','South Korea',
                           'Iran', 'Moldova',
                           'Russia', 'Great Britain',
                           'Venezuela', 'United States')
population_2008=rbind(population1_2008, population2_2008)
taibei=cbind(Country=c('Chinese Taipei', 'Vietnam'), Population=as.numeric(23040, 85120))
population_2008=rbind(population_2008, taibei)
output2008_bypopulation=merge(x=output2008, y=population_2008, by='Country', all.x=T)


#GDP
GDP=read.csv('GDP.csv', header = T, stringsAsFactors = F)

GDP1_2008=GDP %>%
  dplyr::filter(Country.Name %in% output2008$Country)%>%
  select(Country=Country.Name, GDP=X2008)
addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
           'Iran, Islamic Rep.',
             'Venezuela, RB')
GDP2_2008=GDP %>%
  dplyr::filter(Country.Name %in% addition)%>%
  select(Country=Country.Name, GDP=X2008)

GDP2_2008$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
                    'Iran', 'Venezuela')
GDP_2008=rbind(GDP1_2008, GDP2_2008)
taipei=cbind(Country=c('Chinese Taipei', 'Kyrgyzstan', 'Russia', 'Slovakia', 'South Korea',
                       'North Korea'),
             GDP=as.numeric(31900, 966.39, 11699.68, 18558.88, 20474.83, 1800))
GDP_2008=rbind(GDP_2008, taipei)
output2008_bypopulationGDP=merge(x=output2008_bypopulation, y=GDP_2008, by='Country', all.x=T)
output2008_bypopulationGDP['Population']=as.numeric(output2008_bypopulationGDP$Population)
output2008_bypopulationGDP['GDP']=as.numeric(output2008_bypopulationGDP$GDP)

#GDP growth
GDPgrowth=read.csv('GDPgrowth.csv', header = T, stringsAsFactors = F)

GDPgrowth1_2008=GDPgrowth %>%
  dplyr::filter(Country.Name %in% output2008$Country)%>%
  select(Country=Country.Name, GDPgrowth=X2008)
addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
           'Iran, Islamic Rep.',
             'Venezuela, RB')
GDPgrowth2_2008=GDPgrowth %>%
  dplyr::filter(Country.Name %in% addition)%>%
  select(Country=Country.Name, GDPgrowth=X2008)

GDPgrowth2_2008$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
                          'Iran',   'Venezuela')
GDPgrowth_2008=rbind(GDPgrowth1_2008, GDPgrowth2_2008)
taipei1=cbind(Country=c('Chinese Taipei', 'Russia', 'Kyrgyzstan','Slovakia', 'South Korea', 'North Korea'),
              GDPgrowth=as.numeric(0.7, 8.4, 5.2, 5.4, 2.8, 3.1))
GDPgrowth_2008=rbind(GDPgrowth_2008, taipei1)
output2008_bypopulationGDP1=merge(x=output2008_bypopulationGDP, y=GDPgrowth_2008, by='Country', all.x=T)
output2008_bypopulationGDP1$GDPgrowth=as.numeric(output2008_bypopulationGDP1$GDPgrowth)

#life expectency
life=read.csv('life1.csv', header = T, stringsAsFactors = F)
trim.leading <- function (x)  sub("^\\s+", "", x)
life$Country=trim.leading(life$Country)
life_2008=life %>%
  dplyr::filter(Country %in% output2008$Country)%>%
  select(Country, Life)
merge(x=output2008_bypopulationGDP1,y=life_2008, by='Country', all.x=T)%>%
  select(Country, Life)
addition=c('United Kingdom')
life1_2008=life %>%
  dplyr::filter(Country %in% addition)%>%
  select(Country, Life)
life1_2008$Country=c('Great Britain')
Taibei=cbind(Country=c('Chinese Taipei', 'Bahamas', 'Netherlands',
                       'North Korea', 'South Korea'),
             Life=as.numeric(77.76, 74.25, 80.25, 68.49, 79.83))
life2_2008=rbind(life_2008,life1_2008,Taibei)
output2008_bypopulationGDP2=merge(x=output2008_bypopulationGDP1,
                                  y=life2_2008, by='Country', all.x=T)
output2008_bypopulationGDP2$Life=as.numeric(output2008_bypopulationGDP2$Life)
#summary(output2008_bypopulationGDP2)

df=select(output2008_bypopulationGDP2,Country, Total, Population,GDP, GDPgrowth, Life)
chart.Correlation(df[,-1], histogram=TRUE, pch=19)

# kmeans model
df$Population=df$Population/1000
df$GDP=df$GDP/1000
df=df[!df$Country=='Zimbabwe',]
ddf=scale(df[,-1])
set.seed(1234)
fviz_nbclust(ddf, kmeans, method = "silhouette")
set.seed(1234)
fit.km <- kmeans(ddf, 5, nstart=25)
fviz_cluster(fit.km, data = ddf, geom = "point",
             stand = FALSE, frame.type = "norm")
df$cluster <- as.factor(fit.km$cluster)
ddf6=merge(x=df, y=select(output2008_bypopulationGDP2[!output2008_bypopulationGDP2$Country=='Zimbabwe',],
                          Country, Total), by='Country',
          all = T)
g3=ggplot(data=ddf6, aes(x=reorder(cluster, Total), y=Total, color=cluster))+
  geom_boxplot()+
  xlab('cluster')+
  ylab('total medals of each country')+
  ggtitle('cluster vs total medals')+
  theme_bw()+
  theme(legend.position='none')

g4=ggplot(ddf6, aes(x=reorder(cluster, Population), y=Population, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()+
  theme(legend.position='none')
g5=ggplot(ddf6, aes(x=reorder(cluster, GDP), y=GDP, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()+
  theme(legend.position='none')
g8=ggplot(ddf6, aes(x=reorder(cluster, GDPgrowth), y=GDPgrowth, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()+
  theme(legend.position='none')
g10=ggplot(ddf6, aes(x=reorder(cluster, Life), y=Life, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()+
  theme(legend.position='none')
multiplot(g3, g4,g5,g8,g10, cols=2)

ddf8=filter(df, Country %in% df8$Country)
g1=ggplot(ddf8, aes(Total,Population, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()
g2=ggplot(ddf8, aes(Total,GDP, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()
g7=ggplot(ddf8, aes(Total,GDPgrowth, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()
g9=ggplot(ddf8, aes(Total,Life, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()

multiplot(g1,g2, g7, g9,cols = 2)
	setwd("~/Desktop/web scraping")
	library(dplyr)
	library(dygraphs)
	library(reshape2)
	library(ggplot2)
	library(plotly)
	library(car)
	library(TSA)
	library(forecast)
	library(xts)
	library(Rmisc)
	library(NbClust)
	library(cluster)
	library(factoextra)
	library(ggrepel)
	library(corrplot)
	library(PerformanceAnalytics)
	output=read.csv('output.txt', header = T, stringsAsFactors = F)
	output2012=filter(output, year==2012)
	#summary(output)
	#sapply(output[,c('Gold', 'Silver', 'Bronze', 'Total')], sd)

	# pick the top countries by gold and total
	output_bycountry=output%>%
	dplyr::group_by(Country)%>%
	summarise(totoalgold=sum(Gold), avggold=mean(Gold))%>%
	arrange(desc(avggold))
	head(output_bycountry,20)
	# Country totoalgold avggold
	# <chr> <int> <dbl>
	# 1 Unified Team 45 45.000000
	# 2 Soviet Union 395 43.888889
	# 3 United States 990 36.666667 (1)
	# 4 East Germany 153 30.600000
	# 5 China 201 25.125000 (2)
	# 6 Russia 134 19.142857 (3)
	# 7 Germany 213 11.210526 (4)
	# 8 West Germany 56 11.200000
	# 9 Great Britain 245 8.750000 (5)
	# 10 Italy 211 8.115385 (6)
	# 11 France 222 7.928571 (7)
	# 12 Ukraine 33 6.600000
	# 13 Hungary 170 6.538462 (8)
	# 14 Japan 130 6.500000 (9)
	# 15 Sweden 144 5.538462 (10)
	# 16 South Korea 81 5.400000 (11)
	# 17 Australia 138 5.307692 (12)
	# 18 Cuba 71 5.071429
	# 19 Romania 88 4.888889
	# 20 Finland 106 4.240000
	output_bycountry2=output%>%
	dplyr::group_by(Country)%>%
	summarise(totoalmetal=sum(Total), avgmetal=mean(Total))%>%
	arrange(desc(avgmetal))
	head(output_bycountry2,20)
	# Country totoalmetal avgmetal
	# <chr> <int> <dbl>
	# 1 Soviet Union 1010 112.22222
	# 2 Unified Team 112 112.00000
	# 3 United States 2425 89.81481 (1)
	# 4 East Germany 409 81.80000
	# 5 China 473 59.12500 (2)
	# 6 Russia 405 57.85714 (3)
	# 7 West Germany 204 40.80000
	# 8 Germany 728 38.31579 (4)
	# 9 Great Britain 811 28.96429 (5)
	# 10 France 727 25.96429 (6)
	# 11 Ukraine 115 23.00000
	# 12 Italy 577 22.19231 (7)
	# 13 Japan 400 20.00000 (8)
	# 14 Sweden 498 19.15385 (9)
	# 15 Hungary 490 18.84615 (10)
	# 16 Australia 475 18.26923 (11)
	# 17 Romania 301 16.72222
	# 18 South Korea 243 16.20000 (12)
	# 19 Belarus 75 15.00000
	# 20 Cuba 202 14.42857

	# pick the top countries
	top10=dplyr::filter(output, Country %in% c('United States','China', 'Great Britain',
	'Russia', 'South Korea', 'Germany',
	'France', 'Italy', 'Hungary',
	'Japan', 'Australia', 'Brazil', 'Sweden'))

	# plot dygraphs
	df2=dcast(top10[,c('Country', 'Total', 'year')], year~ Country,value.var="Total")
	color=c('#dddfd4', '#fae596', '#3fb0ac', '#173e43', '#98dafc', '#e62739', '#9068be',
	'#dbc3d0', '#6a5750', '#e05915', '#bccbde', '#300032', '#6534ff')
	dygraph(df2) %>%
	dyOptions(colors = color) %>%
	dyRangeSelector()

	# ggplot of boxplot
	ggplot(data=top10, aes(x=reorder(Country,Total), y=Total, color=Country))+
	geom_boxplot()+
	xlab('Country')+
	ylab('Number of total medals')+
	ggtitle('Medal distribution by top leading countries of the 28 summer Olympics')+
	theme_bw()+
	theme(legend.position='none', axis.text.x = element_text(angle = 45, hjust = 1))

	# population
	population=read.csv('population_both_gender.csv', header = T, stringsAsFactors = F)
	summary(population)

	population1_2012=population %>%
	dplyr::filter(Place %in% output2012$Country)%>%
	dplyr::select(Country=Place, Population=X2012)
	population2_2012=population %>%
	dplyr::filter(Place %in% c('China, Hong Kong SAR',
	'United Kingdom',
	'Iran (Islamic Republic of)',
	'Republic of Moldova',
	"Dem. People's Republic of Korea",
	'Russian Federation',
	'Republic of Korea',
	'United States of America',
	'Venezuela (Bolivarian Republic of)')) %>%
	dplyr::select(Country=Place, Population=X2012)
	population2_2012$Country=c('Hong Kong','North Korea','South Korea',
	'Iran', 'Moldova',
	'Russia', 'Great Britain',
	'Venezuela', 'United States')
	population_2012=rbind(population1_2012, population2_2012)
	taibei=cbind(Country=c('Chinese Taipei'), Population=as.numeric(23234.940))
	population_2012=rbind(population_2012, taibei)
	output2012_bypopulation=merge(x=output2012, y=population_2012, by='Country', all.x=T)


	#GDP
	GDP=read.csv('GDP.csv', header = T, stringsAsFactors = F)
	GDP1_2012=GDP %>%
	dplyr::filter(Country.Name %in% output2012$Country)%>%
	dplyr::select(Country=Country.Name, GDP=X2012)

	merge(x=output2012, y=GDP1_2012, by='Country', all.x=T)
	addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
	'Iran, Islamic Rep.', 'Russian Federation',
	'Slovak Republic', 'Korea, Rep.', 'Venezuela, RB')
	GDP2_2012=GDP %>%
	dplyr::filter(Country.Name %in% addition)%>%
	dplyr::select(Country=Country.Name, GDP=X2012)
	GDP2_2012$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
	'Iran', 'South Korea', 'Russia', 'Slovakia', 'Venezuela')
	GDP_2012=rbind(GDP1_2012, GDP2_2012)
	taipei=cbind(Country=c('Chinese Taipei', 'North Korea'), GDP=as.numeric(42201.09, 1800))
	GDP_2012=rbind(GDP_2012, taipei)
	output2012_bypopulationGDP=merge(x=output2012_bypopulation, y=GDP_2012, by='Country', all.x=T)
	output2012_bypopulationGDP['Population']=as.numeric(output2012_bypopulationGDP$Population)
	output2012_bypopulationGDP['GDP']=as.numeric(output2012_bypopulationGDP$GDP)

	#GDP growth
	GDPgrowth=read.csv('GDPgrowth.csv', header = T, stringsAsFactors = F)
	GDPgrowth1_2012=GDPgrowth %>%
	dplyr::filter(Country.Name %in% output2012$Country)%>%
	dplyr::select(Country=Country.Name, GDPgrowth=X2012)
	addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
	'Iran, Islamic Rep.', 'Russian Federation',
	'Slovak Republic', 'Korea, Rep.', 'Venezuela, RB')
	GDPgrowth2_2012=GDPgrowth %>%
	dplyr::filter(Country.Name %in% addition)%>%
	dplyr::select(Country=Country.Name, GDPgrowth=X2012)

	GDPgrowth2_2012$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
	'Iran','South Korea', 'Russia', 'Slovakia', 'Venezuela')
	GDPgrowth_2012=rbind(GDPgrowth1_2012, GDPgrowth2_2012)
	taipei1=cbind(Country=c('Chinese Taipei', 'North Korea'), GDPgrowth=as.numeric(2.06, 1.3))
	GDPgrowth_2012=rbind(GDPgrowth_2012, taipei1)
	output2012_bypopulationGDP1=merge(x=output2012_bypopulationGDP, y=GDPgrowth_2012, by='Country', all.x=T)
	output2012_bypopulationGDP1$GDPgrowth=as.numeric(output2012_bypopulationGDP1$GDPgrowth)

	#life expectency
	life=read.csv('life.csv', header = T, stringsAsFactors = F)
	trim.leading <- function (x) sub("^\\s+", "", x)
	life$Country=trim.leading(life$Country)
	life_2012=life %>%
	dplyr::filter(Country %in% output2012$Country)%>%
	dplyr::select(Country, Life)
	merge(x=output2012_bypopulationGDP1,y=life_2012, by='Country', all.x=T)%>%
	dplyr::select(Country, Life)
	addition=c('The Bahamas', 'United Kingdom')
	life1_2012=life %>%
	dplyr::filter(Country %in% addition)%>%
	dplyr::select(Country, Life)
	life1_2012$Country=c('Great Britain', 'Bahamas')
	Taibei=cbind(Country=c('Chinese Taipei', 'Montenegro'),
	Life=as.numeric(79.5, 74.65))
	life2_2012=rbind(life_2012,life1_2012,Taibei)
	output2012_bypopulationGDP2=merge(x=output2012_bypopulationGDP1,
	y=life2_2012, by='Country', all.x=T)
	output2012_bypopulationGDP2$Life=as.numeric(output2012_bypopulationGDP2$Life)
	summary(output2012_bypopulationGDP2)

	#kmeans model
	df=dplyr::select(output2012_bypopulationGDP2,Country,Total, Population,GDP, GDPgrowth, Life)
	chart.Correlation(df[,-1], histogram=TRUE, pch=19)
	df$Population=df$Population/1000
	df$GDP=df$GDP/1000
	ddf=scale(df[,-1])
	#d = dist(as.data.frame(scale(df[,-1])))
	#fit.average = hclust(d, method = "complete")
	# par(mfrow = c(1, 1))
	# plot(fit.average, hang = -1, main = "Dendrogram of Average Linkage\n5 Clusters")
	# rect.hclust(fit.average, k = 5)
	set.seed(1234)
	fviz_nbclust(ddf, kmeans, method = "silhouette")
	# nc <- NbClust(ddf, min.nc=2, max.nc=15, method="kmeans")
	# table(nc$Best.n[1,])
	# barplot(table(nc$Best.n[1,]),
	# xlab="Numer of Clusters", ylab="Number of Criteria")
	set.seed(1234)
	fit.km <- kmeans(ddf, 5, nstart=25)
	fviz_cluster(fit.km, data = ddf, geom = "point",
	stand = FALSE, frame.type = "norm")
	# fit.km$size
	# fit.km$centers
	# aggregate(df[,-1], by=list(cluster=fit.km$cluster), mean)

	df$cluster <- as.factor(fit.km$cluster)


	df1=df%>%
	dplyr::filter(cluster==1)%>%
	arrange(desc(Total))

	df2=df%>%
	dplyr::filter(cluster==2)%>%
	arrange(desc(Total))

	df3=df%>%
	dplyr::filter(cluster==3)%>%
	arrange(desc(Total))

	df4=df%>%
	dplyr::filter(cluster==4)%>%
	arrange(desc(Total))

	df5=df%>%
	dplyr::filter(cluster==5)%>%
	arrange(desc(Total))


	df8=rbind(df1, head(df4,5), head(df3, 5),
	head(df2, 5),
	df5)

	g1=ggplot(df8, aes(Total,Population, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()
	g2=ggplot(df8, aes(Total,GDP, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()
	g7=ggplot(df8, aes(Total,GDPgrowth, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()
	g9=ggplot(df8, aes(Total,Life, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()

	multiplot(g1,g2, g7, g9,cols = 2)

	g3=ggplot(df, aes(x=reorder(cluster, Total), y=Total, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()
	g4=ggplot(df, aes(x=reorder(cluster, Population), y=Population, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()
	g5=ggplot(df, aes(x=reorder(cluster, GDP), y=GDP, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()
	g8=ggplot(df, aes(x=reorder(cluster, GDPgrowth), y=GDPgrowth, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()
	g10=ggplot(df, aes(x=reorder(cluster, Life), y=Life, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()
	multiplot(g3,g4,g5,g8,g10, cols=2)

	# 2008
	output=read.csv('output.txt', header = T, stringsAsFactors = F)
	output2008=filter(output, year==2008)

	# population
	population=read.csv('population_both_gender.csv', header = T, stringsAsFactors = F)

	population1_2008=population %>%
	dplyr::filter(Place %in% output2008$Country)%>%
	select(Country=Place, Population=X2008)

	population2_2008=population %>%
	dplyr::filter(Place %in% c('China, Hong Kong SAR',
	'United Kingdom',
	'Iran (Islamic Republic of)',
	'Republic of Moldova',
	"Dem. People's Republic of Korea",
	'Russian Federation',
	'Republic of Korea',
	'United States of America',
	'Venezuela (Bolivarian Republic of)')) %>%
	select(Country=Place, Population=X2008)
	population2_2008$Country=c('Hong Kong','North Korea','South Korea',
	'Iran', 'Moldova',
	'Russia', 'Great Britain',
	'Venezuela', 'United States')
	population_2008=rbind(population1_2008, population2_2008)
	taibei=cbind(Country=c('Chinese Taipei', 'Vietnam'), Population=as.numeric(23040, 85120))
	population_2008=rbind(population_2008, taibei)
	output2008_bypopulation=merge(x=output2008, y=population_2008, by='Country', all.x=T)


	#GDP
	GDP=read.csv('GDP.csv', header = T, stringsAsFactors = F)

	GDP1_2008=GDP %>%
	dplyr::filter(Country.Name %in% output2008$Country)%>%
	select(Country=Country.Name, GDP=X2008)
	addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
	'Iran, Islamic Rep.',
	'Venezuela, RB')
	GDP2_2008=GDP %>%
	dplyr::filter(Country.Name %in% addition)%>%
	select(Country=Country.Name, GDP=X2008)

	GDP2_2008$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
	'Iran', 'Venezuela')
	GDP_2008=rbind(GDP1_2008, GDP2_2008)
	taipei=cbind(Country=c('Chinese Taipei', 'Kyrgyzstan', 'Russia', 'Slovakia', 'South Korea',
	'North Korea'),
	GDP=as.numeric(31900, 966.39, 11699.68, 18558.88, 20474.83, 1800))
	GDP_2008=rbind(GDP_2008, taipei)
	output2008_bypopulationGDP=merge(x=output2008_bypopulation, y=GDP_2008, by='Country', all.x=T)
	output2008_bypopulationGDP['Population']=as.numeric(output2008_bypopulationGDP$Population)
	output2008_bypopulationGDP['GDP']=as.numeric(output2008_bypopulationGDP$GDP)

	#GDP growth
	GDPgrowth=read.csv('GDPgrowth.csv', header = T, stringsAsFactors = F)

	GDPgrowth1_2008=GDPgrowth %>%
	dplyr::filter(Country.Name %in% output2008$Country)%>%
	select(Country=Country.Name, GDPgrowth=X2008)
	addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
	'Iran, Islamic Rep.',
	'Venezuela, RB')
	GDPgrowth2_2008=GDPgrowth %>%
	dplyr::filter(Country.Name %in% addition)%>%
	select(Country=Country.Name, GDPgrowth=X2008)

	GDPgrowth2_2008$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
	'Iran', 'Venezuela')
	GDPgrowth_2008=rbind(GDPgrowth1_2008, GDPgrowth2_2008)
	taipei1=cbind(Country=c('Chinese Taipei', 'Russia', 'Kyrgyzstan','Slovakia', 'South Korea', 'North Korea'),
	GDPgrowth=as.numeric(0.7, 8.4, 5.2, 5.4, 2.8, 3.1))
	GDPgrowth_2008=rbind(GDPgrowth_2008, taipei1)
	output2008_bypopulationGDP1=merge(x=output2008_bypopulationGDP, y=GDPgrowth_2008, by='Country', all.x=T)
	output2008_bypopulationGDP1$GDPgrowth=as.numeric(output2008_bypopulationGDP1$GDPgrowth)

	#life expectency
	life=read.csv('life1.csv', header = T, stringsAsFactors = F)
	trim.leading <- function (x) sub("^\\s+", "", x)
	life$Country=trim.leading(life$Country)
	life_2008=life %>%
	dplyr::filter(Country %in% output2008$Country)%>%
	select(Country, Life)
	merge(x=output2008_bypopulationGDP1,y=life_2008, by='Country', all.x=T)%>%
	select(Country, Life)
	addition=c('United Kingdom')
	life1_2008=life %>%
	dplyr::filter(Country %in% addition)%>%
	select(Country, Life)
	life1_2008$Country=c('Great Britain')
	Taibei=cbind(Country=c('Chinese Taipei', 'Bahamas', 'Netherlands',
	'North Korea', 'South Korea'),
	Life=as.numeric(77.76, 74.25, 80.25, 68.49, 79.83))
	life2_2008=rbind(life_2008,life1_2008,Taibei)
	output2008_bypopulationGDP2=merge(x=output2008_bypopulationGDP1,
	y=life2_2008, by='Country', all.x=T)
	output2008_bypopulationGDP2$Life=as.numeric(output2008_bypopulationGDP2$Life)
	#summary(output2008_bypopulationGDP2)

	df=select(output2008_bypopulationGDP2,Country, Total, Population,GDP, GDPgrowth, Life)
	chart.Correlation(df[,-1], histogram=TRUE, pch=19)

	# kmeans model
	df$Population=df$Population/1000
	df$GDP=df$GDP/1000
	df=df[!df$Country=='Zimbabwe',]
	ddf=scale(df[,-1])
	set.seed(1234)
	fviz_nbclust(ddf, kmeans, method = "silhouette")
	set.seed(1234)
	fit.km <- kmeans(ddf, 5, nstart=25)
	fviz_cluster(fit.km, data = ddf, geom = "point",
	stand = FALSE, frame.type = "norm")
	df$cluster <- as.factor(fit.km$cluster)
	ddf6=merge(x=df, y=select(output2008_bypopulationGDP2[!output2008_bypopulationGDP2$Country=='Zimbabwe',],
	Country, Total), by='Country',
	all = T)
	g3=ggplot(data=ddf6, aes(x=reorder(cluster, Total), y=Total, color=cluster))+
	geom_boxplot()+
	xlab('cluster')+
	ylab('total medals of each country')+
	ggtitle('cluster vs total medals')+
	theme_bw()+
	theme(legend.position='none')

	g4=ggplot(ddf6, aes(x=reorder(cluster, Population), y=Population, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()+
	theme(legend.position='none')
	g5=ggplot(ddf6, aes(x=reorder(cluster, GDP), y=GDP, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()+
	theme(legend.position='none')
	g8=ggplot(ddf6, aes(x=reorder(cluster, GDPgrowth), y=GDPgrowth, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()+
	theme(legend.position='none')
	g10=ggplot(ddf6, aes(x=reorder(cluster, Life), y=Life, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()+
	theme(legend.position='none')
	multiplot(g3, g4,g5,g8,g10, cols=2)

	ddf8=filter(df, Country %in% df8$Country)
	g1=ggplot(ddf8, aes(Total,Population, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()
	g2=ggplot(ddf8, aes(Total,GDP, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()
	g7=ggplot(ddf8, aes(Total,GDPgrowth, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()
	g9=ggplot(ddf8, aes(Total,Life, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()

	multiplot(g1,g2, g7, g9,cols = 2)