Created
December 13, 2014 04:26
-
-
Save cflint12/c9b5764c10d9850fc3a3 to your computer and use it in GitHub Desktop.
World Bank Data Exploration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#In order for the Shiny app and part of this code to run, you must have installed ggvis and shiny on your computer. | |
#This will require R Base 3.1.0 or higher. | |
#If you are using Ubuntu trusty, this may require you to add an additional line to your /etc/apt/sources.list file | |
#You can do this by following the instructions below: | |
#First, type the following into your terminal: sudo gedit /etc/apt/sources.list | |
# | |
#Then, paste "deb http://cran.wustl.edu/bin/linux/ubuntu trusty/" on a new line at end of current sources.list | |
# text file in Text Editor. | |
# | |
# Save and close sources.list | |
# | |
# Don't forget to update in order to use the new repository by typing the following into your terminal | |
# | |
# sudo apt-get update | |
#If R 3.1.2 was not already installed through that process, you may need to uninstall and then reinstall R | |
#using the terminal. You can do this by typing: sudo apt-get install r-base | |
#If any of the packages below are not installed, please uncomment these lines to install them. | |
# install.packages('WDI') | |
# install.packages('reshape') | |
# install.packages('RMySQL') | |
# install.packages('dplyr') | |
# install.packages('reshape2') | |
# install.packages('Hmisc') | |
#If you are running Ubuntu, RMySQL may not have properly installed. You can re-install it | |
#by typing the following prompt into the terminal: sudo apt-get install r-cran-rmysql | |
#First set your working directory to the appropriate place | |
setwd('/home/cflint/Documents/Individual_Project') | |
#Next load the necessary libraries | |
library('WDI') | |
library('ggvis') | |
library('reshape') | |
library('RMySQL') | |
library('dplyr') | |
library('reshape2') | |
library('Hmisc') | |
#Pull economic indicators of interest and include "extra" for classification purposes | |
WB_DF <- WDI(country = 'all', indicator = c("NY.GDP.MKTP.CD", "NV.AGR.TOTL.ZS", "NV.IND.TOTL.ZS", "NV.IND.MANF.ZS", "NV.SRV.TETC.ZS", | |
"SP.POP.TOTL", "EN.POP.DNST", "NY.GNP.MKTP.PP.CD", "NY.GNP.PCAP.PP.CD"), | |
start = 1984, end = 2013, extra = TRUE, cache = NULL) | |
#Next we will get rid of the variables that are not of interest to us | |
WB_DF['iso3c'] <- NULL #An extra country code | |
WB_DF['capital'] <- NULL #The capital city of the country | |
WB_DF['longitude'] <- NULL #The longitude | |
WB_DF['latitude'] <- NULL #The latitude | |
WB_DF['lending'] <- NULL #The lending | |
#Create more intuitive column names | |
Columns <- c("Short_Key", "Long_Name", "Year", "GDP", "Agriculture_Pct_of_GDP", "Industry_Pct_of_GDP", "Manufacturing_Pct_of_GDP", "Services_Pct_of_GDP", | |
"Total_Population", "Population_Density_Per_Sq_km", "GNI_Purchasing_Power", "Purchasing_Power_Per_Capita", "Region", "Income") | |
#Add the column names to the data frame | |
colnames(WB_DF) <- Columns | |
#Simplify the income groupings | |
WB_DF$Income <- as.character(WB_DF$Income) | |
WB_DF$Income[WB_DF$Income == 'High income: nonOECD'] <- 'High Income' | |
WB_DF$Income[WB_DF$Income == 'High income: OECD'] <- 'High Income' | |
WB_DF$Income <- as.factor(WB_DF$Income) | |
#Shorten the region names | |
WB_DF$Region <- as.character(WB_DF$Region) | |
WB_DF$Region[WB_DF$Region == 'Europe & Central Asia (all income levels)'] <- 'Europe & Central Asia' | |
WB_DF$Region[WB_DF$Region == 'Middle East & North Africa (all income levels)'] <- 'Middle East & North Africa' | |
WB_DF$Region[WB_DF$Region == 'Latin America & Caribbean (all income levels)'] <- 'Latin America & Caribbean' | |
WB_DF$Region[WB_DF$Region == 'Sub-Saharan Africa (all income levels)'] <- 'Sub-Saharan Africa' | |
WB_DF$Region[WB_DF$Region == 'East Asia & Pacific (all income levels)'] <- 'East Asia & Pacific' | |
WB_DF$Region <- as.factor(WB_DF$Region) | |
#Look at some descriptive statistics of the data | |
describe(WB_DF) | |
#Unfortunately, it appears that much of the data is missing | |
#Roughly 40% of the GDP breakdown variables (services, manufacturing, etc.) are missing | |
#About half of the GNI and income per capita is also missing, which makes it very difficult | |
#to draw trends from. | |
#Only keep the regional aggregations in the Region data frame | |
Region_DF <- subset(WB_DF, WB_DF$Long_Name == 'Europe & Central Asia (all income levels)' | | |
WB_DF$Long_Name == 'Middle East & North Africa (all income levels)' | | |
WB_DF$Long_Name == 'South Asia' | | |
WB_DF$Long_Name == 'Latin America & Caribbean (all income levels)' | | |
WB_DF$Long_Name == 'Sub-Saharan Africa (all income levels)' | | |
WB_DF$Long_Name == 'East Asia & Pacific (all income levels)' | | |
WB_DF$Long_Name == 'North America') | |
#Shorten the region names | |
Region_DF$Long_Name <- as.character(Region_DF$Long_Name) | |
Region_DF$Long_Name[Region_DF$Long_Name == 'Europe & Central Asia (all income levels)'] <- 'Europe & Central Asia' | |
Region_DF$Long_Name[Region_DF$Long_Name == 'Middle East & North Africa (all income levels)'] <- 'Middle East & North Africa' | |
Region_DF$Long_Name[Region_DF$Long_Name == 'Latin America & Caribbean (all income levels)'] <- 'Latin America & Caribbean' | |
Region_DF$Long_Name[Region_DF$Long_Name == 'Sub-Saharan Africa (all income levels)'] <- 'Sub-Saharan Africa' | |
Region_DF$Long_Name[Region_DF$Long_Name == 'East Asia & Pacific (all income levels)'] <- 'East Asia & Pacific' | |
Region_DF$Long_Name <- as.factor(Region_DF$Long_Name) | |
#Make a separate data frame for the income aggregations | |
Income_DF <- subset(WB_DF, WB_DF$Long_Name == 'High income' | | |
WB_DF$Long_Name == 'Low income' | | |
WB_DF$Long_Name == 'Upper middle income' | | |
WB_DF$Long_Name == 'Lower middle income' | | |
WB_DF$Long_Name == 'Not classified') | |
#Separate the country rows from the non-country tables | |
Country_DF <- subset(WB_DF, WB_DF$Region != 'Aggregates') | |
#Re-order the dataframe to ensure that it is in alphabetical order, and more importantly, | |
#to ensure that the years are in chronological order | |
Country_DF <- Country_DF[order(Country_DF$Long_Name, Country_DF$Year),] | |
#Reshape the data frame to get a holistic view of how GDP has changed over time for all countries | |
Country_GDP_DF <- reshape(Country_DF[,c("Long_Name","Year","GDP")], v.names = "GDP", idvar = "Long_Name", | |
timevar = "Year", direction = "wide") | |
#Rename the columns | |
Grouped_Table_Columns <- c("Long_Name", seq(1984,2013)) | |
colnames(Country_GDP_DF) <- Grouped_Table_Columns | |
#Connect to MySQL and create a database | |
#Be sure to change to match the credentials for your own database | |
con <- dbConnect(MySQL(), user="root", password="root", | |
host="localhost") | |
dbGetQuery(con, "CREATE DATABASE Individual_Project_2014") | |
dbGetQuery(con, "USE Individual_Project_2014") | |
#Make sure there are no existing tables | |
dbListTables(con) | |
#Write the tables of interest from the data frames created earlier | |
dbWriteTable(con, "Country_Table", Country_DF) | |
dbWriteTable(con, "Income_Table", Income_DF) | |
dbWriteTable(con, "Region_Table", Region_DF) | |
dbWriteTable(con, "Country_GDP_Table", Country_GDP_DF) | |
#Look to make sure tables were written correctly | |
dbListTables(con) | |
#Create a new table to track the minimum, average and maximum GDP for each country | |
dbGetQuery(con, "CREATE TABLE GDP_Summary_Table (Short_Key VARCHAR(50), Long_Name VARCHAR(50), | |
Avg_GDP BIGINT NOT NULL, Min_GDP BIGINT NOT NULL, Max_GDP BIGINT NOT NULL)") | |
#Load the table with data | |
dbGetQuery(con, "INSERT INTO GDP_Summary_Table SELECT Short_Key, Long_Name, Avg(GDP) AS Avg_GDP, Min(GDP) AS Min_GDP, Max(GDP) AS Max_GDP | |
FROM Country_Table WHERE GDP <> 'NA' GROUP BY Long_Name") | |
#Inner join the country, income and region tables to build out more information about the groupings each country is placed in. | |
Final_WB_DF <- dbGetQuery(con, "Select Country_Table.Short_Key, Country_Table.Long_Name, Country_Table.Year, | |
Country_Table.GDP AS Country_GDP, Country_Table.Total_Population AS Country_Population, | |
Country_Table.Population_Density_Per_Sq_km AS Country_Pop_Dens, Country_Table.Region, | |
Country_Table.Income, Income_Table.GDP AS Income_Group_GDP, Income_Table.Total_Population AS Income_Group_Population, | |
Income_Table.Population_Density_Per_Sq_km AS Income_Group_Pop_Dens, Region_Table.GDP AS Region_GDP, | |
Region_Table.Total_Population AS Region_Population, Region_Table.Population_Density_Per_Sq_km AS Region_Pop_Dens | |
FROM Income_Table INNER JOIN Country_Table ON Country_Table.Income=Income_Table.Long_Name | |
AND Country_Table.Year=Income_Table.Year INNER JOIN Region_Table ON Country_Table.Region = Region_Table.Long_Name | |
AND Country_Table.Year = Region_Table.Year WHERE Country_Table.GDP <> 'NA'") | |
#Write the new table to a data frame to export for use within the Shiny App | |
Final_WB_DF <- Final_WB_DF[order(Final_WB_DF$Long_Name, Final_WB_DF$Year),] | |
#Write the new data frame as a csv file to place within the app folder. | |
write.csv(Final_WB_DF, file = 'Final_WB_DF.csv') | |
#Test out visualizations to explore the data further and find what type of visualization would be best for the app. | |
Test <- filter(Final_WB_DF, Final_WB_DF$Long_Name == 'Uruguay') | |
Test %>% | |
ggvis(x = ~Year, y = ~Country_GDP) %>% | |
layer_paths(stroke := 'green') %>% | |
layer_paths(x = ~Year, y = ~Income_Group_GDP, stroke := 'red') %>% | |
layer_paths(x = ~Year, y = ~Region_GDP, stroke := 'blue') %>% | |
add_axis("x", title = 'Year') %>% | |
add_axis("y", title = 'GDP') %>% | |
set_options(width = 400, height = 200) | |
#Let's explore the relationship between income group and population density | |
Income_Pop_Test <- filter(Final_WB_DF, Final_WB_DF$Year == 2013) | |
boxplot(Income_Pop_Test$Country_Pop_Dens~Income_Pop_Test$Income, outline = FALSE) | |
#Next, let's take a look at some charts that show the relationship between the type of product | |
#(services, manufacturing, agriculture, etc.) that countries produce and their overall GDP performance. | |
Filtered_Country_DF <- filter(Country_DF, is.na(Services_Pct_of_GDP) == FALSE & is.na(GDP) == FALSE) | |
Filtered_Country_DF <- filter(Country_DF, Income != 'Aggregates') | |
Filtered_Country_DF <- filter(Country_DF, Income != 'Not Classified') | |
#Take away a few zeroes, so that it looks nice on the charts | |
Filtered_Country_DF$GDP <- Filtered_Country_DF$GDP/1000000000000 | |
#We see from these plots that higher-income countries tend to provide more services compared to their counterparts | |
Filtered_Country_DF %>% | |
ggvis(x = ~Services_Pct_of_GDP, y = ~GDP) %>% | |
layer_points(fill = ~Income, stroke = ~Income) %>% | |
add_axis("x", title = 'Pct Services') %>% | |
add_axis("y", title = 'Total GDP (Trillions)') | |
#Taking another look, we see that region has a lot to do with the composition of a country's GDP. | |
#For example, it is clear that North America focuses on services, as doeer Europe. | |
Filtered_Country_DF %>% | |
ggvis(x = ~Services_Pct_of_GDP, y = ~GDP) %>% | |
layer_points(fill = ~Region, stroke = ~Region) %>% | |
add_axis("x", title = 'Pct Services') %>% | |
add_axis("y", title = 'Total GDP (Trillions)') | |
Filtered_Country_DF %>% | |
ggvis(x = ~Manufacturing_Pct_of_GDP, y = ~GDP) %>% | |
layer_points(fill = ~Income, stroke = ~Income) %>% | |
add_axis("x", title = 'Pct Manufacturing') %>% | |
add_axis("y", title = 'Total GDP (Trillions)') | |
Filtered_Country_DF %>% | |
ggvis(x = ~Agriculture_Pct_of_GDP, y = ~GDP) %>% | |
layer_points(fill = ~Income, stroke = ~Income) %>% | |
add_axis("x", title = 'Pct Agriculture') %>% | |
add_axis("y", title = 'Total GDP (Trillions)') | |
Filtered_Country_DF %>% | |
ggvis(x = ~Industry_Pct_of_GDP, y = ~GDP) %>% | |
layer_points(fill = ~Income, stroke = ~Income) %>% | |
add_axis("x", title = 'Pct Industry') %>% | |
add_axis("y", title = 'Total GDP (Trillions)') | |
#Next, we compare purchasing power per capita with population density per square km | |
Filtered_Country_DF %>% | |
ggvis(x = ~Purchasing_Power_Per_Capita, y = ~Population_Density_Per_Sq_km) %>% | |
layer_points(fill = ~Income, stroke = ~Income) %>% | |
add_axis("x", title = 'Purchasing Power per Capita') %>% | |
add_axis("y", title = 'Pop. Density per Sq. km') | |
#Next, we compare purchasing power per capita with population density per square km | |
Filtered_Country_DF %>% | |
ggvis(x = ~Purchasing_Power_Per_Capita, y = ~Population_Density_Per_Sq_km) %>% | |
layer_points(fill = ~Region, stroke = ~Region) %>% | |
add_axis("x", title = 'Purchasing Power per Capita') %>% | |
add_axis("y", title = 'Pop. Density per Sq. km') | |
Filtered_Country_DF$Income <- factor(Filtered_Country_DF$Income, levels(Filtered_Country_DF$Income)[c(4,3,6,2,1,5)]) | |
boxplot(Filtered_Country_DF$Services_Pct_of_GDP~Filtered_Country_DF$Income, outline = TRUE, | |
main = 'Services as Percent of GDP', col = 'light blue') | |
cor(Filtered_Country_DF$Services_Pct_of_GDP, Filtered_Country_DF$GDP) | |
hist(Filtered_Country_DF$Services_Pct_of_GDP, main = 'Histogram of Services as Pct. of GDP', | |
col = 'light blue', xlab = 'Services as Pct. of GDP') | |
Income_Group_Test <- filter(Final_WB_DF, Final_WB_DF$Income == 'High Income') | |
Test %>% | |
ggvis(x = ~Year, y = ~Country_GDP) %>% | |
layer_paths() %>% | |
layer_paths(x = ~Year, y = ~Income_Group_GDP) %>% | |
layer_paths(x = ~Year, y = ~Region_GDP) %>% | |
add_axis("x", title = 'Year') %>% | |
add_axis("y", title = 'GDP') %>% | |
set_options(width = 400, height = 200) | |
dbDisconnect(con) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment