Skip to content

Instantly share code, notes, and snippets.

@cflint12
Created December 13, 2014 04:26
Show Gist options
  • Save cflint12/c9b5764c10d9850fc3a3 to your computer and use it in GitHub Desktop.
Save cflint12/c9b5764c10d9850fc3a3 to your computer and use it in GitHub Desktop.
World Bank Data Exploration
#In order for the Shiny app and part of this code to run, you must have installed ggvis and shiny on your computer.
#This will require R Base 3.1.0 or higher.
#If you are using Ubuntu trusty, this may require you to add an additional line to your /etc/apt/sources.list file
#You can do this by following the instructions below:
#First, type the following into your terminal: sudo gedit /etc/apt/sources.list
#
#Then, paste "deb http://cran.wustl.edu/bin/linux/ubuntu trusty/" on a new line at end of current sources.list
# text file in Text Editor.
#
# Save and close sources.list
#
# Don't forget to update in order to use the new repository by typing the following into your terminal
#
# sudo apt-get update
#If R 3.1.2 was not already installed through that process, you may need to uninstall and then reinstall R
#using the terminal. You can do this by typing: sudo apt-get install r-base
#If any of the packages below are not installed, please uncomment these lines to install them.
# install.packages('WDI')
# install.packages('reshape')
# install.packages('RMySQL')
# install.packages('dplyr')
# install.packages('reshape2')
# install.packages('Hmisc')
#If you are running Ubuntu, RMySQL may not have properly installed. You can re-install it
#by typing the following prompt into the terminal: sudo apt-get install r-cran-rmysql
#First set your working directory to the appropriate place
setwd('/home/cflint/Documents/Individual_Project')
#Next load the necessary libraries
library('WDI')
library('ggvis')
library('reshape')
library('RMySQL')
library('dplyr')
library('reshape2')
library('Hmisc')
#Pull economic indicators of interest and include "extra" for classification purposes
WB_DF <- WDI(country = 'all', indicator = c("NY.GDP.MKTP.CD", "NV.AGR.TOTL.ZS", "NV.IND.TOTL.ZS", "NV.IND.MANF.ZS", "NV.SRV.TETC.ZS",
"SP.POP.TOTL", "EN.POP.DNST", "NY.GNP.MKTP.PP.CD", "NY.GNP.PCAP.PP.CD"),
start = 1984, end = 2013, extra = TRUE, cache = NULL)
#Next we will get rid of the variables that are not of interest to us
WB_DF['iso3c'] <- NULL #An extra country code
WB_DF['capital'] <- NULL #The capital city of the country
WB_DF['longitude'] <- NULL #The longitude
WB_DF['latitude'] <- NULL #The latitude
WB_DF['lending'] <- NULL #The lending
#Create more intuitive column names
Columns <- c("Short_Key", "Long_Name", "Year", "GDP", "Agriculture_Pct_of_GDP", "Industry_Pct_of_GDP", "Manufacturing_Pct_of_GDP", "Services_Pct_of_GDP",
"Total_Population", "Population_Density_Per_Sq_km", "GNI_Purchasing_Power", "Purchasing_Power_Per_Capita", "Region", "Income")
#Add the column names to the data frame
colnames(WB_DF) <- Columns
#Simplify the income groupings
WB_DF$Income <- as.character(WB_DF$Income)
WB_DF$Income[WB_DF$Income == 'High income: nonOECD'] <- 'High Income'
WB_DF$Income[WB_DF$Income == 'High income: OECD'] <- 'High Income'
WB_DF$Income <- as.factor(WB_DF$Income)
#Shorten the region names
WB_DF$Region <- as.character(WB_DF$Region)
WB_DF$Region[WB_DF$Region == 'Europe & Central Asia (all income levels)'] <- 'Europe & Central Asia'
WB_DF$Region[WB_DF$Region == 'Middle East & North Africa (all income levels)'] <- 'Middle East & North Africa'
WB_DF$Region[WB_DF$Region == 'Latin America & Caribbean (all income levels)'] <- 'Latin America & Caribbean'
WB_DF$Region[WB_DF$Region == 'Sub-Saharan Africa (all income levels)'] <- 'Sub-Saharan Africa'
WB_DF$Region[WB_DF$Region == 'East Asia & Pacific (all income levels)'] <- 'East Asia & Pacific'
WB_DF$Region <- as.factor(WB_DF$Region)
#Look at some descriptive statistics of the data
describe(WB_DF)
#Unfortunately, it appears that much of the data is missing
#Roughly 40% of the GDP breakdown variables (services, manufacturing, etc.) are missing
#About half of the GNI and income per capita is also missing, which makes it very difficult
#to draw trends from.
#Only keep the regional aggregations in the Region data frame
Region_DF <- subset(WB_DF, WB_DF$Long_Name == 'Europe & Central Asia (all income levels)' |
WB_DF$Long_Name == 'Middle East & North Africa (all income levels)' |
WB_DF$Long_Name == 'South Asia' |
WB_DF$Long_Name == 'Latin America & Caribbean (all income levels)' |
WB_DF$Long_Name == 'Sub-Saharan Africa (all income levels)' |
WB_DF$Long_Name == 'East Asia & Pacific (all income levels)' |
WB_DF$Long_Name == 'North America')
#Shorten the region names
Region_DF$Long_Name <- as.character(Region_DF$Long_Name)
Region_DF$Long_Name[Region_DF$Long_Name == 'Europe & Central Asia (all income levels)'] <- 'Europe & Central Asia'
Region_DF$Long_Name[Region_DF$Long_Name == 'Middle East & North Africa (all income levels)'] <- 'Middle East & North Africa'
Region_DF$Long_Name[Region_DF$Long_Name == 'Latin America & Caribbean (all income levels)'] <- 'Latin America & Caribbean'
Region_DF$Long_Name[Region_DF$Long_Name == 'Sub-Saharan Africa (all income levels)'] <- 'Sub-Saharan Africa'
Region_DF$Long_Name[Region_DF$Long_Name == 'East Asia & Pacific (all income levels)'] <- 'East Asia & Pacific'
Region_DF$Long_Name <- as.factor(Region_DF$Long_Name)
#Make a separate data frame for the income aggregations
Income_DF <- subset(WB_DF, WB_DF$Long_Name == 'High income' |
WB_DF$Long_Name == 'Low income' |
WB_DF$Long_Name == 'Upper middle income' |
WB_DF$Long_Name == 'Lower middle income' |
WB_DF$Long_Name == 'Not classified')
#Separate the country rows from the non-country tables
Country_DF <- subset(WB_DF, WB_DF$Region != 'Aggregates')
#Re-order the dataframe to ensure that it is in alphabetical order, and more importantly,
#to ensure that the years are in chronological order
Country_DF <- Country_DF[order(Country_DF$Long_Name, Country_DF$Year),]
#Reshape the data frame to get a holistic view of how GDP has changed over time for all countries
Country_GDP_DF <- reshape(Country_DF[,c("Long_Name","Year","GDP")], v.names = "GDP", idvar = "Long_Name",
timevar = "Year", direction = "wide")
#Rename the columns
Grouped_Table_Columns <- c("Long_Name", seq(1984,2013))
colnames(Country_GDP_DF) <- Grouped_Table_Columns
#Connect to MySQL and create a database
#Be sure to change to match the credentials for your own database
con <- dbConnect(MySQL(), user="root", password="root",
host="localhost")
dbGetQuery(con, "CREATE DATABASE Individual_Project_2014")
dbGetQuery(con, "USE Individual_Project_2014")
#Make sure there are no existing tables
dbListTables(con)
#Write the tables of interest from the data frames created earlier
dbWriteTable(con, "Country_Table", Country_DF)
dbWriteTable(con, "Income_Table", Income_DF)
dbWriteTable(con, "Region_Table", Region_DF)
dbWriteTable(con, "Country_GDP_Table", Country_GDP_DF)
#Look to make sure tables were written correctly
dbListTables(con)
#Create a new table to track the minimum, average and maximum GDP for each country
dbGetQuery(con, "CREATE TABLE GDP_Summary_Table (Short_Key VARCHAR(50), Long_Name VARCHAR(50),
Avg_GDP BIGINT NOT NULL, Min_GDP BIGINT NOT NULL, Max_GDP BIGINT NOT NULL)")
#Load the table with data
dbGetQuery(con, "INSERT INTO GDP_Summary_Table SELECT Short_Key, Long_Name, Avg(GDP) AS Avg_GDP, Min(GDP) AS Min_GDP, Max(GDP) AS Max_GDP
FROM Country_Table WHERE GDP <> 'NA' GROUP BY Long_Name")
#Inner join the country, income and region tables to build out more information about the groupings each country is placed in.
Final_WB_DF <- dbGetQuery(con, "Select Country_Table.Short_Key, Country_Table.Long_Name, Country_Table.Year,
Country_Table.GDP AS Country_GDP, Country_Table.Total_Population AS Country_Population,
Country_Table.Population_Density_Per_Sq_km AS Country_Pop_Dens, Country_Table.Region,
Country_Table.Income, Income_Table.GDP AS Income_Group_GDP, Income_Table.Total_Population AS Income_Group_Population,
Income_Table.Population_Density_Per_Sq_km AS Income_Group_Pop_Dens, Region_Table.GDP AS Region_GDP,
Region_Table.Total_Population AS Region_Population, Region_Table.Population_Density_Per_Sq_km AS Region_Pop_Dens
FROM Income_Table INNER JOIN Country_Table ON Country_Table.Income=Income_Table.Long_Name
AND Country_Table.Year=Income_Table.Year INNER JOIN Region_Table ON Country_Table.Region = Region_Table.Long_Name
AND Country_Table.Year = Region_Table.Year WHERE Country_Table.GDP <> 'NA'")
#Write the new table to a data frame to export for use within the Shiny App
Final_WB_DF <- Final_WB_DF[order(Final_WB_DF$Long_Name, Final_WB_DF$Year),]
#Write the new data frame as a csv file to place within the app folder.
write.csv(Final_WB_DF, file = 'Final_WB_DF.csv')
#Test out visualizations to explore the data further and find what type of visualization would be best for the app.
Test <- filter(Final_WB_DF, Final_WB_DF$Long_Name == 'Uruguay')
Test %>%
ggvis(x = ~Year, y = ~Country_GDP) %>%
layer_paths(stroke := 'green') %>%
layer_paths(x = ~Year, y = ~Income_Group_GDP, stroke := 'red') %>%
layer_paths(x = ~Year, y = ~Region_GDP, stroke := 'blue') %>%
add_axis("x", title = 'Year') %>%
add_axis("y", title = 'GDP') %>%
set_options(width = 400, height = 200)
#Let's explore the relationship between income group and population density
Income_Pop_Test <- filter(Final_WB_DF, Final_WB_DF$Year == 2013)
boxplot(Income_Pop_Test$Country_Pop_Dens~Income_Pop_Test$Income, outline = FALSE)
#Next, let's take a look at some charts that show the relationship between the type of product
#(services, manufacturing, agriculture, etc.) that countries produce and their overall GDP performance.
Filtered_Country_DF <- filter(Country_DF, is.na(Services_Pct_of_GDP) == FALSE & is.na(GDP) == FALSE)
Filtered_Country_DF <- filter(Country_DF, Income != 'Aggregates')
Filtered_Country_DF <- filter(Country_DF, Income != 'Not Classified')
#Take away a few zeroes, so that it looks nice on the charts
Filtered_Country_DF$GDP <- Filtered_Country_DF$GDP/1000000000000
#We see from these plots that higher-income countries tend to provide more services compared to their counterparts
Filtered_Country_DF %>%
ggvis(x = ~Services_Pct_of_GDP, y = ~GDP) %>%
layer_points(fill = ~Income, stroke = ~Income) %>%
add_axis("x", title = 'Pct Services') %>%
add_axis("y", title = 'Total GDP (Trillions)')
#Taking another look, we see that region has a lot to do with the composition of a country's GDP.
#For example, it is clear that North America focuses on services, as doeer Europe.
Filtered_Country_DF %>%
ggvis(x = ~Services_Pct_of_GDP, y = ~GDP) %>%
layer_points(fill = ~Region, stroke = ~Region) %>%
add_axis("x", title = 'Pct Services') %>%
add_axis("y", title = 'Total GDP (Trillions)')
Filtered_Country_DF %>%
ggvis(x = ~Manufacturing_Pct_of_GDP, y = ~GDP) %>%
layer_points(fill = ~Income, stroke = ~Income) %>%
add_axis("x", title = 'Pct Manufacturing') %>%
add_axis("y", title = 'Total GDP (Trillions)')
Filtered_Country_DF %>%
ggvis(x = ~Agriculture_Pct_of_GDP, y = ~GDP) %>%
layer_points(fill = ~Income, stroke = ~Income) %>%
add_axis("x", title = 'Pct Agriculture') %>%
add_axis("y", title = 'Total GDP (Trillions)')
Filtered_Country_DF %>%
ggvis(x = ~Industry_Pct_of_GDP, y = ~GDP) %>%
layer_points(fill = ~Income, stroke = ~Income) %>%
add_axis("x", title = 'Pct Industry') %>%
add_axis("y", title = 'Total GDP (Trillions)')
#Next, we compare purchasing power per capita with population density per square km
Filtered_Country_DF %>%
ggvis(x = ~Purchasing_Power_Per_Capita, y = ~Population_Density_Per_Sq_km) %>%
layer_points(fill = ~Income, stroke = ~Income) %>%
add_axis("x", title = 'Purchasing Power per Capita') %>%
add_axis("y", title = 'Pop. Density per Sq. km')
#Next, we compare purchasing power per capita with population density per square km
Filtered_Country_DF %>%
ggvis(x = ~Purchasing_Power_Per_Capita, y = ~Population_Density_Per_Sq_km) %>%
layer_points(fill = ~Region, stroke = ~Region) %>%
add_axis("x", title = 'Purchasing Power per Capita') %>%
add_axis("y", title = 'Pop. Density per Sq. km')
Filtered_Country_DF$Income <- factor(Filtered_Country_DF$Income, levels(Filtered_Country_DF$Income)[c(4,3,6,2,1,5)])
boxplot(Filtered_Country_DF$Services_Pct_of_GDP~Filtered_Country_DF$Income, outline = TRUE,
main = 'Services as Percent of GDP', col = 'light blue')
cor(Filtered_Country_DF$Services_Pct_of_GDP, Filtered_Country_DF$GDP)
hist(Filtered_Country_DF$Services_Pct_of_GDP, main = 'Histogram of Services as Pct. of GDP',
col = 'light blue', xlab = 'Services as Pct. of GDP')
Income_Group_Test <- filter(Final_WB_DF, Final_WB_DF$Income == 'High Income')
Test %>%
ggvis(x = ~Year, y = ~Country_GDP) %>%
layer_paths() %>%
layer_paths(x = ~Year, y = ~Income_Group_GDP) %>%
layer_paths(x = ~Year, y = ~Region_GDP) %>%
add_axis("x", title = 'Year') %>%
add_axis("y", title = 'GDP') %>%
set_options(width = 400, height = 200)
dbDisconnect(con)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment