Skip to content

Instantly share code, notes, and snippets.

@yyuan123
Last active September 2, 2015 13:14
Show Gist options
  • Save yyuan123/2da5cc0844756fbf705d to your computer and use it in GitHub Desktop.
Save yyuan123/2da5cc0844756fbf705d to your computer and use it in GitHub Desktop.
Analysis of Potential Indicators and influencers on Poverty among G20 countries (Programming Class Individual Project)
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 08 22:57:38 2014
@author: Ye Yuan
"""
import wbdata
import pandas as pd
import MySQLdb as myDB
import matplotlib.pyplot as plt
# Begin with an overview of available sources
# wbdata.get_source()
# Target on one source to get IDs
# wbdata.get_indicator(source=2)
# wbdata.get_indicator(source=16)
# Search for indicators
# wbdata.search_indicators("Working age")
# Search for G20 countriy codes
# wbdata.get_country()
# Restrict countries and time period and select indicators
from datetime import datetime as dt
ndate = (dt(2012,01,01), dt(2003,01,01))
indicators = {
"SI.POV.5DAY" : "Poverty_Headcount",
"SI.POV.GAP5" : "Poverty_gap",
"SP.POP.DPND" : "Dependency_Ratio",
"SP.RUR.TOTL.ZS" : "Rural_Population",
"SH.XPD.PUBL.ZS" : "Health_Expenditure",
"SE.TER.ENRR" : "Enrollment_Ter",
"SE.SEC.ENRR" : "Enrollment_Sec",
"SE.PRM.ENRR" : "Enrollment_Pri",
"SL.UEM.LTRM.ZS" : "Unemployment",
"SE.XPD.TOTL.GD.ZS" : "Education_Expenditure",
"SH.STA.ACSN" : "Sanitation_Facilities"
}
countries = ["ARG", "AUS", "BRA", "CAN", "CHN", "DEU", "FRA", "IND",
"IDN", "ITA", "JPN", "MEX", "RUS", "SAU", "KOR", "GBR",
"TUR", "USA", "ZAF"]
myWB = wbdata.get_dataframe(indicators, country=countries,
convert_date=False, data_date=ndate)
df = pd.DataFrame(myWB)
table = 'Poverty_Predictions'
mysql_host = "localhost"
mysql_user = "root"
mysql_pass = "jimmyjimmy"
mysql_db = "WBjimmy"
# To save a dataframe to CSV
df.to_csv('C:/Users/Ye Yuan/Desktop' + '/' + table + '.csv')
# To connect MySQL Database
dbConnect = myDB.connect(host = mysql_host,
user = mysql_user,
passwd = mysql_pass,
db = mysql_db)
# To save DataFrame to MySQL database
dfClean = df.where((pd.notnull(df)), None)
dfClean.to_sql(con = dbConnect,
name = table,
if_exists='replace',
flavor='mysql')
df = pd.read_sql("SELECT * from Poverty_Predictions", dbConnect)
# Plotting with Python using matplotlib
indicator1 = {"SI.POV.5DAY":"PHR5"}
indicator2 = {"SI.POV.GAP5":"PG5"}
countries1 = ["ARG", "BRA", "RUS", "TUR"]
#grab indicators above for countires above and load into data frame
df1 = wbdata.get_dataframe(indicator1, country=countries1,
convert_date=False, data_date=ndate)
df2 = wbdata.get_dataframe(indicator2, country=countries1,
convert_date=False, data_date=ndate)
#df is "pivoted", pandas' unstack fucntion helps reshape it into something plottable
dfu1 = df1.unstack(level=0)
dfu2 = df2.unstack(level=0)
# Plot of Poverty headcount ratio at $5 a day
dfu1.plot()
plt.legend(loc='best')
plt.title("Poverty headcount ratio at $5 a day (PPP) (% of population)")
plt.xlabel('Date'); plt.ylabel('Poverty headcount ratio at $5 a day %')
dfu2.plot()
plt.legend(loc='best')
plt.title("Poverty Gap at $5 a day (PPP) (% of population)")
plt.xlabel('Date'); plt.ylabel('Poverty Gap at $5 a day %')
library('DBI')
library('RMySQL')
library('ggplot2')
#Connecting to MySQL Database
con <- dbConnect(MySQL(),
user="root",
password="jimmyjimmy",
dbname="wbjimmy",
host="localhost")
#Show "Poverty_Predictions" dataset and List fields
dbListTables(con)
dbListFields(con, "Poverty_Predictions")
#Import the dataset "Poverty_Predictions"
dataset <- dbReadTable(con, "Poverty_Predictions")
#Export dataset from MySQL database
write.csv(dataset, file = "Poverty_Predictions.csv")
#Import and Load dataset exported
data <- read.table("C:/Users/Ye Yuan/Documents/Poverty_Predictions.csv",header=TRUE, sep=",")
#Subsetting datasets, have tried regression on the subset but no ideal results
subset <- subset(data, (data$country=="Argentina"|
data$country=="Brazil"|
data$country=="Russian Federation"|
data$country=="Turkey"))
# Fitting the model
# Mutiple Regression on Poverty_Headcount Ratio by predictors
# by all predictors
fitTotalAll <- lm(data$Poverty_Headcount ~ data$Dependency_Ratio
+ data$Unemployment + data$Health_Expenditure
+ data$Rural_Population + data$Education_Expenditure
+ data$Sanitation_Facilities + data$Enrollment_Pri
+ data$Enrollment_Sec + data$Enrollment_Ter, data=data)
summary(fitTotalAll)
# by all predictors with private attribute
fitPrivateAll <- lm(data$Poverty_Headcount ~ data$Dependency_Ratio
+ data$Unemployment + data$Enrollment_Pri
+ data$Enrollment_Sec + data$Enrollment_Ter, data=data)
summary(fitPrivateAll)
# by all predictors with public attribute
fitPublicAll <- lm(data$Poverty_Headcount ~ data$Health_Expenditure
+ data$Rural_Population + data$Education_Expenditure
+ data$Sanitation_Facilities, data=data)
summary(fitPublicAll)
#GGPLOTS
d0 <- ggplot(data=data,aes_string(x='country',y='Poverty_Headcount'))
d0 + geom_boxplot()
d1 <- ggplot(data, aes(x=data$Dependency_Ratio, y=data$Poverty_Headcount))
d1 + geom_point(shape=1) + # Use hollow circles
geom_smooth(method=lm) # Add linear regression line
d2 <- ggplot(data, aes(x=data$Health_Expenditure, y=data$Poverty_Headcount))
d2 + geom_point(shape=1) + # Use hollow circles
geom_smooth(method=lm) # Add linear regression line
d3 <- ggplot(data, aes(x=data$Rural_Population, y=data$Poverty_Headcount))
d3 + geom_point(shape=1) + # Use hollow circles
geom_smooth(method=lm) # Add linear regression line
d4 <- ggplot(data, aes(x=data$Enrollment_Pri, y=data$Poverty_Headcount))
d4 + geom_point(shape=1) + # Use hollow circles
geom_smooth(method=lm) # Add linear regression line
d5 <- ggplot(data, aes(x=data$Enrollment_Sec, y=data$Poverty_Headcount))
d5 + geom_point(shape=1) + # Use hollow circles
geom_smooth(method=lm) # Add linear regression line
d6 <- ggplot(data, aes(x=data$Enrollment_Ter, y=data$Poverty_Headcount))
d6 + geom_point(shape=1) + # Use hollow circles
geom_smooth(method=lm) # Add linear regression line
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment