Last active
September 2, 2015 13:14
-
-
Save yyuan123/2da5cc0844756fbf705d to your computer and use it in GitHub Desktop.
Analysis of Potential Indicators and influencers on Poverty among G20 countries (Programming Class Individual Project)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Mon Dec 08 22:57:38 2014 | |
@author: Ye Yuan | |
""" | |
import wbdata | |
import pandas as pd | |
import MySQLdb as myDB | |
import matplotlib.pyplot as plt | |
# Begin with an overview of available sources | |
# wbdata.get_source() | |
# Target on one source to get IDs | |
# wbdata.get_indicator(source=2) | |
# wbdata.get_indicator(source=16) | |
# Search for indicators | |
# wbdata.search_indicators("Working age") | |
# Search for G20 countriy codes | |
# wbdata.get_country() | |
# Restrict countries and time period and select indicators | |
from datetime import datetime as dt | |
ndate = (dt(2012,01,01), dt(2003,01,01)) | |
indicators = { | |
"SI.POV.5DAY" : "Poverty_Headcount", | |
"SI.POV.GAP5" : "Poverty_gap", | |
"SP.POP.DPND" : "Dependency_Ratio", | |
"SP.RUR.TOTL.ZS" : "Rural_Population", | |
"SH.XPD.PUBL.ZS" : "Health_Expenditure", | |
"SE.TER.ENRR" : "Enrollment_Ter", | |
"SE.SEC.ENRR" : "Enrollment_Sec", | |
"SE.PRM.ENRR" : "Enrollment_Pri", | |
"SL.UEM.LTRM.ZS" : "Unemployment", | |
"SE.XPD.TOTL.GD.ZS" : "Education_Expenditure", | |
"SH.STA.ACSN" : "Sanitation_Facilities" | |
} | |
countries = ["ARG", "AUS", "BRA", "CAN", "CHN", "DEU", "FRA", "IND", | |
"IDN", "ITA", "JPN", "MEX", "RUS", "SAU", "KOR", "GBR", | |
"TUR", "USA", "ZAF"] | |
myWB = wbdata.get_dataframe(indicators, country=countries, | |
convert_date=False, data_date=ndate) | |
df = pd.DataFrame(myWB) | |
table = 'Poverty_Predictions' | |
mysql_host = "localhost" | |
mysql_user = "root" | |
mysql_pass = "jimmyjimmy" | |
mysql_db = "WBjimmy" | |
# To save a dataframe to CSV | |
df.to_csv('C:/Users/Ye Yuan/Desktop' + '/' + table + '.csv') | |
# To connect MySQL Database | |
dbConnect = myDB.connect(host = mysql_host, | |
user = mysql_user, | |
passwd = mysql_pass, | |
db = mysql_db) | |
# To save DataFrame to MySQL database | |
dfClean = df.where((pd.notnull(df)), None) | |
dfClean.to_sql(con = dbConnect, | |
name = table, | |
if_exists='replace', | |
flavor='mysql') | |
df = pd.read_sql("SELECT * from Poverty_Predictions", dbConnect) | |
# Plotting with Python using matplotlib | |
indicator1 = {"SI.POV.5DAY":"PHR5"} | |
indicator2 = {"SI.POV.GAP5":"PG5"} | |
countries1 = ["ARG", "BRA", "RUS", "TUR"] | |
#grab indicators above for countires above and load into data frame | |
df1 = wbdata.get_dataframe(indicator1, country=countries1, | |
convert_date=False, data_date=ndate) | |
df2 = wbdata.get_dataframe(indicator2, country=countries1, | |
convert_date=False, data_date=ndate) | |
#df is "pivoted", pandas' unstack fucntion helps reshape it into something plottable | |
dfu1 = df1.unstack(level=0) | |
dfu2 = df2.unstack(level=0) | |
# Plot of Poverty headcount ratio at $5 a day | |
dfu1.plot() | |
plt.legend(loc='best') | |
plt.title("Poverty headcount ratio at $5 a day (PPP) (% of population)") | |
plt.xlabel('Date'); plt.ylabel('Poverty headcount ratio at $5 a day %') | |
dfu2.plot() | |
plt.legend(loc='best') | |
plt.title("Poverty Gap at $5 a day (PPP) (% of population)") | |
plt.xlabel('Date'); plt.ylabel('Poverty Gap at $5 a day %') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library('DBI') | |
library('RMySQL') | |
library('ggplot2') | |
#Connecting to MySQL Database | |
con <- dbConnect(MySQL(), | |
user="root", | |
password="jimmyjimmy", | |
dbname="wbjimmy", | |
host="localhost") | |
#Show "Poverty_Predictions" dataset and List fields | |
dbListTables(con) | |
dbListFields(con, "Poverty_Predictions") | |
#Import the dataset "Poverty_Predictions" | |
dataset <- dbReadTable(con, "Poverty_Predictions") | |
#Export dataset from MySQL database | |
write.csv(dataset, file = "Poverty_Predictions.csv") | |
#Import and Load dataset exported | |
data <- read.table("C:/Users/Ye Yuan/Documents/Poverty_Predictions.csv",header=TRUE, sep=",") | |
#Subsetting datasets, have tried regression on the subset but no ideal results | |
subset <- subset(data, (data$country=="Argentina"| | |
data$country=="Brazil"| | |
data$country=="Russian Federation"| | |
data$country=="Turkey")) | |
# Fitting the model | |
# Mutiple Regression on Poverty_Headcount Ratio by predictors | |
# by all predictors | |
fitTotalAll <- lm(data$Poverty_Headcount ~ data$Dependency_Ratio | |
+ data$Unemployment + data$Health_Expenditure | |
+ data$Rural_Population + data$Education_Expenditure | |
+ data$Sanitation_Facilities + data$Enrollment_Pri | |
+ data$Enrollment_Sec + data$Enrollment_Ter, data=data) | |
summary(fitTotalAll) | |
# by all predictors with private attribute | |
fitPrivateAll <- lm(data$Poverty_Headcount ~ data$Dependency_Ratio | |
+ data$Unemployment + data$Enrollment_Pri | |
+ data$Enrollment_Sec + data$Enrollment_Ter, data=data) | |
summary(fitPrivateAll) | |
# by all predictors with public attribute | |
fitPublicAll <- lm(data$Poverty_Headcount ~ data$Health_Expenditure | |
+ data$Rural_Population + data$Education_Expenditure | |
+ data$Sanitation_Facilities, data=data) | |
summary(fitPublicAll) | |
#GGPLOTS | |
d0 <- ggplot(data=data,aes_string(x='country',y='Poverty_Headcount')) | |
d0 + geom_boxplot() | |
d1 <- ggplot(data, aes(x=data$Dependency_Ratio, y=data$Poverty_Headcount)) | |
d1 + geom_point(shape=1) + # Use hollow circles | |
geom_smooth(method=lm) # Add linear regression line | |
d2 <- ggplot(data, aes(x=data$Health_Expenditure, y=data$Poverty_Headcount)) | |
d2 + geom_point(shape=1) + # Use hollow circles | |
geom_smooth(method=lm) # Add linear regression line | |
d3 <- ggplot(data, aes(x=data$Rural_Population, y=data$Poverty_Headcount)) | |
d3 + geom_point(shape=1) + # Use hollow circles | |
geom_smooth(method=lm) # Add linear regression line | |
d4 <- ggplot(data, aes(x=data$Enrollment_Pri, y=data$Poverty_Headcount)) | |
d4 + geom_point(shape=1) + # Use hollow circles | |
geom_smooth(method=lm) # Add linear regression line | |
d5 <- ggplot(data, aes(x=data$Enrollment_Sec, y=data$Poverty_Headcount)) | |
d5 + geom_point(shape=1) + # Use hollow circles | |
geom_smooth(method=lm) # Add linear regression line | |
d6 <- ggplot(data, aes(x=data$Enrollment_Ter, y=data$Poverty_Headcount)) | |
d6 + geom_point(shape=1) + # Use hollow circles | |
geom_smooth(method=lm) # Add linear regression line |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment