Last active
August 29, 2015 14:11
-
-
Save bbzzzz/43e269102fe96d1e98db to your computer and use it in GitHub Desktop.
This work extracted open source data from World Bank focusing on the study about relationship between fertility rate, public expenditure on education and GDP. Data is stored in MySQL database for the convenience of inter-working between Python and R. Visualization is done with ggplot2 package in R.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#### Author: Bohan Zhang | The Business Analytics Program of the George Washington University | |
#### R part | |
library("RMySQL") | |
library("ggplot2") | |
con <- dbConnect(MySQL(), user="root", password="", | |
dbname="DB", host="localhost") | |
mydata <- dbReadTable(con, "mydata") | |
# Bubble Chart for Expenditure on Education, Fertility Rate and GDP per Capita | |
ggplot(mydata, aes(x=Expenditure_on_Education, y=Fertility_Rate, size=GDP, label=Country))+ | |
geom_point(colour="white", fill="red", shape=21)+ | |
scale_size_area(breaks=c(250, 500, 1000, 10000, 40000), "GDP per capita\n(constant 2005 US$)",max_size = 25)+ | |
stat_smooth(method="lm", size=0.5, colour="black", alpha=0.4, level=0.95)+ | |
scale_x_continuous(name="Public spending on education, total (% of GDP)", limits=c(0,8))+ | |
scale_y_continuous(name="Fertility rate, total (births per woman)", limits=c(1,7))+ | |
geom_text(size=4)+ | |
ggtitle("Fertility Rate v.s. Expenditure on Education") + | |
theme_bw() | |
# Bar Chart for Fertility rate and Region, sorted by Fertility rate | |
ggplot(data=mydata, aes(reorder(factor(Country),-Fertility_Rate),y=Fertility_Rate, fill=Region)) + geom_bar(stat="identity")+ | |
scale_x_discrete(name='Country') + ggtitle("Fertility Rate by Region") + | |
theme(title = element_text(size=rel(1.2)), axis.text.x = element_text(size = rel(1.8), angle = 90, hjust = 1)) | |
# Bar chart for Fertility rate and income level, sorted by Fertility rate | |
ggplot(data=mydata, aes(reorder(factor(Country),-Fertility_Rate),y=Fertility_Rate, fill=Income_Level)) + geom_bar(stat="identity")+ | |
scale_x_discrete(name='Country') + ggtitle("Fertility Rate by Income Level") + | |
theme(title = element_text(size = rel(1.2)), axis.text.x = element_text(size = rel(1.8), angle = 90, hjust = 1)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
#### Author: Bohan Zhang | The Business Analytics Program of the George Washington University | |
#### Python Part | |
import wbdata | |
import pandas as pd | |
import datetime | |
import MySQLdb as myDB | |
#### test if data for certain indicator, country and year is available | |
def test_year(ind, ctry, year): | |
import wbdata | |
data = [] | |
#### limit time frame to one year as the study only need one year data | |
data_date = (datetime.datetime(year, 1, 1), datetime.datetime(year, 1, 1)) | |
data = wbdata.get_data(ind, ctry, data_date) | |
for c in data: | |
if c['value'] == None: | |
print 'Country:', c['country']['value'] | |
print 'Data:', c['indicator']['value'] | |
print 'Year:', c['date'] | |
print 'Staus:', 'NOT available' | |
else: | |
print 'Country:', c['country']['value'] | |
print 'Data:', c['indicator']['value'] | |
print 'Year:', c['date'] | |
print 'Staus:', 'Available' | |
#### extract data from returned dictionary | |
def get_value(ind, ctry, year): | |
raw_data = [] | |
data_date = (datetime.datetime(year, 1, 1), datetime.datetime(year, 1, 1)) | |
raw_data = wbdata.get_data(ind, country = ctry, data_date=data_date) | |
data = [] | |
for country in raw_data: | |
data.append(country['value']) | |
return data | |
#### extract country names from returned dictionary | |
def get_countryList(ind, ctry, year): | |
raw_data = [] | |
data_date = (datetime.datetime(year, 1, 1), datetime.datetime(year, 1, 1)) | |
raw_data = wbdata.get_data(ind, country = ctry, data_date=data_date) | |
data = [] | |
for country in raw_data: | |
data.append(country['country']['value']) | |
return data | |
countries=["AGO","BEN","GMB","ZMB","SAU","IND","EGY","IRN","USA", | |
"GBR","FRA","MLI","AUS","COL","NPL","BTN","ALB","TGO", | |
"SEN","TZA","NZL","PER","LBN","PAK","PHL","IDN","THA", | |
"OMN","ISR","BRA","CHL","GIN","SLE"] | |
#### convert string type to float type for numeric data | |
GDP = [float(x) for x in get_value("NY.GDP.PCAP.KD",countries,2005)] | |
Exp_Edu = [float(x) for x in get_value("SE.XPD.TOTL.GD.ZS",countries,2005)] | |
Fer_Rate = [float(x) for x in get_value("SP.DYN.TFRT.IN",countries,2005)] | |
Country = get_countryList("NY.GDP.PCAP.KD",countries,2005) | |
#### get region information for given country | |
def get_region(countries): | |
regions = [] | |
all = wbdata.get_country(countries) | |
for country in all: | |
region = country['region']['value'] | |
#### cut residual information | |
flag = region.find('(') | |
if flag>0: | |
region = region[0:flag-1] | |
regions.append(region) | |
return regions | |
#### get income level information for given country | |
def get_incomeLevel(countries): | |
incomes = [] | |
all = wbdata.get_country(countries) | |
for country in all: | |
income = country['incomeLevel']['value'] | |
#### cut residual information | |
flag = income.find(':') | |
if flag>0: | |
income = income[0:flag] | |
incomes.append(income) | |
return incomes | |
Region = get_region(countries) | |
Income_Level = get_incomeLevel(countries) | |
#### create a dictionary for data frame | |
myDic = { 'Country': Country, | |
'GDP': GDP, | |
'Expenditure on Education': Exp_Edu, | |
'Fertility Rate': Fer_Rate, | |
'Income Level': Income_Level, | |
'Region': Region | |
} | |
#### convert dictionary to data frame | |
myDF = pd.DataFrame(myDic) | |
#### create connection to MySQL | |
conn = myDB.connect('localhost','root') | |
cursor = conn.cursor() | |
sql = ' SHOW DATABASES; ' | |
cursor.execute(sql) | |
#### test if the database name is used before, drop if used | |
sql = ' DROP DATABASE IF EXISTS DB; ' | |
cursor.execute(sql) | |
sql = ' CREATE DATABASE DB; ' | |
cursor.execute(sql) | |
#### create connection to the database created in last step | |
mydb = myDB.connect(host='localhost', user='root',passwd='', db='DB') | |
#### upload data frame to MySQL | |
myDF.to_sql(con = mydb, name = 'mydata', if_exists = 'replace', flavor = 'mysql') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment