bbzzzz/Working with World Bank API

## ggplot of World Bank Data
#### Author: Bohan Zhang | The Business Analytics Program of the George Washington University
#### R part

library("RMySQL")
library("ggplot2")

con <- dbConnect(MySQL(), user="root", password="",
                  dbname="DB", host="localhost")

mydata <- dbReadTable(con, "mydata")

# Bubble Chart for Expenditure on Education, Fertility Rate and GDP per Capita
ggplot(mydata, aes(x=Expenditure_on_Education, y=Fertility_Rate, size=GDP, label=Country))+
  geom_point(colour="white", fill="red", shape=21)+
  scale_size_area(breaks=c(250, 500, 1000, 10000, 40000), "GDP per capita\n(constant 2005 US$)",max_size = 25)+
  stat_smooth(method="lm", size=0.5, colour="black", alpha=0.4, level=0.95)+
  scale_x_continuous(name="Public spending on education, total (% of GDP)", limits=c(0,8))+
  scale_y_continuous(name="Fertility rate, total (births per woman)", limits=c(1,7))+
  geom_text(size=4)+
  ggtitle("Fertility Rate v.s. Expenditure on Education") +
  theme_bw()

# Bar Chart for Fertility rate and Region, sorted by Fertility rate
ggplot(data=mydata, aes(reorder(factor(Country),-Fertility_Rate),y=Fertility_Rate, fill=Region)) + geom_bar(stat="identity")+
  scale_x_discrete(name='Country') + ggtitle("Fertility Rate by Region") +
  theme(title = element_text(size=rel(1.2)), axis.text.x = element_text(size = rel(1.8), angle = 90, hjust = 1))

# Bar chart for Fertility rate and income level, sorted by Fertility rate
ggplot(data=mydata, aes(reorder(factor(Country),-Fertility_Rate),y=Fertility_Rate, fill=Income_Level)) + geom_bar(stat="identity")+
  scale_x_discrete(name='Country') + ggtitle("Fertility Rate by Income Level") +
  theme(title = element_text(size = rel(1.2)), axis.text.x = element_text(size = rel(1.8), angle = 90, hjust = 1))

## Working with World Bank API
# -*- coding: utf-8 -*-
#### Author: Bohan Zhang | The Business Analytics Program of the George Washington University
#### Python Part

import wbdata
import pandas as pd
import datetime
import MySQLdb as myDB

#### test if data for certain indicator, country and year is available
def test_year(ind, ctry, year):
    import wbdata
    data = []
    #### limit time frame to one year as the study only need one year data
    data_date = (datetime.datetime(year, 1, 1), datetime.datetime(year, 1, 1))
    data = wbdata.get_data(ind, ctry, data_date)
    for c in data:
        if c['value'] == None:
            print 'Country:', c['country']['value']
            print 'Data:', c['indicator']['value']
            print 'Year:', c['date']
            print 'Staus:', 'NOT available'
        else:
            print 'Country:', c['country']['value']
            print 'Data:', c['indicator']['value']
            print 'Year:', c['date']
            print 'Staus:', 'Available'

#### extract data from returned dictionary
def get_value(ind, ctry, year):
    raw_data = []
    data_date = (datetime.datetime(year, 1, 1), datetime.datetime(year, 1, 1))
    raw_data = wbdata.get_data(ind, country = ctry, data_date=data_date)

    data = []
    for country in raw_data:
        data.append(country['value'])
    return data

#### extract country names from returned dictionary
def get_countryList(ind, ctry, year):
    raw_data = []
    data_date = (datetime.datetime(year, 1, 1), datetime.datetime(year, 1, 1))
    raw_data = wbdata.get_data(ind, country = ctry, data_date=data_date)

    data = []
    for country in raw_data:
        data.append(country['country']['value'])
    return data

countries=["AGO","BEN","GMB","ZMB","SAU","IND","EGY","IRN","USA",
           "GBR","FRA","MLI","AUS","COL","NPL","BTN","ALB","TGO",
           "SEN","TZA","NZL","PER","LBN","PAK","PHL","IDN","THA",
           "OMN","ISR","BRA","CHL","GIN","SLE"]

#### convert string type to float type for numeric data
GDP = [float(x) for x in get_value("NY.GDP.PCAP.KD",countries,2005)]
Exp_Edu = [float(x) for x in get_value("SE.XPD.TOTL.GD.ZS",countries,2005)]
Fer_Rate = [float(x) for x in get_value("SP.DYN.TFRT.IN",countries,2005)]
Country = get_countryList("NY.GDP.PCAP.KD",countries,2005)

#### get region information for given country
def get_region(countries):
    regions = []
    all = wbdata.get_country(countries)
    for country in all:
        region = country['region']['value']
        #### cut residual information
        flag = region.find('(')
        if flag>0:
            region = region[0:flag-1]
        regions.append(region)
    return regions

#### get income level information for given country
def get_incomeLevel(countries):
    incomes = []
    all = wbdata.get_country(countries)
    for country in all:
        income = country['incomeLevel']['value']
        #### cut residual information
        flag = income.find(':')
        if flag>0:
            income = income[0:flag]
        incomes.append(income)
    return incomes

Region = get_region(countries)
Income_Level = get_incomeLevel(countries)

#### create a dictionary for data frame
myDic = { 'Country':  Country,
          'GDP':      GDP,
          'Expenditure on Education': Exp_Edu,
          'Fertility Rate': Fer_Rate,
          'Income Level': Income_Level,
          'Region': Region
        }

#### convert dictionary to data frame
myDF = pd.DataFrame(myDic)

#### create connection to MySQL
conn = myDB.connect('localhost','root')
cursor = conn.cursor()

sql = ' SHOW DATABASES; '
cursor.execute(sql)

#### test if the database name is used before, drop if used
sql = ' DROP DATABASE IF EXISTS DB; '
cursor.execute(sql)

sql = ' CREATE DATABASE DB; '
cursor.execute(sql)

#### create connection to the database created in last step
mydb = myDB.connect(host='localhost', user='root',passwd='', db='DB')

#### upload data frame to MySQL
myDF.to_sql(con = mydb, name = 'mydata', if_exists = 'replace', flavor = 'mysql')
	#### Author: Bohan Zhang \| The Business Analytics Program of the George Washington University
	#### R part

	library("RMySQL")
	library("ggplot2")

	con <- dbConnect(MySQL(), user="root", password="",
	dbname="DB", host="localhost")

	mydata <- dbReadTable(con, "mydata")

	# Bubble Chart for Expenditure on Education, Fertility Rate and GDP per Capita
	ggplot(mydata, aes(x=Expenditure_on_Education, y=Fertility_Rate, size=GDP, label=Country))+
	geom_point(colour="white", fill="red", shape=21)+
	scale_size_area(breaks=c(250, 500, 1000, 10000, 40000), "GDP per capita\n(constant 2005 US$)",max_size = 25)+
	stat_smooth(method="lm", size=0.5, colour="black", alpha=0.4, level=0.95)+
	scale_x_continuous(name="Public spending on education, total (% of GDP)", limits=c(0,8))+
	scale_y_continuous(name="Fertility rate, total (births per woman)", limits=c(1,7))+
	geom_text(size=4)+
	ggtitle("Fertility Rate v.s. Expenditure on Education") +
	theme_bw()

	# Bar Chart for Fertility rate and Region, sorted by Fertility rate
	ggplot(data=mydata, aes(reorder(factor(Country),-Fertility_Rate),y=Fertility_Rate, fill=Region)) + geom_bar(stat="identity")+
	scale_x_discrete(name='Country') + ggtitle("Fertility Rate by Region") +
	theme(title = element_text(size=rel(1.2)), axis.text.x = element_text(size = rel(1.8), angle = 90, hjust = 1))

	# Bar chart for Fertility rate and income level, sorted by Fertility rate
	ggplot(data=mydata, aes(reorder(factor(Country),-Fertility_Rate),y=Fertility_Rate, fill=Income_Level)) + geom_bar(stat="identity")+
	scale_x_discrete(name='Country') + ggtitle("Fertility Rate by Income Level") +
	theme(title = element_text(size = rel(1.2)), axis.text.x = element_text(size = rel(1.8), angle = 90, hjust = 1))
	# -- coding: utf-8 --
	#### Author: Bohan Zhang \| The Business Analytics Program of the George Washington University
	#### Python Part

	import wbdata
	import pandas as pd
	import datetime
	import MySQLdb as myDB

	#### test if data for certain indicator, country and year is available
	def test_year(ind, ctry, year):
	import wbdata
	data = []
	#### limit time frame to one year as the study only need one year data
	data_date = (datetime.datetime(year, 1, 1), datetime.datetime(year, 1, 1))
	data = wbdata.get_data(ind, ctry, data_date)
	for c in data:
	if c['value'] == None:
	print 'Country:', c['country']['value']
	print 'Data:', c['indicator']['value']
	print 'Year:', c['date']
	print 'Staus:', 'NOT available'
	else:
	print 'Country:', c['country']['value']
	print 'Data:', c['indicator']['value']
	print 'Year:', c['date']
	print 'Staus:', 'Available'

	#### extract data from returned dictionary
	def get_value(ind, ctry, year):
	raw_data = []
	data_date = (datetime.datetime(year, 1, 1), datetime.datetime(year, 1, 1))
	raw_data = wbdata.get_data(ind, country = ctry, data_date=data_date)

	data = []
	for country in raw_data:
	data.append(country['value'])
	return data

	#### extract country names from returned dictionary
	def get_countryList(ind, ctry, year):
	raw_data = []
	data_date = (datetime.datetime(year, 1, 1), datetime.datetime(year, 1, 1))
	raw_data = wbdata.get_data(ind, country = ctry, data_date=data_date)

	data = []
	for country in raw_data:
	data.append(country['country']['value'])
	return data

	countries=["AGO","BEN","GMB","ZMB","SAU","IND","EGY","IRN","USA",
	"GBR","FRA","MLI","AUS","COL","NPL","BTN","ALB","TGO",
	"SEN","TZA","NZL","PER","LBN","PAK","PHL","IDN","THA",
	"OMN","ISR","BRA","CHL","GIN","SLE"]

	#### convert string type to float type for numeric data
	GDP = [float(x) for x in get_value("NY.GDP.PCAP.KD",countries,2005)]
	Exp_Edu = [float(x) for x in get_value("SE.XPD.TOTL.GD.ZS",countries,2005)]
	Fer_Rate = [float(x) for x in get_value("SP.DYN.TFRT.IN",countries,2005)]
	Country = get_countryList("NY.GDP.PCAP.KD",countries,2005)

	#### get region information for given country
	def get_region(countries):
	regions = []
	all = wbdata.get_country(countries)
	for country in all:
	region = country['region']['value']
	#### cut residual information
	flag = region.find('(')
	if flag>0:
	region = region[0:flag-1]
	regions.append(region)
	return regions

	#### get income level information for given country
	def get_incomeLevel(countries):
	incomes = []
	all = wbdata.get_country(countries)
	for country in all:
	income = country['incomeLevel']['value']
	#### cut residual information
	flag = income.find(':')
	if flag>0:
	income = income[0:flag]
	incomes.append(income)
	return incomes

	Region = get_region(countries)
	Income_Level = get_incomeLevel(countries)

	#### create a dictionary for data frame
	myDic = { 'Country': Country,
	'GDP': GDP,
	'Expenditure on Education': Exp_Edu,
	'Fertility Rate': Fer_Rate,
	'Income Level': Income_Level,
	'Region': Region
	}

	#### convert dictionary to data frame
	myDF = pd.DataFrame(myDic)

	#### create connection to MySQL
	conn = myDB.connect('localhost','root')
	cursor = conn.cursor()

	sql = ' SHOW DATABASES; '
	cursor.execute(sql)

	#### test if the database name is used before, drop if used
	sql = ' DROP DATABASE IF EXISTS DB; '
	cursor.execute(sql)

	sql = ' CREATE DATABASE DB; '
	cursor.execute(sql)

	#### create connection to the database created in last step
	mydb = myDB.connect(host='localhost', user='root',passwd='', db='DB')

	#### upload data frame to MySQL
	myDF.to_sql(con = mydb, name = 'mydata', if_exists = 'replace', flavor = 'mysql')