yyuan123/ -Python file

## -Python file
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 08 22:57:38 2014

@author: Ye Yuan
"""

import wbdata
import pandas as pd
import MySQLdb as myDB
import matplotlib.pyplot as plt

# Begin with an overview of available sources
# wbdata.get_source()
# Target on one source to get IDs
# wbdata.get_indicator(source=2)
# wbdata.get_indicator(source=16)

# Search for indicators
# wbdata.search_indicators("Working age")
# Search for G20 countriy codes
# wbdata.get_country()


# Restrict countries and time period and select indicators
from datetime import datetime as dt
ndate = (dt(2012,01,01), dt(2003,01,01))

indicators = {
            "SI.POV.5DAY" : "Poverty_Headcount",
            "SI.POV.GAP5" : "Poverty_gap",
            "SP.POP.DPND" : "Dependency_Ratio",
            "SP.RUR.TOTL.ZS" : "Rural_Population",
            "SH.XPD.PUBL.ZS" : "Health_Expenditure",
            "SE.TER.ENRR" : "Enrollment_Ter",
            "SE.SEC.ENRR" : "Enrollment_Sec",
            "SE.PRM.ENRR" : "Enrollment_Pri",
            "SL.UEM.LTRM.ZS" : "Unemployment",
            "SE.XPD.TOTL.GD.ZS" : "Education_Expenditure",
            "SH.STA.ACSN" : "Sanitation_Facilities"
            }

countries = ["ARG", "AUS", "BRA", "CAN", "CHN", "DEU", "FRA", "IND",
             "IDN", "ITA", "JPN", "MEX", "RUS", "SAU", "KOR", "GBR",
             "TUR", "USA", "ZAF"]

myWB = wbdata.get_dataframe(indicators, country=countries,
                            convert_date=False, data_date=ndate)
df = pd.DataFrame(myWB)

table = 'Poverty_Predictions'

mysql_host = "localhost"
mysql_user = "root"
mysql_pass = "jimmyjimmy"
mysql_db = "WBjimmy"

# To save a dataframe to CSV
df.to_csv('C:/Users/Ye Yuan/Desktop' + '/' + table + '.csv')

# To connect MySQL Database
dbConnect = myDB.connect(host = mysql_host,
                        user = mysql_user,
                        passwd = mysql_pass,
                        db = mysql_db)

# To save DataFrame to MySQL database
dfClean = df.where((pd.notnull(df)), None)
dfClean.to_sql(con = dbConnect,
               name = table,
               if_exists='replace',
               flavor='mysql')

df = pd.read_sql("SELECT * from Poverty_Predictions", dbConnect)


# Plotting with Python using matplotlib
indicator1 = {"SI.POV.5DAY":"PHR5"}
indicator2 = {"SI.POV.GAP5":"PG5"}
countries1 = ["ARG", "BRA", "RUS", "TUR"]

#grab indicators above for countires above and load into data frame
df1 = wbdata.get_dataframe(indicator1, country=countries1,
                          convert_date=False, data_date=ndate)

df2 = wbdata.get_dataframe(indicator2, country=countries1,
                          convert_date=False, data_date=ndate)
#df is "pivoted", pandas' unstack fucntion helps reshape it into something plottable
dfu1 = df1.unstack(level=0)
dfu2 = df2.unstack(level=0)

# Plot of Poverty headcount ratio at $5 a day
dfu1.plot()
plt.legend(loc='best')
plt.title("Poverty headcount ratio at $5 a day (PPP) (% of population)")
plt.xlabel('Date'); plt.ylabel('Poverty headcount ratio at $5 a day %')

dfu2.plot()
plt.legend(loc='best')
plt.title("Poverty Gap at $5 a day (PPP) (% of population)")
plt.xlabel('Date'); plt.ylabel('Poverty Gap at $5 a day %')

## -R file
library('DBI')
library('RMySQL')
library('ggplot2')

#Connecting to MySQL Database
con <- dbConnect(MySQL(),
                 user="root",
                 password="jimmyjimmy",
                 dbname="wbjimmy",
                 host="localhost")

#Show "Poverty_Predictions" dataset and List fields
dbListTables(con)
dbListFields(con, "Poverty_Predictions")

#Import the dataset "Poverty_Predictions"
dataset <- dbReadTable(con, "Poverty_Predictions")

#Export dataset from MySQL database
write.csv(dataset, file = "Poverty_Predictions.csv")

#Import and Load dataset exported
data <- read.table("C:/Users/Ye Yuan/Documents/Poverty_Predictions.csv",header=TRUE, sep=",")

#Subsetting datasets, have tried regression on the subset but no ideal results
subset <- subset(data, (data$country=="Argentina"|
                              data$country=="Brazil"|
                              data$country=="Russian Federation"|
                              data$country=="Turkey"))

# Fitting the model
# Mutiple Regression on Poverty_Headcount Ratio by predictors

# by all predictors
fitTotalAll <- lm(data$Poverty_Headcount ~ data$Dependency_Ratio
               + data$Unemployment + data$Health_Expenditure
               + data$Rural_Population + data$Education_Expenditure
               + data$Sanitation_Facilities + data$Enrollment_Pri
               + data$Enrollment_Sec + data$Enrollment_Ter, data=data)

summary(fitTotalAll)

# by all predictors with private attribute
fitPrivateAll <- lm(data$Poverty_Headcount ~ data$Dependency_Ratio
                  + data$Unemployment + data$Enrollment_Pri
                  + data$Enrollment_Sec + data$Enrollment_Ter, data=data)

summary(fitPrivateAll)

# by all predictors with public attribute
fitPublicAll <- lm(data$Poverty_Headcount ~ data$Health_Expenditure
                   + data$Rural_Population + data$Education_Expenditure
                   + data$Sanitation_Facilities, data=data)

summary(fitPublicAll)


#GGPLOTS

d0 <- ggplot(data=data,aes_string(x='country',y='Poverty_Headcount'))
d0 + geom_boxplot()

d1 <- ggplot(data, aes(x=data$Dependency_Ratio, y=data$Poverty_Headcount))
d1 + geom_point(shape=1) +    # Use hollow circles
  geom_smooth(method=lm)   # Add linear regression line

d2 <- ggplot(data, aes(x=data$Health_Expenditure, y=data$Poverty_Headcount))
d2 + geom_point(shape=1) +    # Use hollow circles
  geom_smooth(method=lm)   # Add linear regression line

d3 <- ggplot(data, aes(x=data$Rural_Population, y=data$Poverty_Headcount))
d3 + geom_point(shape=1) +    # Use hollow circles
  geom_smooth(method=lm)   # Add linear regression line

d4 <- ggplot(data, aes(x=data$Enrollment_Pri, y=data$Poverty_Headcount))
d4 + geom_point(shape=1) +    # Use hollow circles
  geom_smooth(method=lm)   # Add linear regression line

d5 <- ggplot(data, aes(x=data$Enrollment_Sec, y=data$Poverty_Headcount))
d5 + geom_point(shape=1) +    # Use hollow circles
  geom_smooth(method=lm)   # Add linear regression line

d6 <- ggplot(data, aes(x=data$Enrollment_Ter, y=data$Poverty_Headcount))
d6 + geom_point(shape=1) +    # Use hollow circles
  geom_smooth(method=lm)   # Add linear regression line
	# -- coding: utf-8 --
	"""
	Created on Mon Dec 08 22:57:38 2014

	@author: Ye Yuan
	"""

	import wbdata
	import pandas as pd
	import MySQLdb as myDB
	import matplotlib.pyplot as plt

	# Begin with an overview of available sources
	# wbdata.get_source()
	# Target on one source to get IDs
	# wbdata.get_indicator(source=2)
	# wbdata.get_indicator(source=16)

	# Search for indicators
	# wbdata.search_indicators("Working age")
	# Search for G20 countriy codes
	# wbdata.get_country()


	# Restrict countries and time period and select indicators
	from datetime import datetime as dt
	ndate = (dt(2012,01,01), dt(2003,01,01))

	indicators = {
	"SI.POV.5DAY" : "Poverty_Headcount",
	"SI.POV.GAP5" : "Poverty_gap",
	"SP.POP.DPND" : "Dependency_Ratio",
	"SP.RUR.TOTL.ZS" : "Rural_Population",
	"SH.XPD.PUBL.ZS" : "Health_Expenditure",
	"SE.TER.ENRR" : "Enrollment_Ter",
	"SE.SEC.ENRR" : "Enrollment_Sec",
	"SE.PRM.ENRR" : "Enrollment_Pri",
	"SL.UEM.LTRM.ZS" : "Unemployment",
	"SE.XPD.TOTL.GD.ZS" : "Education_Expenditure",
	"SH.STA.ACSN" : "Sanitation_Facilities"
	}

	countries = ["ARG", "AUS", "BRA", "CAN", "CHN", "DEU", "FRA", "IND",
	"IDN", "ITA", "JPN", "MEX", "RUS", "SAU", "KOR", "GBR",
	"TUR", "USA", "ZAF"]

	myWB = wbdata.get_dataframe(indicators, country=countries,
	convert_date=False, data_date=ndate)
	df = pd.DataFrame(myWB)

	table = 'Poverty_Predictions'

	mysql_host = "localhost"
	mysql_user = "root"
	mysql_pass = "jimmyjimmy"
	mysql_db = "WBjimmy"

	# To save a dataframe to CSV
	df.to_csv('C:/Users/Ye Yuan/Desktop' + '/' + table + '.csv')

	# To connect MySQL Database
	dbConnect = myDB.connect(host = mysql_host,
	user = mysql_user,
	passwd = mysql_pass,
	db = mysql_db)

	# To save DataFrame to MySQL database
	dfClean = df.where((pd.notnull(df)), None)
	dfClean.to_sql(con = dbConnect,
	name = table,
	if_exists='replace',
	flavor='mysql')

	df = pd.read_sql("SELECT * from Poverty_Predictions", dbConnect)


	# Plotting with Python using matplotlib
	indicator1 = {"SI.POV.5DAY":"PHR5"}
	indicator2 = {"SI.POV.GAP5":"PG5"}
	countries1 = ["ARG", "BRA", "RUS", "TUR"]

	#grab indicators above for countires above and load into data frame
	df1 = wbdata.get_dataframe(indicator1, country=countries1,
	convert_date=False, data_date=ndate)

	df2 = wbdata.get_dataframe(indicator2, country=countries1,
	convert_date=False, data_date=ndate)
	#df is "pivoted", pandas' unstack fucntion helps reshape it into something plottable
	dfu1 = df1.unstack(level=0)
	dfu2 = df2.unstack(level=0)

	# Plot of Poverty headcount ratio at $5 a day
	dfu1.plot()
	plt.legend(loc='best')
	plt.title("Poverty headcount ratio at $5 a day (PPP) (% of population)")
	plt.xlabel('Date'); plt.ylabel('Poverty headcount ratio at $5 a day %')

	dfu2.plot()
	plt.legend(loc='best')
	plt.title("Poverty Gap at $5 a day (PPP) (% of population)")
	plt.xlabel('Date'); plt.ylabel('Poverty Gap at $5 a day %')
	library('DBI')
	library('RMySQL')
	library('ggplot2')

	#Connecting to MySQL Database
	con <- dbConnect(MySQL(),
	user="root",
	password="jimmyjimmy",
	dbname="wbjimmy",
	host="localhost")

	#Show "Poverty_Predictions" dataset and List fields
	dbListTables(con)
	dbListFields(con, "Poverty_Predictions")

	#Import the dataset "Poverty_Predictions"
	dataset <- dbReadTable(con, "Poverty_Predictions")

	#Export dataset from MySQL database
	write.csv(dataset, file = "Poverty_Predictions.csv")

	#Import and Load dataset exported
	data <- read.table("C:/Users/Ye Yuan/Documents/Poverty_Predictions.csv",header=TRUE, sep=",")

	#Subsetting datasets, have tried regression on the subset but no ideal results
	subset <- subset(data, (data$country=="Argentina"\|
	data$country=="Brazil"\|
	data$country=="Russian Federation"\|
	data$country=="Turkey"))

	# Fitting the model
	# Mutiple Regression on Poverty_Headcount Ratio by predictors

	# by all predictors
	fitTotalAll <- lm(data$Poverty_Headcount ~ data$Dependency_Ratio
	+ data$Unemployment + data$Health_Expenditure
	+ data$Rural_Population + data$Education_Expenditure
	+ data$Sanitation_Facilities + data$Enrollment_Pri
	+ data$Enrollment_Sec + data$Enrollment_Ter, data=data)

	summary(fitTotalAll)

	# by all predictors with private attribute
	fitPrivateAll <- lm(data$Poverty_Headcount ~ data$Dependency_Ratio
	+ data$Unemployment + data$Enrollment_Pri
	+ data$Enrollment_Sec + data$Enrollment_Ter, data=data)

	summary(fitPrivateAll)

	# by all predictors with public attribute
	fitPublicAll <- lm(data$Poverty_Headcount ~ data$Health_Expenditure
	+ data$Rural_Population + data$Education_Expenditure
	+ data$Sanitation_Facilities, data=data)

	summary(fitPublicAll)


	#GGPLOTS

	d0 <- ggplot(data=data,aes_string(x='country',y='Poverty_Headcount'))
	d0 + geom_boxplot()

	d1 <- ggplot(data, aes(x=data$Dependency_Ratio, y=data$Poverty_Headcount))
	d1 + geom_point(shape=1) + # Use hollow circles
	geom_smooth(method=lm) # Add linear regression line

	d2 <- ggplot(data, aes(x=data$Health_Expenditure, y=data$Poverty_Headcount))
	d2 + geom_point(shape=1) + # Use hollow circles
	geom_smooth(method=lm) # Add linear regression line

	d3 <- ggplot(data, aes(x=data$Rural_Population, y=data$Poverty_Headcount))
	d3 + geom_point(shape=1) + # Use hollow circles
	geom_smooth(method=lm) # Add linear regression line

	d4 <- ggplot(data, aes(x=data$Enrollment_Pri, y=data$Poverty_Headcount))
	d4 + geom_point(shape=1) + # Use hollow circles
	geom_smooth(method=lm) # Add linear regression line

	d5 <- ggplot(data, aes(x=data$Enrollment_Sec, y=data$Poverty_Headcount))
	d5 + geom_point(shape=1) + # Use hollow circles
	geom_smooth(method=lm) # Add linear regression line

	d6 <- ggplot(data, aes(x=data$Enrollment_Ter, y=data$Poverty_Headcount))
	d6 + geom_point(shape=1) + # Use hollow circles
	geom_smooth(method=lm) # Add linear regression line