jmsword/education.py

## education.py
from bs4 import BeautifulSoup
import requests
import pandas as pd
import sqlite3 as lite
import csv
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

#Where the data is on the web
url = 'http://web.archive.org/web/20110514112442/http://unstats.un.org/unsd/demographic/products/socind/education.htm'
r = requests.get(url)

soup = BeautifulSoup(r.content, "lxml")

#Specific table we want to target
table = soup('table')[6]

#Narrowing down the table to just the rows we want
raw_data = []
rows = table.find_all('tr')[8:191]
for row in rows:
	cols = row.find_all('td')
	cols = [ele.text.strip() for ele in cols]
	raw_data.append([ele for ele in cols])

#Data we need is in indicies: 0,1,7,10
data_set = []
for i in raw_data:
	country = str(i[0])
	year = str(i[1])
	men = int(i[7])
	women = int(i[10])
	total = int(i[4])
	clean_data = [country, year, men, women, total]
	data_set.append(clean_data)

#Create dataframe to put HTML data into
df_columns = ['Country', 'Year', 'Men', 'Women', 'Total']
df = pd.DataFrame(data_set, index=None, columns=df_columns)

con = lite.connect('gdp.db')
cur = con.cursor()

with con:
    cur.execute('CREATE TABLE gdp (Country TEXT, _1999 REAL, _2000 REAL, _2001 REAL, _2002 REAL, _2003 REAL, _2004 REAL, _2005 REAL, _2006 REAL, _2007 REAL, _2008 REAL, _2009 REAL, _2010 REAL )')

with open('API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv') as inputFile:
	next(inputFile)
	next(inputFile)
	next(inputFile)
	next(inputFile)
	header = next(inputFile)
	inputReader = csv.reader(inputFile)
	for line in inputReader:
		with con:
			cur.execute('INSERT INTO gdp (Country, _1999, _2000, _2001, _2002, _2003, _2004, _2005, _2006, _2007, _2008, _2009, _2010) VALUES ("' + line[0] + '","' + '","'.join(line[43:55]) + '");')

#Get gdp table as pandas dataframe
with con:
	cur.execute('SELECT * FROM gdp')
	rows = cur.fetchall()
	cols = [desc[0] for desc in cur.description]
	gdp = pd.DataFrame(rows, columns=cols)

#Replace blank values with 'NaN'
gdp.replace('', np.nan, inplace=True)

#Merge gdp and educational attainment dataframes on country name
#Several countries were dropped because the names didn't match up perfectly. In a perfect world I would look
#through the list manually and change the names to match up, but for the purpose of this assignment I am just
#going to work with the countries that matched to save time.
combined = pd.merge(df, gdp, on='Country', how='inner')

#Drop rows with 'NaN' values
combined.dropna(inplace=True)

#Rename columns to match year education attainment and gdb values
#rewrite at some point to a function to remove '_' from column name instead
#of doing it manually
combined.rename(columns={'_1999': '1999',
						 '_2000': '2000',
						 '_2001': '2001',
						 '_2002': '2002',
						 '_2003': '2003',
						 '_2004': '2004',
						 '_2005': '2005',
						 '_2006': '2006',
						 '_2007': '2007',
						 '_2008': '2008',
						 '_2009': '2009',
						 '_2010': '2010',
						}, inplace=True)


#Log transformation on gdp numbers
for col in combined.columns[5:17]:
	combined[col] = np.log(combined[col])

#Match gdp numbers with corresponding education year in a new column
for row in combined.index:
    for col in combined.columns:
        if np.all(combined.ix[row]['Year'] == col):
            combined.ix[row,'log_gdp'] = combined.ix[row,col]

#ANALYSIS

total_edu = combined['Total']
gdp = combined['log_gdp']

x = np.matrix(total_edu).transpose()
y = np.matrix(gdp).transpose()

X = sm.add_constant(x)
model = sm.OLS(y,X)
f = model.fit()

print ('P-Values: ', f.pvalues)
print ('R-Squared: ', f.rsquared)

print("With an R-Squared value of: ", f.rsquared, ", there appears to be little correlation between school life expentancy and a country's GDP." \
	"I believe this is because with increasing globalization and the movement of labor across boarders, local education attainment " \
	"is becoming less and less important to a country's productivity.")
	from bs4 import BeautifulSoup
	import requests
	import pandas as pd
	import sqlite3 as lite
	import csv
	import numpy as np
	import matplotlib.pyplot as plt
	import numpy as np
	import statsmodels.api as sm

	#Where the data is on the web
	url = 'http://web.archive.org/web/20110514112442/http://unstats.un.org/unsd/demographic/products/socind/education.htm'
	r = requests.get(url)

	soup = BeautifulSoup(r.content, "lxml")

	#Specific table we want to target
	table = soup('table')[6]

	#Narrowing down the table to just the rows we want
	raw_data = []
	rows = table.find_all('tr')[8:191]
	for row in rows:
	cols = row.find_all('td')
	cols = [ele.text.strip() for ele in cols]
	raw_data.append([ele for ele in cols])

	#Data we need is in indicies: 0,1,7,10
	data_set = []
	for i in raw_data:
	country = str(i[0])
	year = str(i[1])
	men = int(i[7])
	women = int(i[10])
	total = int(i[4])
	clean_data = [country, year, men, women, total]
	data_set.append(clean_data)

	#Create dataframe to put HTML data into
	df_columns = ['Country', 'Year', 'Men', 'Women', 'Total']
	df = pd.DataFrame(data_set, index=None, columns=df_columns)

	con = lite.connect('gdp.db')
	cur = con.cursor()

	with con:
	cur.execute('CREATE TABLE gdp (Country TEXT, _1999 REAL, _2000 REAL, _2001 REAL, _2002 REAL, _2003 REAL, _2004 REAL, _2005 REAL, _2006 REAL, _2007 REAL, _2008 REAL, _2009 REAL, _2010 REAL )')

	with open('API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv') as inputFile:
	next(inputFile)
	next(inputFile)
	next(inputFile)
	next(inputFile)
	header = next(inputFile)
	inputReader = csv.reader(inputFile)
	for line in inputReader:
	with con:
	cur.execute('INSERT INTO gdp (Country, _1999, _2000, _2001, _2002, _2003, _2004, _2005, _2006, _2007, _2008, _2009, _2010) VALUES ("' + line[0] + '","' + '","'.join(line[43:55]) + '");')

	#Get gdp table as pandas dataframe
	with con:
	cur.execute('SELECT * FROM gdp')
	rows = cur.fetchall()
	cols = [desc[0] for desc in cur.description]
	gdp = pd.DataFrame(rows, columns=cols)

	#Replace blank values with 'NaN'
	gdp.replace('', np.nan, inplace=True)

	#Merge gdp and educational attainment dataframes on country name
	#Several countries were dropped because the names didn't match up perfectly. In a perfect world I would look
	#through the list manually and change the names to match up, but for the purpose of this assignment I am just
	#going to work with the countries that matched to save time.
	combined = pd.merge(df, gdp, on='Country', how='inner')

	#Drop rows with 'NaN' values
	combined.dropna(inplace=True)

	#Rename columns to match year education attainment and gdb values
	#rewrite at some point to a function to remove '_' from column name instead
	#of doing it manually
	combined.rename(columns={'_1999': '1999',
	'_2000': '2000',
	'_2001': '2001',
	'_2002': '2002',
	'_2003': '2003',
	'_2004': '2004',
	'_2005': '2005',
	'_2006': '2006',
	'_2007': '2007',
	'_2008': '2008',
	'_2009': '2009',
	'_2010': '2010',
	}, inplace=True)


	#Log transformation on gdp numbers
	for col in combined.columns[5:17]:
	combined[col] = np.log(combined[col])

	#Match gdp numbers with corresponding education year in a new column
	for row in combined.index:
	for col in combined.columns:
	if np.all(combined.ix[row]['Year'] == col):
	combined.ix[row,'log_gdp'] = combined.ix[row,col]

	#ANALYSIS

	total_edu = combined['Total']
	gdp = combined['log_gdp']

	x = np.matrix(total_edu).transpose()
	y = np.matrix(gdp).transpose()

	X = sm.add_constant(x)
	model = sm.OLS(y,X)
	f = model.fit()

	print ('P-Values: ', f.pvalues)
	print ('R-Squared: ', f.rsquared)

	print("With an R-Squared value of: ", f.rsquared, ", there appears to be little correlation between school life expentancy and a country's GDP." \
	"I believe this is because with increasing globalization and the movement of labor across boarders, local education attainment " \
	"is becoming less and less important to a country's productivity.")