Skip to content

Instantly share code, notes, and snippets.

@jmsword
Created January 28, 2017 17:56
Show Gist options
  • Save jmsword/f198672f2755abbde83212cfb3d63f09 to your computer and use it in GitHub Desktop.
Save jmsword/f198672f2755abbde83212cfb3d63f09 to your computer and use it in GitHub Desktop.
Education
from bs4 import BeautifulSoup
import requests
import pandas as pd
import sqlite3 as lite
import csv
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
#Where the data is on the web
url = 'http://web.archive.org/web/20110514112442/http://unstats.un.org/unsd/demographic/products/socind/education.htm'
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
#Specific table we want to target
table = soup('table')[6]
#Narrowing down the table to just the rows we want
raw_data = []
rows = table.find_all('tr')[8:191]
for row in rows:
cols = row.find_all('td')
cols = [ele.text.strip() for ele in cols]
raw_data.append([ele for ele in cols])
#Data we need is in indicies: 0,1,7,10
data_set = []
for i in raw_data:
country = str(i[0])
year = str(i[1])
men = int(i[7])
women = int(i[10])
total = int(i[4])
clean_data = [country, year, men, women, total]
data_set.append(clean_data)
#Create dataframe to put HTML data into
df_columns = ['Country', 'Year', 'Men', 'Women', 'Total']
df = pd.DataFrame(data_set, index=None, columns=df_columns)
con = lite.connect('gdp.db')
cur = con.cursor()
with con:
cur.execute('CREATE TABLE gdp (Country TEXT, _1999 REAL, _2000 REAL, _2001 REAL, _2002 REAL, _2003 REAL, _2004 REAL, _2005 REAL, _2006 REAL, _2007 REAL, _2008 REAL, _2009 REAL, _2010 REAL )')
with open('API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv') as inputFile:
next(inputFile)
next(inputFile)
next(inputFile)
next(inputFile)
header = next(inputFile)
inputReader = csv.reader(inputFile)
for line in inputReader:
with con:
cur.execute('INSERT INTO gdp (Country, _1999, _2000, _2001, _2002, _2003, _2004, _2005, _2006, _2007, _2008, _2009, _2010) VALUES ("' + line[0] + '","' + '","'.join(line[43:55]) + '");')
#Get gdp table as pandas dataframe
with con:
cur.execute('SELECT * FROM gdp')
rows = cur.fetchall()
cols = [desc[0] for desc in cur.description]
gdp = pd.DataFrame(rows, columns=cols)
#Replace blank values with 'NaN'
gdp.replace('', np.nan, inplace=True)
#Merge gdp and educational attainment dataframes on country name
#Several countries were dropped because the names didn't match up perfectly. In a perfect world I would look
#through the list manually and change the names to match up, but for the purpose of this assignment I am just
#going to work with the countries that matched to save time.
combined = pd.merge(df, gdp, on='Country', how='inner')
#Drop rows with 'NaN' values
combined.dropna(inplace=True)
#Rename columns to match year education attainment and gdb values
#rewrite at some point to a function to remove '_' from column name instead
#of doing it manually
combined.rename(columns={'_1999': '1999',
'_2000': '2000',
'_2001': '2001',
'_2002': '2002',
'_2003': '2003',
'_2004': '2004',
'_2005': '2005',
'_2006': '2006',
'_2007': '2007',
'_2008': '2008',
'_2009': '2009',
'_2010': '2010',
}, inplace=True)
#Log transformation on gdp numbers
for col in combined.columns[5:17]:
combined[col] = np.log(combined[col])
#Match gdp numbers with corresponding education year in a new column
for row in combined.index:
for col in combined.columns:
if np.all(combined.ix[row]['Year'] == col):
combined.ix[row,'log_gdp'] = combined.ix[row,col]
#ANALYSIS
total_edu = combined['Total']
gdp = combined['log_gdp']
x = np.matrix(total_edu).transpose()
y = np.matrix(gdp).transpose()
X = sm.add_constant(x)
model = sm.OLS(y,X)
f = model.fit()
print ('P-Values: ', f.pvalues)
print ('R-Squared: ', f.rsquared)
print("With an R-Squared value of: ", f.rsquared, ", there appears to be little correlation between school life expentancy and a country's GDP." \
"I believe this is because with increasing globalization and the movement of labor across boarders, local education attainment " \
"is becoming less and less important to a country's productivity.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment