# Ajey Venkataraman ajey091

## corec5.py
# GPA by usage tier, split by international status (outliers hidden).
fig, ax = plt.subplots(figsize=(10, 8))
# Lower-case the flag before it is used as the hue grouping.
raw_data['International?'] = raw_data['International?'].str.lower()
ax = sns.boxplot(
    data=raw_data,
    x="Type of User",
    y="Overall GPA",
    hue='International?',
    order=["Infrequent", "Moderate", "Heavy"],
    palette="Set3",
    showfliers=False,
)
plt.show()

## corec4.py
# GPA by usage tier, split by gender (outliers hidden).
fig, ax = plt.subplots(figsize=(10, 8))
ax = sns.boxplot(
    data=raw_data,
    x="Type of User",
    y="Overall GPA",
    hue='Gender',
    order=["Infrequent", "Moderate", "Heavy"],
    palette="Set3",
    showfliers=False,
)
plt.show()

## corec3.py
# Prune columns that are mostly empty (>50% missing) or redundant. A few other
# sparse columns (semester honors, for instance) are kept on purpose because
# they still carry valuable information.
columns_to_drop = [
    'PUID',
    'Major 2', 'Major 3', 'Major 4',
    'Minor 1', 'Minor 2', 'Minor 3', 'Minor 4',
    '1st Concentration', 'Overall Credits Attempted',
    'Nation of Citizenship', 'Residence Hall',
]
data.drop(columns=columns_to_drop, inplace=True)

# Discard every row whose 'CoRec User' value is 'F'.
data = data.drop(index=data.index[data['CoRec User'] == 'F'])

# Larger fonts for all subsequent figures.
plt.rcParams.update({'font.size': 22})
plt.rc('xtick', labelsize=20)  # x-axis tick-label font size
plt.rc('ytick', labelsize=20)  # y-axis tick-label font size

## corec2.py
from pandas_profiling import ProfileReport
# Build a profiling report of the raw data (features, distributions,
# correlations and so on).
profile = ProfileReport(df=raw_data)
profile  # bare expression: renders the report when run as a notebook cell


data.columns  # bare expression: shows the column labels when run as a notebook cell

## corec1.py
# Importing all the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
import seaborn as sns
# Load the swipe-data workbook; header=1 takes the column labels from the
# second row of the sheet.
raw_data = pd.read_excel(
    'Anonymized - 2017 Summer, 2017 Fall, 2018 Spring CoRec Swipe Data.xlsx',
    header=1,
)
raw_data.head()  # quick look at the first few rows

## RecCenter5.py
# We will create a simple table to summarize the computed constants and their errors
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
# One row of fitted coefficients per model, rounded to four decimals.
mydata = np.round(
    np.vstack([[res_ols.params], [res_wls_exp.params], [res_wls_svm.params]]),
    4,
)
headers = ["a", "b"]
rows = ["OLS", "WLS (exp)", "WLS (SVM)"]
tabl_1 = SimpleTable(mydata, headers, rows, txt_fmt=default_txt_fmt)
print(tabl_1)

# Same stacking for the errors (bse) of those coefficients.
mydata = np.vstack([[res_ols.bse], [res_wls_exp.bse], [res_wls_svm.bse]])

## RecCenter4.py
# We fit the residuals to an SVM regressor and then use the
# fitted model to predict on the same x values.

rgr = SVR(C=10, epsilon=0.2)
# scikit-learn expects a 1-D target; a column vector triggers a
# DataConversionWarning, so the residuals are flattened instead.
rgr.fit(X[:,1].reshape(-1, 1), residuals.reshape(-1))

# Weights are the reciprocal of the SVR prediction at each x.
# NOTE(review): a zero or negative prediction gives an infinite or negative
# weight -- confirm the residuals being modelled are strictly positive.
w_svm = 1/rgr.predict(X[:,1].reshape(-1,1))
res_wls_svm = sm.WLS(y, X, weights=w_svm).fit()

## RecCenter3.py
# Weighted least squares, with weights taken as the reciprocal of exp_vals.
# NOTE(review): exp_vals is evaluated on np.sort(X[:,1]) where it is built, so
# these weights line up with the rows of X/y only if X is already sorted by
# that column -- confirm.
w_exp = 1/exp_vals # Calculating the weights
res_wls_exp = sm.WLS(y, X, weights=w_exp).fit() # Statsmodels to fit the WLS

# Overlay the raw points with the OLS and WLS fitted values.
fig, ax = plt.subplots(figsize=(10,9))
plt.plot(X[:,1], y, 'o', label="Raw Data")
# Lowercase `linewidth`: current matplotlib no longer accepts the
# case-insensitive spelling `LineWidth`.
plt.plot(X[:,1], res_ols.fittedvalues, 'r--', label="OLS", linewidth=4)
plt.plot(X[:,1], res_wls_exp.fittedvalues, 'g', label="WLS", linewidth=4)
plt.ylim([1.5,4.1])
plt.legend(loc='lower right')
plt.show()

## RecCenter2.py
# Fit residuals ~ a * exp(b * t) against the second column of X.
# The outcome depends strongly on the starting point, hence the explicit p0.
exp_params = scipy.optimize.curve_fit(
    lambda t, a, b: a * np.exp(b * t),
    X[:, 1],
    residuals,
    p0=(-4, -0.1),
)
# Evaluate the fitted curve on the sorted x values so it can be drawn as a line.
exp_vals = exp_params[0][0] * np.exp(exp_params[0][1] * np.sort(X[:, 1]))

## RecCenter1.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy
from sklearn.svm import SVR
import seaborn as sns

# Load the swipe-data workbook; header=1 takes the column labels from the
# second row of the sheet.
data = pd.read_excel(
    'Anonymized - 2017 Summer, 2017 Fall, 2018 Spring CoRec Swipe Data.xlsx',
    header=1,
)
# Reproducible random sample of 1000 rows (fixed seed).
data2 = data.sample(n=1000, random_state=2)
	# GPA by usage tier, split by international status (outliers hidden).
	fig, ax = plt.subplots(figsize=(10, 8))
	# Lower-case the flag before it is used as the hue grouping.
	raw_data['International?'] = raw_data['International?'].str.lower()
	ax = sns.boxplot(
		data=raw_data,
		x="Type of User",
		y="Overall GPA",
		hue='International?',
		order=["Infrequent", "Moderate", "Heavy"],
		palette="Set3",
		showfliers=False,
	)
	plt.show()
	# Prune columns that are mostly empty (>50% missing) or redundant. A few other
	# sparse columns (semester honors, for instance) are kept on purpose because
	# they still carry valuable information.
	columns_to_drop = [
		'PUID',
		'Major 2', 'Major 3', 'Major 4',
		'Minor 1', 'Minor 2', 'Minor 3', 'Minor 4',
		'1st Concentration', 'Overall Credits Attempted',
		'Nation of Citizenship', 'Residence Hall',
	]
	data.drop(columns=columns_to_drop, inplace=True)
	# Discard every row whose 'CoRec User' value is 'F'.
	data = data.drop(index=data.index[data['CoRec User'] == 'F'])
	# Larger fonts for all subsequent figures.
	plt.rcParams.update({'font.size': 22})
	plt.rc('xtick', labelsize=20)  # x-axis tick-label font size
	plt.rc('ytick', labelsize=20)  # y-axis tick-label font size
	from pandas_profiling import ProfileReport
	# Build a profiling report of the raw data (features, distributions,
	# correlations and so on).
	profile = ProfileReport(df=raw_data)
	profile  # bare expression: renders the report when run as a notebook cell


	data.columns  # bare expression: shows the column labels when run as a notebook cell
	# Importing all the libraries
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	from sklearn.preprocessing import LabelEncoder, OneHotEncoder
	from sklearn.preprocessing import StandardScaler
	import seaborn as sns
	# Load the swipe-data workbook; header=1 takes the column labels from the
	# second row of the sheet.
	raw_data = pd.read_excel(
		'Anonymized - 2017 Summer, 2017 Fall, 2018 Spring CoRec Swipe Data.xlsx',
		header=1,
	)
	raw_data.head()  # quick look at the first few rows
	# We will create a simple table to summarize the computed constants and their errors
	from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
	# One row of fitted coefficients per model, rounded to four decimals.
	mydata = np.round(
		np.vstack([[res_ols.params], [res_wls_exp.params], [res_wls_svm.params]]),
		4,
	)
	headers = ["a", "b"]
	rows = ["OLS", "WLS (exp)", "WLS (SVM)"]
	tabl_1 = SimpleTable(mydata, headers, rows, txt_fmt=default_txt_fmt)
	print(tabl_1)

	# Same stacking for the errors (bse) of those coefficients.
	mydata = np.vstack([[res_ols.bse], [res_wls_exp.bse], [res_wls_svm.bse]])
	# We fit the residuals to an SVM regressor and then use the
	# fitted model to predict on the same x values.

	rgr = SVR(C=10, epsilon=0.2)
	# scikit-learn expects a 1-D target; a column vector triggers a
	# DataConversionWarning, so the residuals are flattened instead.
	rgr.fit(X[:,1].reshape(-1, 1), residuals.reshape(-1))

	# Weights are the reciprocal of the SVR prediction at each x.
	# NOTE(review): a zero or negative prediction gives an infinite or negative
	# weight -- confirm the residuals being modelled are strictly positive.
	w_svm = 1/rgr.predict(X[:,1].reshape(-1,1))
	res_wls_svm = sm.WLS(y, X, weights=w_svm).fit()
	# Weighted least squares, with weights taken as the reciprocal of exp_vals.
	# NOTE(review): exp_vals is evaluated on np.sort(X[:,1]) where it is built, so
	# these weights line up with the rows of X/y only if X is already sorted by
	# that column -- confirm.
	w_exp = 1/exp_vals # Calculating the weights
	res_wls_exp = sm.WLS(y, X, weights=w_exp).fit() # Statsmodels to fit the WLS

	# Overlay the raw points with the OLS and WLS fitted values.
	fig, ax = plt.subplots(figsize=(10,9))
	plt.plot(X[:,1], y, 'o', label="Raw Data")
	# Lowercase `linewidth`: current matplotlib no longer accepts the
	# case-insensitive spelling `LineWidth`.
	plt.plot(X[:,1], res_ols.fittedvalues, 'r--', label="OLS", linewidth=4)
	plt.plot(X[:,1], res_wls_exp.fittedvalues, 'g', label="WLS", linewidth=4)
	plt.ylim([1.5,4.1])
	plt.legend(loc='lower right')
	plt.show()
	# Fit residuals ~ a * exp(b * t) against the second column of X.
	# The multiplication signs in the model had been lost (`anp.exp(bt)`), which
	# would raise NameError on the first evaluation; they are restored here.
	exp_params = scipy.optimize.curve_fit(lambda t,a,b: a*np.exp(b*t),
	X[:,1], residuals,p0=(-4,-0.1) ) # outcome strongly dependent on the initial point given
	exp_vals = exp_params[0][0] * np.exp(exp_params[0][1]*np.sort(X[:,1])) # Sorting so that we can draw a line