Skip to content

Instantly share code, notes, and snippets.

# Overall GPA per usage tier, with one box per 'International?' value.
fig, ax = plt.subplots(figsize=(10, 8))

# Lower-case the flag so values that differ only in casing share a hue level.
raw_data['International?'] = raw_data['International?'].str.lower()

ax = sns.boxplot(
    data=raw_data,
    x="Type of User",
    y="Overall GPA",
    hue='International?',
    order=["Infrequent", "Moderate", "Heavy"],  # fixed left-to-right tier order
    palette="Set3",
    showfliers=False,  # do not draw outlier markers
)
plt.show()
# Overall GPA per usage tier, with one box per 'Gender' value.
fig, ax = plt.subplots(figsize=(10, 8))
ax = sns.boxplot(
    data=raw_data,
    x="Type of User",
    y="Overall GPA",
    hue='Gender',
    order=["Infrequent", "Moderate", "Heavy"],  # fixed left-to-right tier order
    palette="Set3",
    showfliers=False,  # do not draw outlier markers
)
plt.show()
# Remove columns that are either mostly empty (more than 50% missing) or
# redundant. A few sparse columns (semester honors, for instance) are kept on
# purpose because the information they carry is still valuable.
columns_to_drop = [
    'PUID',
    'Major 2', 'Major 3', 'Major 4',
    'Minor 1', 'Minor 2', 'Minor 3', 'Minor 4',
    '1st Concentration',
    'Overall Credits Attempted',
    'Nation of Citizenship',
    'Residence Hall',
]
# In-place drop: `data` itself is modified and no new frame is returned.
data.drop(columns=columns_to_drop, inplace=True)
# Discard every row whose 'CoRec User' value equals 'F' (rows are dropped by index label).
data = data.drop(index=data.index[data['CoRec User'] == 'F'])
# Global Matplotlib text sizes for all subsequent figures:
# base font size 22, tick labels on both axes 20.
plt.rcParams.update({
    'font.size': 22,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20,
})
# Build an exploratory report for the raw data frame with pandas-profiling.
# NOTE(review): the package has since been published under the name
# `ydata-profiling` -- confirm which one the target environment provides.
from pandas_profiling import ProfileReport
profile = ProfileReport(raw_data)
# The bare expression below only renders the report when it is the last
# statement of a notebook cell; in a plain script it has no visible effect.
profile # shows a report of the data - its features, distributions, correlations and so on.
# Column labels of `data`. As a bare expression this is echoed by a notebook
# cell but prints nothing when run as a plain script.
data.columns # list of columns
# Importing all the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
import seaborn as sns
# Load the swipe-data workbook; header=1 takes the column names from the
# second row of the sheet (row index 1).
raw_data = pd.read_excel(
    'Anonymized - 2017 Summer, 2017 Fall, 2018 Spring CoRec Swipe Data.xlsx',
    header=1,
)
raw_data.head()  # Quick preview of the first rows
# Tabulate the fitted constants of the three regressions (OLS, WLS with
# exponential weights, WLS with SVM weights), then collect their standard errors.
from statsmodels.iolib.table import SimpleTable, default_txt_fmt

# One row per fit, one column per fitted parameter, rounded to 4 decimals.
mydata = np.vstack([fit.params for fit in (res_ols, res_wls_exp, res_wls_svm)])
mydata = mydata.round(4)

headers = ["a", "b"]
rows = ["OLS", "WLS (exp)", "WLS (SVM)"]
tabl_1 = SimpleTable(mydata, headers, rows, txt_fmt=default_txt_fmt)
print(tabl_1)

# Same layout, now holding the standard errors (`bse`) of each fit.
mydata = np.vstack([fit.bse for fit in (res_ols, res_wls_exp, res_wls_svm)])
# Fit a support-vector regressor to the residuals as a function of the
# regressor column X[:, 1], predict on those same x values, and use the
# reciprocal of the predictions as weights for a weighted least-squares fit.
rgr = SVR(C=10, epsilon=0.2)
# scikit-learn expects a 2-D feature matrix but a 1-D target; passing the
# target as an (n, 1) column triggers a DataConversionWarning, so flatten it.
rgr.fit(X[:, 1].reshape(-1, 1), residuals.reshape(-1))
# NOTE(review): nothing here keeps the predictions strictly positive, so the
# reciprocal weights can be negative or extremely large -- confirm that
# `residuals` holds magnitudes (e.g. absolute residuals) before trusting the fit.
w_svm = 1 / rgr.predict(X[:, 1].reshape(-1, 1))
res_wls_svm = sm.WLS(y, X, weights=w_svm).fit()
# Weighted least squares using the reciprocal of the fitted exponential curve
# as per-observation weights, followed by a plot comparing it with OLS.
# NOTE(review): `exp_vals` is evaluated on np.sort(X[:, 1]) where it is built,
# so these weights line up with the rows of X/y only if X is already sorted
# by that column -- confirm.
w_exp = 1 / exp_vals
res_wls_exp = sm.WLS(y, X, weights=w_exp).fit()  # statsmodels WLS fit

fig, ax = plt.subplots(figsize=(10, 9))
ax.plot(X[:, 1], y, 'o', label="Raw Data")
# Property keywords must be lower-case: recent Matplotlib releases no longer
# normalise mixed-case names such as `LineWidth`.
ax.plot(X[:, 1], res_ols.fittedvalues, 'r--', label="OLS", linewidth=4)
ax.plot(X[:, 1], res_wls_exp.fittedvalues, 'g', label="WLS", linewidth=4)
ax.set_ylim([1.5, 4.1])
ax.legend(loc='lower right')
plt.show()
# Fit a * exp(b * t) to the residuals as a function of the regressor X[:, 1].
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# expose `scipy.optimize` unless something else has already loaded it.
import scipy.optimize

exp_params = scipy.optimize.curve_fit(
    lambda t, a, b: a * np.exp(b * t),
    X[:, 1],
    residuals,
    p0=(-4, -0.1),  # the outcome depends strongly on this initial guess
)
# exp_params[0] holds the optimised (a, b). The curve is evaluated on the
# sorted x values so that it can be drawn as a line.
# NOTE(review): because of the sort, exp_vals is NOT in the row order of X
# unless X is already sorted -- keep that in mind if it is reused per row.
exp_vals = exp_params[0][0] * np.exp(exp_params[0][1] * np.sort(X[:, 1]))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy
from sklearn.svm import SVR
import seaborn as sns
# Load the swipe-data workbook; header=1 takes the column names from the
# second row of the sheet (row index 1).
data = pd.read_excel(
    'Anonymized - 2017 Summer, 2017 Fall, 2018 Spring CoRec Swipe Data.xlsx',
    header=1,
)
# Draw 1000 rows at random; the fixed random_state makes the draw repeatable.
data2 = data.sample(n=1000, random_state=2)