This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Box plot of overall GPA for each CoRec usage category, split by the
# 'International?' flag. The flag is lower-cased first (presumably so that
# differently-cased spellings collapse into a single hue level -- confirm).
raw_data['International?'] = raw_data['International?'].str.lower()

usage_order = ["Infrequent", "Moderate", "Heavy"]
fig, ax = plt.subplots(figsize=(10, 8))
ax = sns.boxplot(
    data=raw_data,
    x="Type of User",
    y="Overall GPA",
    hue='International?',
    order=usage_order,
    palette="Set3",
    showfliers=False,  # do not draw the outlier markers
)
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Box plot of overall GPA for each CoRec usage category, split by gender.
usage_order = ["Infrequent", "Moderate", "Heavy"]
fig, ax = plt.subplots(figsize=(10, 8))
ax = sns.boxplot(
    data=raw_data,
    x="Type of User",
    y="Overall GPA",
    hue='Gender',
    order=usage_order,
    palette="Set3",
    showfliers=False,  # do not draw the outlier markers
)
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Drop columns that are mostly missing (> 50 %) or redundant. A few other
# sparse columns (semester honors, for instance) are kept on purpose because
# they still carry valuable information.
columns_to_drop = [
    'PUID',
    'Major 2', 'Major 3', 'Major 4',
    'Minor 1', 'Minor 2', 'Minor 3', 'Minor 4',
    '1st Concentration',
    'Overall Credits Attempted',
    'Nation of Citizenship',
    'Residence Hall',
]
data.drop(columns=columns_to_drop, inplace=True)

# Discard every row whose 'CoRec User' value is 'F'.
# NOTE(review): 'F' presumably marks students who never used the CoRec -- confirm.
non_user_rows = data.index[data['CoRec User'] == 'F']
data = data.drop(index=non_user_rows)

# Larger default fonts for all subsequent figures.
plt.rcParams.update({
    'font.size': 22,
    'xtick.labelsize': 20,  # font size of the x tick labels
    'ytick.labelsize': 20,  # font size of the y tick labels
})
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas_profiling import ProfileReport

# Build an exploratory report of the raw data: its features, distributions,
# correlations and so on.
profile = ProfileReport(raw_data)
profile  # bare expression: in a notebook it renders only as the last line of a cell

data.columns  # the column labels of `data`
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Libraries used throughout the analysis.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Load the anonymised swipe records; header=1 takes the column names from the
# second row of the sheet.
SWIPE_DATA_FILE = 'Anonymized - 2017 Summer, 2017 Fall, 2018 Spring CoRec Swipe Data.xlsx'
raw_data = pd.read_excel(SWIPE_DATA_FILE, header=1)
raw_data.head()  # quick look at the first few rows
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Summarise the fitted constants and their standard errors in two small
# tables: one row per model (OLS, exponential WLS, SVM WLS) and one column per
# fitted constant.
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)

headers = ["a", "b"]
rows = ["OLS", "WLS (exp)", "WLS (SVM)"]

# Table 1: fitted parameters, rounded to 4 decimals.
mydata = np.vstack([[res_ols.params], [res_wls_exp.params], [res_wls_svm.params]])
mydata = np.round(mydata, 4)
tabl_1 = SimpleTable(mydata, headers, rows, txt_fmt=default_txt_fmt)
print(tabl_1)

# Table 2: standard errors of those parameters, rounded and printed the same
# way so the summary actually shows the errors as well as the constants.
mydata = np.vstack([[res_ols.bse], [res_wls_exp.bse], [res_wls_svm.bse]])
mydata = np.round(mydata, 4)
tabl_2 = SimpleTable(mydata, headers, rows, txt_fmt=default_txt_fmt)
print(tabl_2)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Model how the residuals vary with x using a support-vector regressor, then
# evaluate that model at the same x values to obtain the WLS weights.
x_feature = X[:,1].reshape(-1, 1)  # SVR takes a 2-D (n_samples, n_features) input
rgr = SVR(C=10, epsilon=0.2)
# The regression target must be 1-D; passing a column vector makes
# scikit-learn emit a DataConversionWarning before flattening it anyway.
rgr.fit(x_feature, residuals.ravel())
# NOTE(review): the weights are the reciprocal of the predicted residuals, so
# a prediction that is zero or negative yields an infinite or negative weight
# -- confirm `residuals` is strictly positive (e.g. absolute values).
w_svm = 1/rgr.predict(x_feature)
res_wls_svm = sm.WLS(y, X, weights=w_svm).fit()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Weighted least squares using the exponential residual model.
# The weights are evaluated at X[:,1] in its original row order so that each
# weight belongs to its own observation; `exp_vals` is computed on the sorted
# x values (for drawing a line) and only lines up with y if X is already sorted.
w_exp = 1/(exp_params[0][0] * np.exp(exp_params[0][1]*X[:,1]))
res_wls_exp = sm.WLS(y, X, weights=w_exp).fit()  # statsmodels fits the WLS

# Compare both fits against the raw data.
fig, ax = plt.subplots(figsize=(10,9))
ax.plot(X[:,1], y, 'o', label="Raw Data")
# `linewidth` must be lower-case: recent Matplotlib releases no longer accept
# mixed-case property keywords such as `LineWidth`.
ax.plot(X[:,1], res_ols.fittedvalues, 'r--', label="OLS", linewidth=4)
ax.plot(X[:,1], res_wls_exp.fittedvalues, 'g', label="WLS", linewidth=4)
ax.set_ylim([1.5,4.1])
ax.legend(loc='lower right')
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _exp_model(t, a, b):
    """Exponential curve a * exp(b * t) fitted to the residuals."""
    return a*np.exp(b*t)


# The outcome depends strongly on the initial point p0 given to the optimiser.
exp_params = scipy.optimize.curve_fit(_exp_model, X[:,1], residuals, p0=(-4,-0.1))
_amplitude, _rate = exp_params[0]
# Evaluate on the sorted x values so that the curve can be drawn as a line.
exp_vals = _amplitude * np.exp(_rate*np.sort(X[:,1]))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import statsmodels.api as sm
from sklearn.svm import SVR

# Read the swipe records (column names come from the second row of the sheet)
# and draw a reproducible random sample of 1000 rows.
SWIPE_DATA_FILE = 'Anonymized - 2017 Summer, 2017 Fall, 2018 Spring CoRec Swipe Data.xlsx'
data = pd.read_excel(SWIPE_DATA_FILE, header=1)
data2 = data.sample(n=1000, random_state=2)