Skip to content

Instantly share code, notes, and snippets.

# Build the modelling inputs from `data`.
# Features: every column except the last one, with categorical columns
# expanded into indicator columns named "<column>_<value>".
X = pd.get_dummies(data.iloc[:, :-1], prefix_sep='_')

# Target: the last column of `data`, integer-encoded by `le` and wrapped in a
# single-column frame named 'binnedgrade'. `le` is kept so the integer codes
# can be mapped back to the original labels later.
le = LabelEncoder()
y = pd.DataFrame(le.fit_transform(data.iloc[:, -1]), columns=['binnedgrade'])

# Scaler instance is only created here; it is not fitted in this block.
scaler = StandardScaler()
# Impute missing values in place on `data`.
#
# The result of fillna() is assigned back to the column instead of calling
# `data[col].fillna(..., inplace=True)`: the chained in-place form operates on
# a temporary Series, which raises a FutureWarning on pandas 2.2 and leaves
# `data` unchanged once Copy-on-Write is active.

# Categorical columns: fill with the most frequent value (first mode).
for _col in ('Geocluster', 'Academic School Grouping', 'Program'):
    data[_col] = data[_col].fillna(data[_col].mode().iloc[0])

# Numeric credit columns: fill with the column median.
for _col in ('Spring Credits Attempted', 'Spring Credits Earned'):
    data[_col] = data[_col].fillna(data[_col].median())

# Remaining missing-value count per column (displayed when run as a notebook cell).
data.isnull().sum()
# Count columns by dtype family: object columns are treated as categorical,
# int64/float64 columns as numerical.
cat = data.select_dtypes(include=['object']).shape[1]
num = data.select_dtypes(include=['int64', 'float64']).shape[1]

# One-line summary of the feature breakdown.
print(
    'Total Features: ', cat, 'categorical', '+',
    num, 'numerical', '=', cat + num, 'features',
)
# Discretise 'Overall GPA' into eight half-point bands labelled 'A'..'H',
# where 'A' is the lowest band [0, 0.5] and 'H' the highest (3.5, 4].
bins = [0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4]
labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']

# include_lowest=True closes the first interval on the left so a GPA of
# exactly 0 lands in band 'A'; without it that value falls outside every bin.
# Values that are missing or outside [0, 4] still become the literal string
# 'nan' after astype(str).
data['binnedgrade'] = pd.cut(
    data['Overall GPA'], bins=bins, labels=labels, include_lowest=True
).astype(str)

# Despite its name, `swipe_cols` holds every column whose name contains 'GPA';
# they are dropped now that the binned label has been derived from them.
swipe_cols = [col for col in data.columns if 'GPA' in col]
data.drop(swipe_cols, axis=1, inplace=True)
data
# Mean 'Year Swipes' and mean 'Overall GPA' per 'Semester Honors' category.

# NOTE: `buffer` is an alias of `data`, not a copy, so the 'Semester Honors'
# rewrite below is also visible through `data`.
buffer = data

# Stringify the column and relabel the 'nan' entries as 'No honors'.
# The result is assigned back rather than using
# `buffer[col].replace(..., inplace=True)`: that chained in-place call acts on
# a temporary Series and does not update the frame under Copy-on-Write.
buffer['Semester Honors'] = (
    buffer['Semester Honors'].astype(str).replace('nan', 'No honors')
)

# Two per-category means over the same grouping; the GPA column is copied
# onto the swipes table so both sit side by side.
subset_honors = buffer.groupby('Semester Honors', as_index=False)['Year Swipes'].mean()
subset_honors2 = buffer.groupby('Semester Honors', as_index=False)['Overall GPA'].mean()
subset_honors['Overall GPA'] = subset_honors2['Overall GPA']
subset_honors
# Bar chart of the mean 'Year Swipes' for each value of 'Year'.
subset_year = data.groupby('Year', as_index=False)['Year Swipes'].mean()

# A single figure is created here. The earlier stray
# `fig = plt.subplots(...)` call opened an extra figure that was never drawn
# on or closed, and bound `fig` to a (figure, axes) tuple.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(
    x="Year",
    y="Year Swipes",
    data=subset_year,
    # Fixed bar order instead of the order in which groups appear in the data.
    order=['Freshmen', 'Sophomore', 'Junior', 'Senior', 'Graduate'],
    ax=ax,
)
plt.show()
# Distribution plot of the 'Overall GPA' column of `data`.
fig, ax = plt.subplots(figsize=(10, 8))
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the minimum seaborn version is confirmed.
sns.distplot(data['Overall GPA'])
plt.show()
# Scatter plot of 'Year Swipes' (x) against 'Overall GPA' (y), one point per row.
fig, ax = plt.subplots(figsize=(10, 8))
# `ax` is rebound to the axes object returned by seaborn.
ax = sns.scatterplot(x='Year Swipes', y='Overall GPA',data=data)
plt.show()
# Compare mean 'Overall GPA' per 'Type of User' between rows flagged
# 'international' and rows flagged 'domestic' in the 'International?' column.
insubset = data[data['International?'] == 'international']
domsubset = data[data['International?'] == 'domestic']

ingpa = (
    insubset
    .groupby('Type of User', as_index=False)['Overall GPA']
    .mean()
)
domgpa = (
    domsubset
    .groupby('Type of User', as_index=False)['Overall GPA']
    .mean()
)

# Each heading is followed by its table, one item per line.
print("International students", ingpa, "Domestic students", domgpa, sep="\n")