Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
def format_data(df):
    """Build a correlation-filtered feature set and split it into train/test.

    The final grade column ``G3`` is used as the target. The ``school``,
    ``G1``, ``G2`` and ``G3`` columns are removed from the features, the
    remaining categorical columns are one-hot encoded, and only features
    whose absolute correlation with the target is at least 0.2 are kept.

    Args:
        df: DataFrame that contains at least the columns ``school``,
            ``G1``, ``G2`` and ``G3``. The input frame is not modified;
            every step rebinds ``df`` to a new object.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)`` produced by
        ``train_test_split`` with a 25% test split and ``random_state=42``.
        (``train_test_split`` is expected to be in scope in this module --
        presumably scikit-learn's; confirm against the file's imports.)

    Raises:
        KeyError: If any of ``school``, ``G1``, ``G2`` or ``G3`` is missing.
    """
    # Targets are the final grade of each student
    labels = df['G3']

    # Drop the school and the grades from the features
    df = df.drop(columns=['school', 'G1', 'G2', 'G3'])

    # One-hot encoding of categorical variables
    df = pd.get_dummies(df)

    # Temporarily attach the target (positionally, via a plain list) so its
    # correlation with every feature can be computed in one pass.
    df['y'] = list(labels)
    most_correlated = df.corr().abs()['y'].sort_values(ascending=False)

    # Keep correlations greater than or equal to 0.2 in absolute value.
    # Remove the target's self-correlation by label instead of slicing off
    # the first entry: with ties at 1.0 the target is not guaranteed to sort
    # first, and with a constant target it is already filtered out as NaN.
    most_correlated = most_correlated[most_correlated >= 0.2]
    most_correlated = most_correlated.drop('y', errors='ignore')

    # `.ix` was removed in pandas 1.0; `.loc` is the label-based equivalent.
    df = df.loc[:, most_correlated.index]

    # Higher education is already encoded in `higher_yes`, so the
    # complementary dummy is redundant. It is only present when it passed
    # the correlation threshold, hence the tolerant drop.
    df = df.drop(columns='higher_no', errors='ignore')

    # Split into training/testing sets with 25% split
    X_train, X_test, y_train, y_test = train_test_split(
        df,
        labels,
        test_size=0.25,
        random_state=42,
    )

    # Return the training and testing data
    return X_train, X_test, y_train, y_test
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment