# WillKoehrsen/data_preparation.py

## data_preparation.py
def format_data(df, correlation_threshold=0.2, test_size=0.25, random_state=42):
    """Select the features most correlated with the final grade and split them.

    The final grade column ``G3`` is used as the target. The ``school``,
    ``G1``, ``G2`` and ``G3`` columns are removed from the features, the
    remaining categorical columns are one-hot encoded, and only the columns
    whose absolute correlation with the target is at least
    ``correlation_threshold`` are kept (ordered from most to least correlated).

    Args:
        df: DataFrame containing at least the columns ``school``, ``G1``,
            ``G2`` and ``G3``. It is not modified.
        correlation_threshold: Minimum absolute correlation with the target
            a feature needs in order to be kept.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Passed to ``train_test_split`` as ``random_state``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)`` produced by
        ``train_test_split``.

    Raises:
        KeyError: If any of ``school``, ``G1``, ``G2`` or ``G3`` is missing.

    Note:
        ``pd`` and ``train_test_split`` must be in scope at module level;
        the latter is presumably ``sklearn.model_selection.train_test_split``.
    """
    # Targets are the final grade of each student
    labels = df['G3']

    # Drop the school and every grade column from the features
    features = df.drop(columns=['school', 'G1', 'G2', 'G3'])

    # One-hot encoding of categorical variables
    features = pd.get_dummies(features)

    # Attach the target by position (not by index alignment) so that the
    # correlation of every feature with it can be computed in one call.
    with_target = features.assign(y=labels.to_numpy())
    correlations = with_target.corr().abs()['y']

    # Remove the target's self-correlation by name; relying on it being the
    # first entry after sorting breaks when a feature ties with it at 1.0.
    correlations = correlations.drop('y')

    # Keep sufficiently correlated features, strongest first
    selected = correlations[correlations >= correlation_threshold]
    selected = selected.sort_values(ascending=False)

    # `.ix` was removed from pandas; `.loc` performs the label-based selection
    features = features.loc[:, selected.index]

    # Higher education is already encoded in `higher_yes`. `higher_no` may
    # not have passed the correlation filter, so a missing column is fine.
    features = features.drop(columns='higher_no', errors='ignore')

    # Split into training/testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )

    # Return the training and testing data
    return X_train, X_test, y_train, y_test