micahmelling/modeling_custom_train_test Secret

## modeling_custom_train_test
def create_custom_train_test_split(df, target, test_set_percent, individual_id):
    """
    Creates a custom train-test split to ensure that players used for training are not also used in the holdout
    test set. Every row belonging to a sampled individual goes to the test set and all remaining rows go to the
    training set, so no individual contributes rows to both sides.
    :param df: pandas dataframe containing both the target column and the id column
    :param target: the target column
    :param test_set_percent: the share of unique individuals to reserve for the test set, expressed as a fraction
    (e.g. 0.25), since it is multiplied directly by the number of unique ids; the resulting count is truncated
    to an integer
    :param individual_id: the id column to uniquely identify individual players
    :return: x_train, y_train, x_test, y_test, each re-indexed from 0
    :raises ValueError: propagated from random.sample when the computed test-set size is negative or larger than
    the number of unique ids
    """

    id_column = df[individual_id]
    test_set_n = int(id_column.nunique() * test_set_percent)

    # unique() keeps ids in order of first appearance, so the population handed to random.sample is stable
    # between runs; iterating a set of strings is hash-order dependent and defeats random.seed().
    unique_ids = id_column.unique().tolist()
    test_set_ids = random.sample(unique_ids, test_set_n)

    # Build the membership mask once and reuse it for both sides of the split.
    in_test_set = id_column.isin(test_set_ids)
    train_df = df.loc[~in_test_set].reset_index(drop=True)
    test_df = df.loc[in_test_set].reset_index(drop=True)

    y_train = train_df[target]
    y_test = test_df[target]
    # Keyword form is required: pandas 2.0 removed the positional ``axis`` argument of DataFrame.drop.
    x_train = train_df.drop(columns=target)
    x_test = test_df.drop(columns=target)
    return x_train, y_train, x_test, y_test
	def create_custom_train_test_split(df, target, test_set_percent, individual_id):
		"""
		Creates a custom train-test split to ensure that players used for training are not also used in the holdout
		test set. Every row belonging to a sampled individual goes to the test set and all remaining rows go to the
		training set, so no individual contributes rows to both sides.
		:param df: pandas dataframe containing both the target column and the id column
		:param target: the target column
		:param test_set_percent: the share of unique individuals to reserve for the test set, expressed as a fraction
		(e.g. 0.25), since it is multiplied directly by the number of unique ids; the resulting count is truncated
		to an integer
		:param individual_id: the id column to uniquely identify individual players
		:return: x_train, y_train, x_test, y_test, each re-indexed from 0
		:raises ValueError: propagated from random.sample when the computed test-set size is negative or larger than
		the number of unique ids
		"""

		id_column = df[individual_id]
		test_set_n = int(id_column.nunique() * test_set_percent)

		# unique() keeps ids in order of first appearance, so the population handed to random.sample is stable
		# between runs; iterating a set of strings is hash-order dependent and defeats random.seed().
		unique_ids = id_column.unique().tolist()
		test_set_ids = random.sample(unique_ids, test_set_n)

		# Build the membership mask once and reuse it for both sides of the split.
		in_test_set = id_column.isin(test_set_ids)
		train_df = df.loc[~in_test_set].reset_index(drop=True)
		test_df = df.loc[in_test_set].reset_index(drop=True)

		y_train = train_df[target]
		y_test = test_df[target]
		# Keyword form is required: pandas 2.0 removed the positional ``axis`` argument of DataFrame.drop.
		x_train = train_df.drop(columns=target)
		x_test = test_df.drop(columns=target)
		return x_train, y_train, x_test, y_test