# MattMcMurray/splitdata.py

## splitdata.py
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# Build a stratified 80/20 train/test split of `appt_data`, stratified by
# decade-wide age bucket so both sets keep the same age distribution.
# NOTE(review): `appt_data` is not defined in this file -- presumably a
# DataFrame with a numeric 'Age' column supplied by the surrounding
# session/notebook; confirm against the caller.

# Bucket ages by decade: ceil(Age / 10), so ages 91-100 fall in bucket 10.
age_cat = np.ceil(appt_data['Age'] / 10)

# Group anybody >100yrs old into the 100-year-old bucket, as they are
# outliers. The buckets are in decade units, so the cap is 10 (not 100):
# every value that is not strictly below 10 is replaced with 10.
age_cat = age_cat.where(age_cat < 10, 10)

appt_data['AgeCategory'] = age_cat

# Create a test set that is 20% of all values. random_state is fixed so the
# split is reproducible from run to run.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(
    split.split(appt_data, appt_data['AgeCategory'])
)

# split() yields positional row indices, so select with .iloc; .loc would
# only pick the right rows when the frame has a default RangeIndex.
strat_train_set = appt_data.iloc[train_index]
strat_test_set = appt_data.iloc[test_index]