# neelriyer/mealkit_preprocessing.py

## mealkit_preprocessing.py
# Build a tabular data bunch for the meal-kit order data.
#
# Preprocessing applied through `procs`:
#   - fill missing values
#   - encode categorical variables
#   - normalize continuous variables
#
# NOTE(review): `train_df`, `test_df`, `Path` and the tabular API names used
# below (FillMissing, Categorify, Normalize, TabularList, FloatList) are not
# defined in this snippet; presumably they come from an earlier cell and a
# fastai v1 `from fastai.tabular import *` -- confirm before running standalone.

procs = [FillMissing, Categorify, Normalize]

# A candidate column is kept only when it exists in BOTH frames, so the
# train and test lists are always built from the same feature names.
cont_vars = [
    col
    for col in [
        'checkout_price',
        'base_price',
        'Elapsed',
        'week_sin',
        'week_cos',
        'price_diff_percent',
    ]
    if col in train_df.columns and col in test_df.columns
]

cat_vars = [
    col
    for col in [
        'week', 'center_id', 'meal_id',
        'emailer_for_promotion', 'homepage_featured',
        'category', 'cuisine', 'city_code', 'region_code', 'center_type',
        'op_area', 'Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
        'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
        'Is_year_end', 'Is_year_start',
        'email_plus_homepage',
    ]
    if col in train_df.columns and col in test_df.columns
]

dep_var = 'num_orders'

# Work on a copy restricted to the selected features, the target and 'Date'
# ('Date' is carried along in the frame but is not listed as a feature).
df = train_df[cat_vars + cont_vars + [dep_var, 'Date']].copy()
bs = 2**11  # batch size (2048); max this out
path = Path('.')

# Create the tabular data bunch:
#   - validation set is the 5000 rows at positions 1000..5999 (ordered)
#   - labels are floats (label_cls=FloatList) built with log=True
#     (presumably the model then trains on log(num_orders) -- confirm
#     against the fastai FloatList documentation)
#   - test_df is attached as the test set with the same features and procs
data = (
    TabularList.from_df(df, cat_names=cat_vars, cont_names=cont_vars, procs=procs)
    .split_by_idx(list(range(1000, 1000 + 5000)))
    .label_from_df(cols=dep_var, label_cls=FloatList, log=True)
    .add_test(
        TabularList.from_df(
            test_df,
            path=path,
            cat_names=cat_vars,
            cont_names=cont_vars,
            procs=procs,
        )
    )
    .databunch(bs=bs)
)