import numpy as np
import pandas as pd
# TargetEncoder(cols=...) below matches the category_encoders API, so the
# import is assumed to come from that package.
from category_encoders import TargetEncoder

def groupby(train, test, col):
    # Per-group min/mean/max of the target, mapped back onto train and test.
    res = train.groupby(col)['Price'].agg(['min', 'mean', 'max'])
    train[f'{col}_Min'] = train[col].map(res['min'])
    train[f'{col}_Mean'] = train[col].map(res['mean'])
    train[f'{col}_Max'] = train[col].map(res['max'])
    test[f'{col}_Min'] = test[col].map(res['min'])
    test[f'{col}_Mean'] = test[col].map(res['mean'])
    test[f'{col}_Max'] = test[col].map(res['max'])
    # Groups unseen in train leave NaNs in test; mark them with a sentinel.
    test.fillna(-999, inplace=True)
    return train, test
def drop_outliers(train):
    # Q1 = train['Price'].quantile(0.25)
    # Q3 = train['Price'].quantile(0.75)
    # IQR = Q3 - Q1
    # inds = train[(train['Price'] < (Q1 - 1.5 * IQR)) | (train['Price'] > (Q3 + 1.5 * IQR))].index
    # train = train.drop(inds, axis=0).reset_index(drop=True)
    train = train[~(train['Price'] < 5)].reset_index(drop=True)
    return train, train['Price']
def two_col_groupby(train, test, col1, col2):
    # Mean target price per (col1, col2) pair, written back with boolean .loc indexing.
    r = dict(train.groupby([col1, col2])['Price'].mean())
    for (k1, k2), val in r.items():
        train.loc[(train[col1] == k1) & (train[col2] == k2), f'Mean_{col1}_{col2}'] = val
        test.loc[(test[col1] == k1) & (test[col2] == k2), f'Mean_{col1}_{col2}'] = val
    # The same pattern for per-pair min/max is kept below but disabled.
    # r = dict(train.groupby([col1, col2])['Price'].min())
    # for (k1, k2), val in r.items():
    #     train.loc[(train[col1] == k1) & (train[col2] == k2), f'min_{col1}_{col2}'] = val
    #     test.loc[(test[col1] == k1) & (test[col2] == k2), f'min_{col1}_{col2}'] = val
    # r = dict(train.groupby([col1, col2])['Price'].max())
    # for (k1, k2), val in r.items():
    #     train.loc[(train[col1] == k1) & (train[col2] == k2), f'max_{col1}_{col2}'] = val
    #     test.loc[(test[col1] == k1) & (test[col2] == k2), f'max_{col1}_{col2}'] = val
    test.fillna(-999, inplace=True)
    return train, test
def leak(train, test):
    # Copy the known train Price onto any test row whose ID also appears in train.
    overlap = train[train['ID'].isin(test['ID'])]
    for idx, price in zip(overlap['ID'], overlap['Price']):
        test.loc[test['ID'] == idx, 'Price'] = price
    return train, test
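# `is_turbo` is used by FE() below but is not defined in this gist; the helper
# here is an assumed minimal sketch (flag 'Engine volume' strings that mention
# 'Turbo'), not necessarily the author's original implementation.
def is_turbo(x):
    # 1 if the raw engine-volume string contains 'Turbo', else 0.
    return int('Turbo' in str(x))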
def FE(train, test):
    # Split 'Engine volume' strings such as '2.0 Turbo' into a turbo flag and a numeric volume.
    train['Turbo'] = train['Engine volume'].apply(is_turbo)
    test['Turbo'] = test['Engine volume'].apply(is_turbo)
    train['Engine volume'] = train['Engine volume'].apply(lambda x: float(str(x).replace('Turbo', '')))
    test['Engine volume'] = test['Engine volume'].apply(lambda x: float(str(x).replace('Turbo', '')))
    # Disabled experiments: combined keys, ID prefix, mileage per year, car age, volume * mileage.
    # train['MMP'] = train['Manufacturer'] + '_' + train['Model'] + '_' + train['Prod. year'].astype('str')
    # test['MMP'] = test['Manufacturer'] + '_' + test['Model'] + '_' + test['Prod. year'].astype('str')
    # train['ID_P1'] = train['ID'].apply(lambda x: int(str(x)[:2]))
    # test['ID_P1'] = test['ID'].apply(lambda x: int(str(x)[:2]))
    # train['Km/year'] = train['Mileage'] / (2021 - train['Prod. year'])
    # test['Km/year'] = test['Mileage'] / (2021 - test['Prod. year'])
    # train['old'] = 2021 - train['Prod. year']
    # test['old'] = 2021 - test['Prod. year']
    # train['em'] = train['Engine volume'] * train['Mileage']
    # test['em'] = test['Engine volume'] * test['Mileage']
    # Per-column target statistics, then two-column mean encodings.
    for col in ['Prod. year', 'Fuel type', 'Gear box type', 'Drive wheels', 'Levy', 'Airbags']:
        train, test = groupby(train, test, col)
    # train, y_train = drop_outliers(train)
    train, test = two_col_groupby(train, test, 'Prod. year', 'Category')
    train, test = two_col_groupby(train, test, 'Prod. year', 'Fuel type')
    return train.drop(['Price'], axis=1), test
def enc_cols(train, y_train, test, cols):
    enc = TargetEncoder(cols=cols)
    train[cols] = enc.fit_transform(train[cols], y_train)
    test[cols] = enc.transform(test[cols])
    return train, test
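# `rmlse` is the default metric of cross_val() below but is not defined in this
# gist; the sketch here assumes the usual root mean squared log error, with
# negative predictions clipped to zero. The author's version may differ.
def rmlse(y_true, y_pred):
    # sqrt(mean((log1p(y_true) - log1p(y_pred))**2))
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 0, None)
    return np.sqrt(np.mean((np.log1p(np.asarray(y_true, dtype=float)) - np.log1p(y_pred)) ** 2))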
from sklearn.model_selection import KFold
def cross_val(X, y, models, weights, metric=rmlse, folds=10):
    scores = []
    best_model = None
    best_sc = 10000
    for tr_in, val_in in KFold(n_splits=folds).split(X, y):
        X_train, y_train, X_val, y_val = X.iloc[tr_in, :], y[tr_in], X.iloc[val_in, :], y[val_in]
        # Re-attach the target to the training fold (index-aligned) before feature engineering.
        X_train, X_val = FE(pd.concat([X_train, pd.DataFrame(y_train, columns=['Price'], index=X_train.index)], axis=1), X_val)
        cols = X_train.columns[X_train.dtypes == 'object']
        X_train, X_val = enc_cols(X_train, y_train, X_val, cols)
        cols = ['ID', 'Levy', 'Manufacturer', 'Model', 'Prod. year', 'Engine volume',
                'Mileage', 'Cylinders', 'Gear box type', 'Doors', 'Airbags', 'Turbo',
                'Gear box type_Mean', 'Levy_Min', 'Levy_Mean', 'Levy_Max',
                'Airbags_Mean', 'Airbags_Max', 'Mean_Prod. year_Category',
                'Mean_Prod. year_Fuel type']
        # X_train = X_train[cols]
        # X_val = X_val[cols]
        # id_train, id_test = X_train['ID'].copy(), X_val['ID'].copy()
        # X_train.drop(['Wheel', 'Fuel type_Min', 'Gear box type_Min'], axis=1, inplace=True)
        # X_val.drop(['Wheel', 'Fuel type_Min', 'Gear box type_Min'], axis=1, inplace=True)
        # Weighted blend of the given models on this fold.
        y_hat = np.zeros(y_val.shape)
        for model, weight in zip(models, weights):
            model.fit(X_train, y_train)
            y_hat = y_hat + (model.predict(X_val) * weight)
        X_train['Price'] = y_train
        X_val['Price'] = y_hat
        # X_train['ID'], X_val['ID'] = id_train, id_test
        # Overwrite predictions with known prices for IDs shared between the folds.
        X_train, X_val = leak(X_train, X_val)
        sc = metric(y_val, X_val['Price'])
        print(sc)
        if sc < best_sc:
            best_sc = sc
            best_model = model
        scores.append(sc)
    return np.mean(scores), best_model, X_train
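# Hypothetical usage sketch (not part of the original gist): the columns above
# suggest a car-price dataset with a 'Price' target. File names, model choices,
# and weights below are illustrative assumptions only.
# from lightgbm import LGBMRegressor
# from xgboost import XGBRegressor
#
# train_df = pd.read_csv('train.csv')
# X, y = train_df.drop(['Price'], axis=1), train_df['Price'].values
# models = [LGBMRegressor(), XGBRegressor()]
# mean_score, best_model, fe_train = cross_val(X, y, models, weights=[0.5, 0.5])
# print('mean CV score:', mean_score)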