import numpy as np
import pandas as pd
# TargetEncoder(cols=...) below matches the category_encoders API, so the
# import is assumed to come from that package.
from category_encoders import TargetEncoder

def groupby(train, test, col):
    # Per-group min/mean/max of the target, mapped back onto train and test.
    res = train.groupby(col)['Price'].agg(['min', 'mean', 'max'])
    train[f'{col}_Min'] = train[col].map(res['min'])
    train[f'{col}_Mean'] = train[col].map(res['mean'])
    train[f'{col}_Max'] = train[col].map(res['max'])
    test[f'{col}_Min'] = test[col].map(res['min'])
    test[f'{col}_Mean'] = test[col].map(res['mean'])
    test[f'{col}_Max'] = test[col].map(res['max'])
    # Groups unseen in train leave NaNs in test; mark them with a sentinel.
    test.fillna(-999, inplace=True)
    return train, test
def drop_outliers(train):
    # Q1 = train['Price'].quantile(0.25)
    # Q3 = train['Price'].quantile(0.75)
    # IQR = Q3 - Q1
    # inds = train[(train['Price'] < (Q1 - 1.5 * IQR)) | (train['Price'] > (Q3 + 1.5 * IQR))].index
    # train = train.drop(inds, axis=0).reset_index(drop=True)
    train = train[~(train['Price'] < 5)].reset_index(drop=True)
    return train, train['Price']
def two_col_groupby(train, test, col1, col2):
    # Mean target price per (col1, col2) pair, written back with boolean .loc indexing.
    r = dict(train.groupby([col1, col2])['Price'].mean())
    for (k1, k2), val in r.items():
        train.loc[(train[col1] == k1) & (train[col2] == k2), f'Mean_{col1}_{col2}'] = val
        test.loc[(test[col1] == k1) & (test[col2] == k2), f'Mean_{col1}_{col2}'] = val
    # The same pattern for per-pair min/max is kept below but disabled.
    # r = dict(train.groupby([col1, col2])['Price'].min())
    # for (k1, k2), val in r.items():
    #     train.loc[(train[col1] == k1) & (train[col2] == k2), f'min_{col1}_{col2}'] = val
    #     test.loc[(test[col1] == k1) & (test[col2] == k2), f'min_{col1}_{col2}'] = val
    # r = dict(train.groupby([col1, col2])['Price'].max())
    # for (k1, k2), val in r.items():
    #     train.loc[(train[col1] == k1) & (train[col2] == k2), f'max_{col1}_{col2}'] = val
    #     test.loc[(test[col1] == k1) & (test[col2] == k2), f'max_{col1}_{col2}'] = val
    test.fillna(-999, inplace=True)
    return train, test
def leak(train, test):
    # Copy the known train Price onto any test row whose ID also appears in train.
    overlap = train[train['ID'].isin(test['ID'])]
    for idx, price in zip(overlap['ID'], overlap['Price']):
        test.loc[test['ID'] == idx, 'Price'] = price
    return train, test
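# `is_turbo` is used by FE() below but is not defined in this gist; the helper
# here is an assumed minimal sketch (flag 'Engine volume' strings that mention
# 'Turbo'), not necessarily the author's original implementation.
def is_turbo(x):
    # 1 if the raw engine-volume string contains 'Turbo', else 0.
    return int('Turbo' in str(x))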
def FE(train, test):
    # Split 'Engine volume' strings such as '2.0 Turbo' into a turbo flag and a numeric volume.
    train['Turbo'] = train['Engine volume'].apply(is_turbo)
    test['Turbo'] = test['Engine volume'].apply(is_turbo)
    train['Engine volume'] = train['Engine volume'].apply(lambda x: float(str(x).replace('Turbo', '')))
    test['Engine volume'] = test['Engine volume'].apply(lambda x: float(str(x).replace('Turbo', '')))
    # Disabled experiments: combined keys, ID prefix, mileage per year, car age, volume * mileage.
    # train['MMP'] = train['Manufacturer'] + '_' + train['Model'] + '_' + train['Prod. year'].astype('str')
    # test['MMP'] = test['Manufacturer'] + '_' + test['Model'] + '_' + test['Prod. year'].astype('str')
    # train['ID_P1'] = train['ID'].apply(lambda x: int(str(x)[:2]))
    # test['ID_P1'] = test['ID'].apply(lambda x: int(str(x)[:2]))
    # train['Km/year'] = train['Mileage'] / (2021 - train['Prod. year'])
    # test['Km/year'] = test['Mileage'] / (2021 - test['Prod. year'])
    # train['old'] = 2021 - train['Prod. year']
    # test['old'] = 2021 - test['Prod. year']
    # train['em'] = train['Engine volume'] * train['Mileage']
    # test['em'] = test['Engine volume'] * test['Mileage']
    # Per-column target statistics, then two-column mean encodings.
    for col in ['Prod. year', 'Fuel type', 'Gear box type', 'Drive wheels', 'Levy', 'Airbags']:
        train, test = groupby(train, test, col)
    # train, y_train = drop_outliers(train)
    train, test = two_col_groupby(train, test, 'Prod. year', 'Category')
    train, test = two_col_groupby(train, test, 'Prod. year', 'Fuel type')
    return train.drop(['Price'], axis=1), test
def enc_cols(train, y_train, test, cols):
    enc = TargetEncoder(cols=cols)
    train[cols] = enc.fit_transform(train[cols], y_train)
    test[cols] = enc.transform(test[cols])
    return train, test
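# `rmlse` is the default metric of cross_val() below but is not defined in this
# gist; the sketch here assumes the usual root mean squared log error, with
# negative predictions clipped to zero. The author's version may differ.
def rmlse(y_true, y_pred):
    # sqrt(mean((log1p(y_true) - log1p(y_pred))**2))
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 0, None)
    return np.sqrt(np.mean((np.log1p(np.asarray(y_true, dtype=float)) - np.log1p(y_pred)) ** 2))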
from sklearn.model_selection import KFold
def cross_val(X, y, models, weights, metric=rmlse, folds=10):
    scores = []
    best_model = None
    best_sc = 10000
    for tr_in, val_in in KFold(n_splits=folds).split(X, y):
        X_train, y_train, X_val, y_val = X.iloc[tr_in, :], y[tr_in], X.iloc[val_in, :], y[val_in]
        # Re-attach the target to the training fold (index-aligned) before feature engineering.
        X_train, X_val = FE(pd.concat([X_train, pd.DataFrame(y_train, columns=['Price'], index=X_train.index)], axis=1), X_val)
        cols = X_train.columns[X_train.dtypes == 'object']
        X_train, X_val = enc_cols(X_train, y_train, X_val, cols)
        cols = ['ID', 'Levy', 'Manufacturer', 'Model', 'Prod. year', 'Engine volume',
                'Mileage', 'Cylinders', 'Gear box type', 'Doors', 'Airbags', 'Turbo',
                'Gear box type_Mean', 'Levy_Min', 'Levy_Mean', 'Levy_Max',
                'Airbags_Mean', 'Airbags_Max', 'Mean_Prod. year_Category',
                'Mean_Prod. year_Fuel type']
        # X_train = X_train[cols]
        # X_val = X_val[cols]
        # id_train, id_test = X_train['ID'].copy(), X_val['ID'].copy()
        # X_train.drop(['Wheel', 'Fuel type_Min', 'Gear box type_Min'], axis=1, inplace=True)
        # X_val.drop(['Wheel', 'Fuel type_Min', 'Gear box type_Min'], axis=1, inplace=True)
        # Weighted blend of the given models on this fold.
        y_hat = np.zeros(y_val.shape)
        for model, weight in zip(models, weights):
            model.fit(X_train, y_train)
            y_hat = y_hat + (model.predict(X_val) * weight)
        X_train['Price'] = y_train
        X_val['Price'] = y_hat
        # X_train['ID'], X_val['ID'] = id_train, id_test
        # Overwrite predictions with known prices for IDs shared between the folds.
        X_train, X_val = leak(X_train, X_val)
        sc = metric(y_val, X_val['Price'])
        print(sc)
        if sc < best_sc:
            best_sc = sc
            best_model = model
        scores.append(sc)
    return np.mean(scores), best_model, X_train
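# Hypothetical usage sketch (not part of the original gist): the columns above
# suggest a car-price dataset with a 'Price' target. File names, model choices,
# and weights below are illustrative assumptions only.
# from lightgbm import LGBMRegressor
# from xgboost import XGBRegressor
#
# train_df = pd.read_csv('train.csv')
# X, y = train_df.drop(['Price'], axis=1), train_df['Price'].values
# models = [LGBMRegressor(), XGBRegressor()]
# mean_score, best_model, fe_train = cross_val(X, y, models, weights=[0.5, 0.5])
# print('mean CV score:', mean_score)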