Created
August 9, 2021 19:37
-
-
Save Muhammad4hmed/c999a153ee6bc740e0198a3ebec54f8a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def groupby(train, test, col):
    """Add per-group ``Price`` statistics of ``col`` to both frames.

    The minimum, mean and maximum of ``train['Price']`` are computed for each
    distinct value of ``train[col]`` and mapped onto ``train`` and ``test`` as
    the new columns ``{col}_Min``, ``{col}_Mean`` and ``{col}_Max``. The
    statistics always come from ``train``; ``test`` only looks them up.

    Both frames are modified in place. After the lookup, every remaining
    missing value in ``test`` (in any column, not only the new ones) is
    replaced by ``-999``.

    Args:
        train: DataFrame holding ``col`` and a ``Price`` column.
        test: DataFrame holding ``col``.
        col: Name of the column to group by.

    Returns:
        The ``(train, test)`` pair, i.e. the same objects that were passed in.
    """
    stats = train.groupby(col)['Price'].agg(['min', 'mean', 'max'])
    for frame in (train, test):
        for stat in ('min', 'mean', 'max'):
            frame[f'{col}_{stat.capitalize()}'] = frame[col].map(stats[stat])
    # A scalar fill touches the same cells whichever axis is used, so no
    # ``axis`` is passed (same call as in ``two_col_groupby``). Passing
    # ``axis=1`` may make pandas transpose the frame, which can degrade
    # mixed-dtype frames to object columns.
    test.fillna(-999, inplace=True)
    return train, test
def drop_outliers(train):
    """Remove rows whose ``Price`` is below 5 and renumber the index.

    Rows are dropped only when ``Price < 5`` evaluates to true, so rows with
    a missing ``Price`` are kept. An IQR-based filter on ``Price`` was used
    here previously and is currently disabled.

    Args:
        train: DataFrame with a ``Price`` column.

    Returns:
        A tuple ``(filtered, prices)`` where ``filtered`` is the remaining
        rows with a fresh 0..n-1 index and ``prices`` is its ``Price`` column.
    """
    too_cheap = train['Price'] < 5
    kept = train.loc[~too_cheap].reset_index(drop=True)
    return kept, kept['Price']
def two_col_groupby(train, test, col1, col2):
    """Add the mean ``Price`` of each ``(col1, col2)`` pair to both frames.

    The mean of ``train['Price']`` is computed for every pair of values seen
    in ``train[[col1, col2]]`` and written to a new column named
    ``Mean_{col1}_{col2}`` in ``train`` and ``test``. Rows whose pair was
    not seen in ``train`` get no value; in ``test`` those, together with
    every other missing value in the frame, are then replaced by ``-999``.

    Both frames are modified in place.

    Args:
        train: DataFrame holding ``col1``, ``col2`` and ``Price``.
        test: DataFrame holding ``col1`` and ``col2``.
        col1: Name of the first grouping column.
        col2: Name of the second grouping column.

    Returns:
        The ``(train, test)`` pair, i.e. the same objects that were passed in.
    """
    feature = f'Mean_{col1}_{col2}'
    keys = [col1, col2]
    means = train.groupby(keys)['Price'].mean().rename(feature).reset_index()
    for frame in (train, test):
        # One left merge replaces the per-group boolean-mask assignment. The
        # old code used ``.at`` with a mask, but ``.at`` is a scalar accessor
        # and only accepted masks through an undocumented fallback.
        looked_up = frame[keys].merge(means, how='left', on=keys)
        # The merge keeps the left row order but resets the index, so the
        # values are assigned back positionally.
        frame[feature] = looked_up[feature].to_numpy()
    test.fillna(-999, inplace=True)
    return train, test
def leak(train, test):
    """Copy known prices from ``train`` onto ``test`` rows sharing an ``ID``.

    For every row of ``train`` whose ``ID`` also occurs in ``test``, the
    row's ``Price`` is written into the ``Price`` column of all ``test`` rows
    with that ``ID``. When an ``ID`` occurs several times in ``train``, the
    last occurrence wins. ``test`` is modified in place; ``train`` is not
    changed.

    Args:
        train: DataFrame with ``ID`` and ``Price`` columns.
        test: DataFrame with an ``ID`` column; ``Price`` is overwritten for
            matching rows.

    Returns:
        The ``(train, test)`` pair, i.e. the same objects that were passed in.
    """
    # Select the overlapping rows once instead of filtering train twice.
    shared = train.loc[train['ID'].isin(test['ID']), ['ID', 'Price']]
    for idx, price in zip(shared['ID'], shared['Price']):
        # ``.loc`` is the accessor for boolean-mask assignment; ``.at`` only
        # supports a single row/column label pair.
        test.loc[test['ID'] == idx, 'Price'] = price
    return train, test
def FE(train, test):
    """Run the feature-engineering steps on a train/test pair.

    Steps, in order:
      1. Add a ``Turbo`` column by applying ``is_turbo`` (defined elsewhere
         in this file) to each ``Engine volume`` value.
      2. Convert ``Engine volume`` to float after removing the literal text
         ``'Turbo'`` from its string form.
      3. Add per-group price statistics via ``groupby`` for a fixed list of
         single columns.
      4. Add pairwise mean-price features via ``two_col_groupby``.

    Several other experiments (a Manufacturer/Model/year key, an ID prefix,
    km per year, vehicle age, engine volume x mileage, outlier dropping) were
    tried here previously and are disabled.

    Args:
        train: DataFrame including a ``Price`` column.
        test: DataFrame with the same feature columns.

    Returns:
        ``(train_without_price, test)``.
    """
    def _volume_as_float(raw):
        return float(str(raw).replace('Turbo', ''))

    # Flag turbo engines before the marker text is stripped from the column.
    for frame in (train, test):
        frame['Turbo'] = frame['Engine volume'].apply(is_turbo)
    for frame in (train, test):
        frame['Engine volume'] = frame['Engine volume'].apply(_volume_as_float)

    single_keys = (
        'Prod. year',
        'Fuel type',
        'Gear box type',
        'Drive wheels',
        'Levy',
        'Airbags',
    )
    for key in single_keys:
        train, test = groupby(train, test, key)

    pair_keys = (
        ('Prod. year', 'Category'),
        ('Prod. year', 'Fuel type'),
    )
    for first, second in pair_keys:
        train, test = two_col_groupby(train, test, first, second)

    features_only = train.drop(columns=['Price'])
    return features_only, test
def enc_cols(train, y_train, test, cols):
    """Replace ``cols`` in both frames with their ``TargetEncoder`` output.

    A ``TargetEncoder`` is fitted on ``train[cols]`` against ``y_train`` and
    the same fitted encoder is then applied to ``test[cols]``. Both frames
    are modified in place.

    Args:
        train: Training DataFrame containing ``cols``.
        y_train: Target values passed to the encoder's ``fit_transform``.
        test: DataFrame containing ``cols`` to be transformed.
        cols: Column labels to encode.

    Returns:
        The ``(train, test)`` pair, i.e. the same objects that were passed in.
    """
    encoder = TargetEncoder(cols=cols)
    encoded_train = encoder.fit_transform(train[cols], y_train)
    train[cols] = encoded_train
    encoded_test = encoder.transform(test[cols])
    test[cols] = encoded_test
    return train, test
from sklearn.model_selection import KFold | |
def cross_val(X, y, models, weights, metric=rmlse, folds=10):
    """Cross-validate a weighted blend of models with per-fold features.

    For each split produced by ``KFold(n_splits=folds)`` the function:
      1. builds features with ``FE`` using only that fold's training part,
      2. target-encodes the object-dtype columns with ``enc_cols``,
      3. fits every model and sums ``model.predict(X_val) * weight``,
      4. overwrites predictions for validation rows whose ``ID`` also occurs
         in the training part (``leak``),
      5. scores the result with ``metric`` and prints the score.

    NOTE(review): ``y[tr_in]`` and the ``pd.concat(..., axis=1)`` below appear
    to assume ``y`` is a pandas Series named ``'Price'`` that shares ``X``'s
    index, with index labels equal to positions -- confirm against callers.

    Args:
        X: Feature DataFrame.
        y: Target values, indexable by the integer arrays from ``KFold``.
        models: Estimators exposing ``fit`` and ``predict``.
        weights: One blend weight per model, in the same order as ``models``.
        metric: Callable ``metric(y_true, y_pred)``; lower is better.
        folds: Number of folds.

    Returns:
        ``(mean_score, best_model, X_train)``. ``best_model`` is a snapshot
        of the last model in ``models`` as fitted on the lowest-scoring fold,
        or ``None`` if no fold scored below 10000. ``X_train`` is the last
        fold's engineered training frame, including its ``Price`` column.
    """
    import copy

    scores = []
    best_model = None
    best_sc = 10000
    for tr_in, val_in in KFold(n_splits=folds).split(X, y):
        X_train, y_train = X.iloc[tr_in, :], y[tr_in]
        X_val, y_val = X.iloc[val_in, :], y[val_in]

        # FE needs the target next to the features to build price statistics.
        X_train, X_val = FE(
            pd.concat([X_train, pd.DataFrame(y_train, columns=['Price'])], axis=1),
            X_val,
        )
        cols = X_train.columns[X_train.dtypes == 'object']
        X_train, X_val = enc_cols(X_train, y_train, X_val, cols)
        # A hand-picked feature subset and a few column drops were tried at
        # this point previously and are disabled.

        y_hat = np.zeros(y_val.shape)
        for model, weight in zip(models, weights):
            model.fit(X_train, y_train)
            y_hat = y_hat + (model.predict(X_val) * weight)

        X_train['Price'] = y_train
        X_val['Price'] = y_hat
        X_train, X_val = leak(X_train, X_val)

        sc = metric(y_val, X_val['Price'])
        print(sc)
        if sc < best_sc:
            best_sc = sc
            # Snapshot the fitted state: the same model objects are refit on
            # every later fold, which would otherwise overwrite it.
            best_model = copy.deepcopy(model)
        scores.append(sc)
    return np.mean(scores), best_model, X_train
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment