Nov05/2019-10-20 random forest.md

## 2019-10-20 random forest.md

      
    Raw
  

              2019-10-20 random forest.md
            
          
    Baseline

# Baseline: predict the training-set mean for every validation row and
# measure the mean squared error of that constant guess.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, random_state=42, test_size=0.33)
train_mean = y_train.mean()
print("train mean velocity:", train_mean)
# One copy of the training mean per validation target.
y_val_pred = [train_mean for _ in range(len(y_val))]
baseline_mse = mean_squared_error(y_val, y_val_pred)
print('baseline error score:', baseline_mse)

Random Forest

%%time
numeric_features = features
lowcardi_features = []
highcardi_features = []

###############################################
# Pipeline preprocessor
###############################################
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', ce.OneHotEncoder(drop_invariant=True, use_cat_names=True))]) 
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', ce.OrdinalEncoder())])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('one', onehot_transformer, lowcardi_features), # categorical_features
        ('ord', ordinal_transformer, highcardi_features), # categorical_features
    ])
###############################################
# Random Forest with Cross Validation
###############################################
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('rf', RandomForestRegressor(n_estimators=10, 
                                                        max_depth=20,
                                                        random_state=5, 
                                                        n_jobs=-1, 
                                                        oob_score=True,)),
                          ])
params = {
    'rf__n_estimators': [10, 15, 20],
    'rf__max_depth': [1, 2, 3, 5],
#     'rf__max_features': ['auto', 'sqrt', 'log2'],
#     'rf__criterion': ['mse', 'mae']
    }
search = GridSearchCV(
    pipeline, 
    params, 
    return_train_score=True,
    cv=5)
search.fit(X_train[features], y_train)

Best params and prediction

# Winning grid point and its mean cross-validated score. No `scoring=` is
# passed to GridSearchCV, so this is the estimator's default score, which is
# not on the same scale as the mean squared error printed below.
best_params = search.best_params_
best_cv_score = search.best_score_
print('Best hyperparameters', best_params)
print('Cross-validation best score', best_cv_score)

# Evaluate the search's refit best model on the held-out validation split.
y_val_pred = search.predict(X_val[features])
val_mse = mean_squared_error(y_val, y_val_pred)
print('Random Forest error score:', val_mse)

Feature importance

###############################################
# eli5 permutation importance
###############################################
# Explain the model that was actually selected and scored above. GridSearchCV
# refits the best pipeline on the training data by default, so reuse it rather
# than re-fitting `pipeline`, which still carries its un-tuned constructor
# settings (n_estimators=10, max_depth=20).
best_pipeline = search.best_estimator_

# cv='prefit' tells eli5 the forest is already trained; importances are then
# measured by shuffling each column of the data passed to fit() below.
permuter = PermutationImportance(best_pipeline['rf'],
                                 scoring='neg_mean_squared_error',
                                 cv='prefit',
                                 n_iter=2,
                                 random_state=5)
# The forest was trained on preprocessed columns, so the validation data must
# go through the same fitted preprocessor before permutation.
permuter.fit(best_pipeline['preprocessor'].transform(X_val[features]), y_val)

# Transformed feature names. Only the numeric branch has columns (both
# categorical groups are empty), so the output columns line up with
# numeric_features one-to-one.
features_transformed = numeric_features
eli5.show_weights(permuter, top=None, feature_names=features_transformed)

Go to the repo

https://github.com/Nov05/playground-fireball