Skip to content

Instantly share code, notes, and snippets.

@vidit0210
Created February 29, 2020 20:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vidit0210/0076076ee89ed94477c5bb9e43d767b4 to your computer and use it in GitHub Desktop.
Save vidit0210/0076076ee89ed94477c5bb9e43d767b4 to your computer and use it in GitHub Desktop.
Data link: https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease
# Import necessary modules
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer

# The numeric imputer class was previously referenced as `Imputer` without
# being imported here. `sklearn.preprocessing.Imputer` was removed in
# scikit-learn 0.22; `SimpleImputer` is its replacement. Fall back to the
# legacy class only on installs that predate `sklearn.impute`.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# NOTE(review): `X` is expected to be a pandas DataFrame of features defined
# before this section runs — confirm against the surrounding notebook.
# NOTE(review): CategoricalImputer appears to have been dropped from newer
# sklearn-pandas releases — confirm the pinned version still provides it.

# Check number of nulls in each feature column
nulls_per_column = X.isnull().sum()
print(nulls_per_column)

# Create a boolean mask for categorical columns (object dtype)
categorical_feature_mask = X.dtypes == object

# Get list of categorical column names
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Get list of non-categorical column names
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Apply numeric imputer: one median imputer per numeric column. The column
# name is wrapped in a list so the mapper hands the imputer a 2-D selection.
numeric_imputation_mapper = DataFrameMapper(
    [
        ([numeric_feature], SimpleImputer(strategy="median"))
        for numeric_feature in non_categorical_columns
    ],
    input_df=True,
    df_out=True,
)

# Apply categorical imputer: one CategoricalImputer per object-dtype column,
# selected by bare column name.
categorical_imputation_mapper = DataFrameMapper(
    [
        (category_feature, CategoricalImputer())
        for category_feature in categorical_columns
    ],
    input_df=True,
    df_out=True,
)
-----
Kidney disease case study II: Feature Union
-----
# Import FeatureUnion
from sklearn.pipeline import FeatureUnion
# Join the numeric and categorical imputation mappers into a single
# transformer; each entry is a (step name, transformer) pair.
numeric_categorical_union = FeatureUnion(
    transformer_list=[
        ("num_mapper", numeric_imputation_mapper),
        ("cat_mapper", categorical_imputation_mapper),
    ]
)
------
Kidney disease case study III: Full pipeline
------
# Assemble the end-to-end pipeline: imputation union -> Dictifier ->
# DictVectorizer -> XGBoost classifier (the final step is named "clf").
pipeline = Pipeline(
    steps=[
        ("featureunion", numeric_categorical_union),
        ("dictifier", Dictifier()),
        ("vectorizer", DictVectorizer(sort=False)),
        ("clf", xgb.XGBClassifier(max_depth=3)),
    ]
)

# Score the whole pipeline with 3-fold cross-validation on ROC AUC.
# NOTE(review): this section feeds `kidney_data` while other sections use
# `X` — confirm both refer to the same feature frame.
cross_val_scores = cross_val_score(
    estimator=pipeline,
    X=kidney_data,
    y=y,
    scoring="roc_auc",
    cv=3,
)

# Report the mean AUC across the folds
print("3-fold AUC: ", np.mean(cross_val_scores))
------
Bringing it all together
------
# Hyperparameter candidates for the pipeline's "clf" step; the "clf__"
# prefix routes each setting to that step.
gbm_param_grid = {
    "clf__learning_rate": np.arange(0.05, 1, 0.05),
    "clf__max_depth": np.arange(3, 10),
    "clf__n_estimators": np.arange(50, 200, 50),
}

# Randomized search over the grid: 2 sampled settings, 2-fold CV, ROC AUC
randomized_roc_auc = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=gbm_param_grid,
    n_iter=2,
    scoring="roc_auc",
    cv=2,
    verbose=1,
)

# Run the search on the feature frame and target
randomized_roc_auc.fit(X, y)

# Show the best cross-validated score and the estimator that produced it
print(randomized_roc_auc.best_score_)
print(randomized_roc_auc.best_estimator_)
------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment