Last active
August 26, 2022 06:41
-
-
Save NimaSarajpoor/7537922048c6be2007810e89642f6906 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import stuff!
# just read the code and see what needs to be imported :)
def get_X_and_y(df, target):
    """Split a DataFrame into a feature frame and a target.

    Args:
        df: pandas DataFrame holding the features and, optionally, the
            target column(s).
        target: column label (or list of labels) to separate from ``df``.

    Returns:
        ``(X, y)`` where ``X`` is ``df`` without ``target`` and ``y`` is
        ``df[target]``. If ``target`` is not present in ``df`` the input
        frame itself and ``None`` are returned: ``(df, None)``.
    """
    try:
        y = df[target]
    except KeyError:
        # Only a missing target column is an expected situation (e.g. an
        # unlabeled test set); any other error should surface to the caller.
        return df, None
    return df.drop(columns=target), y
############################################ | |
# Load the train/test CSV files and separate the target column from the
# features.
df_train = pd.read_csv("./train.csv")
df_test = pd.read_csv("./test.csv")
target = "Survived"
X_train, y_train = get_X_and_y(df_train, target)
X_test, y_test = get_X_and_y(df_test, target)
if y_test is None:
    print("Ground truth for y_test is not provided.")
#############################################
# Throw out garbage columns! As a fun exercise, you can
# embed this step in the preprocess pipeline.
garbage_cols = ['PassengerId', 'Name', 'Ticket']
# Rebind instead of dropping in place: when the target column is missing,
# get_X_and_y returns the input frame itself, so an in-place drop would
# also strip these columns from df_test.
X_train = X_train.drop(columns=garbage_cols)
X_test = X_test.drop(columns=garbage_cols)
############################################## | |
# pipelines | |
class extract_first_letter(BaseEstimator, TransformerMixin):
    """Transformer that replaces every string value by its first character.

    Expects a pandas DataFrame as input (it relies on ``X.columns`` and
    ``Series.apply``). Non-string values, such as NaN, are left untouched.
    """

    def __init__(self):
        # you may add a parameter to your transformer
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        # you may add some check on X
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with strings reduced to their first letter.

        ``Series.apply`` returns a new Series rather than modifying the
        column, so the result has to be assigned back; otherwise this step
        hands ``X`` through unchanged.
        """
        # Work on a copy so the caller's frame is not modified.
        X = X.copy()
        for col in X.columns:
            # x[:1] equals x[0] for non-empty strings and does not raise
            # on an empty string.
            X[col] = X[col].apply(
                lambda x: x[:1] if isinstance(x, str) else x
            )
        return X
# One small pipeline per column that needs its own treatment.

# 'Sex': one-hot encoding only.
pipe_sex = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='if_binary')),
])

# 'Age': median imputation, with an indicator column for imputed entries.
pipe_age = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
])

# 'Cabin': custom first-letter step, then fill missing entries with the
# constant "missing", then one-hot encode.
pipe_cabin = Pipeline(steps=[
    ('extract_first_letter', extract_first_letter()),
    ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# 'Embarked': fill missing entries with the most frequent value, then
# one-hot encode.
pipe_Embarked = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])
############################################## | |
# Route each column to its dedicated pipeline; every column that is not
# listed is passed through as is.
ct_preprocessor = ColumnTransformer(
    transformers=[
        ('pipe_sex', pipe_sex, ['Sex']),
        ('pipe_age', pipe_age, ['Age']),
        ('pipe_cabine', pipe_cabin, ['Cabin']),
        ('pipe_Embarked', pipe_Embarked, ['Embarked']),
    ],
    remainder='passthrough',
)

# Full preprocessing: column-wise transforms, a mean imputer as a safety net
# for missing values that may only show up in the test set, then scaling
# without centering.
preprocessor = Pipeline(steps=[
    ('ct_preprocessor', ct_preprocessor),
    ('imputer', SimpleImputer(strategy="mean")),
    ('scaler', StandardScaler(with_mean=False)),
])

# Named (name, estimator) steps of the final model pipeline.
pipe_precessor = ('preprocessor', preprocessor)
pipe_estimator = ('classifier', RandomForestClassifier())

pipe = Pipeline(steps=[pipe_precessor, pipe_estimator])
#################################### | |
# Score the whole pipeline with cross-validated accuracy on the training
# data and print one score per fold.
cv = 5
cv_scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=cv,
)
print(f'cv_scores for cv={cv}: \n', cv_scores)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.base import BaseEstimator, TransformerMixin | |
class extract_first_letter(BaseEstimator, TransformerMixin):
    """Transformer that replaces every string value by its first character.

    Expects a pandas DataFrame as input (it relies on ``X.columns`` and
    ``Series.apply``). Non-string values, such as NaN, are left untouched.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with strings reduced to their first letter.

        ``Series.apply`` returns a new Series rather than modifying the
        column, so the result has to be assigned back; otherwise this step
        hands ``X`` through unchanged.
        """
        # Work on a copy so the caller's frame is not modified.
        X = X.copy()
        for col in X.columns:
            # x[:1] equals x[0] for non-empty strings and does not raise
            # on an empty string.
            X[col] = X[col].apply(
                lambda x: x[:1] if isinstance(x, str) else x
            )
        return X
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# One small pipeline per column that needs its own treatment.

# 'Sex': one-hot encoding only.
pipe_sex = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='if_binary')),
])

# 'Age': median imputation, with an indicator column for imputed entries.
pipe_age = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
])

# 'Cabin': custom first-letter step, then fill missing entries with the
# constant "missing", then one-hot encode.
pipe_cabin = Pipeline(steps=[
    ('extract_first_letter', extract_first_letter()),
    ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# 'Embarked': fill missing entries with the most frequent value, then
# one-hot encode.
pipe_Embarked = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])
############################################## | |
# Route each column to its dedicated pipeline; every column that is not
# listed is passed through as is.
ct_preprocessor = ColumnTransformer(
    transformers=[
        ('pipe_sex', pipe_sex, ['Sex']),
        ('pipe_age', pipe_age, ['Age']),
        ('pipe_cabine', pipe_cabin, ['Cabin']),
        ('pipe_Embarked', pipe_Embarked, ['Embarked']),
    ],
    remainder='passthrough',
)

# Full preprocessing: column-wise transforms followed by scaling without
# centering.
# NOTE(review): no imputer follows the column transformer in this variant,
# so missing values in passthrough columns are not filled here -- confirm
# this is intended.
preprocessor = Pipeline(steps=[
    ('ct_preprocessor', ct_preprocessor),
    ('scaler', StandardScaler(with_mean=False)),
])

# Named (name, estimator) steps of the final model pipeline.
pipe_precessor = ('preprocessor', preprocessor)
pipe_estimator = ('classifier', RandomForestClassifier())

pipe = Pipeline(steps=[pipe_precessor, pipe_estimator])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
first gist