Skip to content

Instantly share code, notes, and snippets.

@NimaSarajpoor
Last active August 26, 2022 06:41
Show Gist options
  • Save NimaSarajpoor/7537922048c6be2007810e89642f6906 to your computer and use it in GitHub Desktop.
Save NimaSarajpoor/7537922048c6be2007810e89642f6906 to your computer and use it in GitHub Desktop.
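# Imports are intentionally omitted from this snippet. Before running it,
# import pandas as pd and the scikit-learn names used below: BaseEstimator,
# TransformerMixin, Pipeline, ColumnTransformer, OneHotEncoder, SimpleImputer,
# StandardScaler, RandomForestClassifier and cross_val_score.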
def get_X_and_y(df, target):
    """Split a DataFrame into features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may or may not contain the target column.
    target : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(df without the target column, df[target])`` when the target is
        present; ``(df, None)`` when it is not (``df`` is returned as-is,
        not copied).
    """
    try:
        return df.drop(columns=target), df[target]
    except KeyError:
        # Only a missing column means "no ground truth available"; any other
        # error (e.g. df is not a DataFrame) should surface to the caller.
        return df, None
############################################
# Load the train/test splits from CSV files in the working directory.
df_train = pd.read_csv("./train.csv")
df_test = pd.read_csv("./test.csv")

target = "Survived"
X_train, y_train = get_X_and_y(df_train, target)
X_test, y_test = get_X_and_y(df_test, target)
if y_test is None:
    print("Ground truth for y_test is not provided.")

#############################################
# Throw out garbage columns! As a fun exercise, you can
# embed this step in the preprocess pipeline.
garbage_cols = ['PassengerId', 'Name', 'Ticket']
# Reassign instead of dropping in place: when the target column is absent,
# get_X_and_y returns the input frame itself, and an in-place drop would
# then also strip these columns from df_test.
X_train = X_train.drop(columns=garbage_cols)
X_test = X_test.drop(columns=garbage_cols)
##############################################
# pipelines
class extract_first_letter(BaseEstimator, TransformerMixin):
    """Transformer that replaces every string cell with its first character.

    Cells that are not ``str`` instances are returned unchanged. The input is
    expected to expose ``.columns`` and ``.copy()`` (i.e. a DataFrame).
    """

    def __init__(self):
        # No hyper-parameters yet; you may add a parameter to your transformer.
        pass

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        # You may add some check on X here.
        return self

    @staticmethod
    def _first_letter(value):
        """Return the first character of a string, or ``value`` itself otherwise."""
        # Slicing (not indexing) so that an empty string yields "" rather
        # than raising IndexError.
        return value[:1] if isinstance(value, str) else value

    def transform(self, X, y=None):
        """Return a copy of ``X`` with string cells cut down to their first character."""
        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()
        for col in X.columns:
            # Series.apply returns a new Series; it must be assigned back,
            # otherwise the transformation is silently discarded.
            X[col] = X[col].apply(self._first_letter)
        return X
# Sex: one-hot encode, dropping one column when the feature is binary.
pipe_sex = Pipeline(steps=[("ohe", OneHotEncoder(drop="if_binary"))])

# Age: median imputation plus a missing-value indicator.
pipe_age = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median", add_indicator=True))]
)

# Cabin: run the custom extract_first_letter step, fill missing entries with
# the literal "missing", then one-hot encode.
pipe_cabin = Pipeline(
    steps=[
        ("extract_first_letter", extract_first_letter()),
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Embarked: most-frequent imputation, then one-hot encode.
pipe_Embarked = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)
##############################################
# Route each listed column to its own pipeline; every other column is
# passed through.
ct_preprocessor = ColumnTransformer(
    transformers=[
        ("pipe_sex", pipe_sex, ["Sex"]),
        ("pipe_age", pipe_age, ["Age"]),
        ("pipe_cabine", pipe_cabin, ["Cabin"]),
        ("pipe_Embarked", pipe_Embarked, ["Embarked"]),
    ],
    remainder="passthrough",
)

# Column-wise preprocessing, then a mean imputer (guards against missing
# values that may only show up in the test set), then scaling without
# centering.
preprocessor = Pipeline(
    steps=[
        ("ct_preprocessor", ct_preprocessor),
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

# Named (step, estimator) pairs that make up the final model pipeline.
pipe_precessor = ("preprocessor", preprocessor)
pipe_estimator = ("classifier", RandomForestClassifier())
pipe = Pipeline(steps=[pipe_precessor, pipe_estimator])
####################################
# Score the full pipeline with cross-validation on the training data.
cv = 5
cv_scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=cv,
)
print(f"cv_scores for cv={cv}: \n", cv_scores)
from sklearn.base import BaseEstimator, TransformerMixin
class extract_first_letter(BaseEstimator, TransformerMixin):
    """Transformer that replaces every string cell with its first character.

    Cells that are not ``str`` instances are returned unchanged. The input is
    expected to expose ``.columns`` and ``.copy()`` (i.e. a DataFrame).
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    @staticmethod
    def _first_letter(value):
        """Return the first character of a string, or ``value`` itself otherwise."""
        # Slicing (not indexing) so that an empty string yields "" rather
        # than raising IndexError.
        return value[:1] if isinstance(value, str) else value

    def transform(self, X, y=None):
        """Return a copy of ``X`` with string cells cut down to their first character."""
        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()
        for col in X.columns:
            # Series.apply returns a new Series; it must be assigned back,
            # otherwise the transformation is silently discarded.
            X[col] = X[col].apply(self._first_letter)
        return X
# Sex: one-hot encode, dropping one column when the feature is binary.
pipe_sex = Pipeline(steps=[("ohe", OneHotEncoder(drop="if_binary"))])

# Age: median imputation plus a missing-value indicator.
pipe_age = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median", add_indicator=True))]
)

# Cabin: run the custom extract_first_letter step, fill missing entries with
# the literal "missing", then one-hot encode.
pipe_cabin = Pipeline(
    steps=[
        ("extract_first_letter", extract_first_letter()),
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Embarked: most-frequent imputation, then one-hot encode.
pipe_Embarked = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)
##############################################
# Route each listed column to its own pipeline; every other column is
# passed through.
ct_preprocessor = ColumnTransformer(
    transformers=[
        ("pipe_sex", pipe_sex, ["Sex"]),
        ("pipe_age", pipe_age, ["Age"]),
        ("pipe_cabine", pipe_cabin, ["Cabin"]),
        ("pipe_Embarked", pipe_Embarked, ["Embarked"]),
    ],
    remainder="passthrough",
)

# Column-wise preprocessing followed by scaling without centering.
preprocessor = Pipeline(
    steps=[
        ("ct_preprocessor", ct_preprocessor),
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

# Named (step, estimator) pairs that make up the final model pipeline.
pipe_precessor = ("preprocessor", preprocessor)
pipe_estimator = ("classifier", RandomForestClassifier())
pipe = Pipeline(steps=[pipe_precessor, pipe_estimator])
@NimaSarajpoor
Copy link
Author

first gist

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment