NimaSarajpoor/All_together.py

## All_together.py
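# Imports are left to the reader. The code below needs:
#   pandas (as pd)
#   sklearn.base: BaseEstimator, TransformerMixin
#   sklearn.compose: ColumnTransformer
#   sklearn.ensemble: RandomForestClassifier
#   sklearn.impute: SimpleImputer
#   sklearn.model_selection: cross_val_score
#   sklearn.pipeline: Pipeline
#   sklearn.preprocessing: OneHotEncoder, StandardScaler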

def get_X_and_y(df, target):
    """Split a DataFrame into a feature frame and a target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that may or may not contain the ``target`` column.
    target : label
        Name of the column holding the ground truth.

    Returns
    -------
    tuple
        ``(df without target, df[target])`` when the column exists,
        otherwise ``(df, None)`` with ``df`` returned unchanged.
    """
    try:
        return df.drop(columns=target), df[target]
    except KeyError:
        # Only a missing target column is an expected situation (e.g. an
        # unlabeled test set); any other error is a real bug and propagates.
        return df, None
############################################
# Read the train/test splits (paths are relative to the working directory).
df_train = pd.read_csv("./train.csv")
df_test = pd.read_csv("./test.csv")

# Peel the label column off each split, train first, then test.
target = "Survived"
(X_train, y_train), (X_test, y_test) = (
    get_X_and_y(frame, target) for frame in (df_train, df_test)
)

# get_X_and_y hands back None for y when the label column is absent.
if y_test is None:
    print("Groundtruth for y_test is not provied.")

#############################################
# Discard columns that are not used as features. As an exercise, this
# step could instead be embedded in the preprocessing pipeline.

garbage_cols = ['PassengerId', 'Name', 'Ticket']
for features in (X_train, X_test):
    # In-place drop, so the existing frame objects are the ones modified.
    features.drop(columns=garbage_cols, inplace=True)
##############################################
# pipelines

class extract_first_letter(BaseEstimator, TransformerMixin):
    """Transformer that reduces every string cell to its first character.

    Non-string cells pass through unchanged.
    """

    def __init__(self):
        # No hyper-parameters yet; add them here as explicit keyword arguments.
        pass

    def fit(self, X, y=None):
        """Learn nothing from X (the transformer is stateless); return self."""
        return self

    def transform(self, X, y=None):
        """Return a copy of X with each string value cut to its first character.

        X must expose ``.columns`` and column indexing (as a pandas DataFrame
        does). The input object itself is not modified.
        """
        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()
        for col in X.columns:
            # Series.apply returns a new Series; it has to be assigned back,
            # otherwise the extraction is silently discarded.
            # x[:1] equals x[0] for non-empty strings and tolerates "".
            X[col] = X[col].apply(lambda x: x[:1] if isinstance(x, str) else x)

        return X

# One small pipeline per raw column; ct_preprocessor below routes each
# column to its pipeline.

# Sex: one-hot encoding only.
pipe_sex = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='if_binary')),
])

# Age: median imputation plus a missing-value indicator column.
pipe_age = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
])

# Cabin: custom transformer, then constant-fill imputation, then one-hot.
pipe_cabin = Pipeline(steps=[
    ('extract_first_letter', extract_first_letter()),
    ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Embarked: most-frequent imputation, then one-hot.
pipe_Embarked = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

##############################################
# Columns not listed here are passed through untouched.
ct_preprocessor = ColumnTransformer(
    transformers=[
        ('pipe_sex', pipe_sex, ['Sex']),
        ('pipe_age', pipe_age, ['Age']),
        ('pipe_cabine', pipe_cabin, ['Cabin']),
        ('pipe_Embarked', pipe_Embarked, ['Embarked']),
    ],
    remainder='passthrough',
)

preprocessor = Pipeline(steps=[
    ('ct_preprocessor', ct_preprocessor),
    # Safety net for missing values that may only show up in the test set.
    ('imputer', SimpleImputer(strategy="mean")),
    ('scaler', StandardScaler(with_mean=False)),
])

# Named (step, estimator) pairs, assembled into the final pipeline.
pipe_precessor = ('preprocessor', preprocessor)
pipe_estimator = ('classifier', RandomForestClassifier())

pipe = Pipeline(steps=[pipe_precessor, pipe_estimator])

####################################

# Cross-validate the full pipeline on the training split, scored by accuracy.
cv = 5
cv_scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=cv,
)

print(f'cv_scores for cv={cv}: \n', cv_scores)

## my_transformer.py
from sklearn.base import BaseEstimator, TransformerMixin

class extract_first_letter(BaseEstimator, TransformerMixin):
    """Transformer that reduces every string cell to its first character.

    Non-string cells pass through unchanged.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Learn nothing from X (the transformer is stateless); return self."""
        return self

    def transform(self, X, y=None):
        """Return a copy of X with each string value cut to its first character.

        X must expose ``.columns`` and column indexing (as a pandas DataFrame
        does). The input object itself is not modified.
        """
        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()
        for col in X.columns:
            # Series.apply returns a new Series; it has to be assigned back,
            # otherwise the extraction is silently discarded.
            # x[:1] equals x[0] for non-empty strings and tolerates "".
            X[col] = X[col].apply(lambda x: x[:1] if isinstance(x, str) else x)

        return X

## pipelines.py
# One small pipeline per raw column; ct_preprocessor below routes each
# column to its pipeline.

# Sex: one-hot encoding only.
pipe_sex = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='if_binary')),
])

# Age: median imputation plus a missing-value indicator column.
pipe_age = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
])

# Cabin: custom transformer, then constant-fill imputation, then one-hot.
pipe_cabin = Pipeline(steps=[
    ('extract_first_letter', extract_first_letter()),
    ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Embarked: most-frequent imputation, then one-hot.
pipe_Embarked = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

##############################################
# Columns not listed here are passed through untouched.
ct_preprocessor = ColumnTransformer(
    transformers=[
        ('pipe_sex', pipe_sex, ['Sex']),
        ('pipe_age', pipe_age, ['Age']),
        ('pipe_cabine', pipe_cabin, ['Cabin']),
        ('pipe_Embarked', pipe_Embarked, ['Embarked']),
    ],
    remainder='passthrough',
)

preprocessor = Pipeline(steps=[
    ('ct_preprocessor', ct_preprocessor),
    ('scaler', StandardScaler(with_mean=False)),
])

# Named (step, estimator) pairs, assembled into the final pipeline.
pipe_precessor = ('preprocessor', preprocessor)
pipe_estimator = ('classifier', RandomForestClassifier())

pipe = Pipeline(steps=[pipe_precessor, pipe_estimator])
	# import stuff!
	# just read the code and see what needs to be imported :)

	def get_X_and_y(df, target):
		"""Split a DataFrame into a feature frame and a target column.

		Returns ``(df without target, df[target])`` when the column exists,
		otherwise ``(df, None)`` with ``df`` returned unchanged.
		"""
		try:
			return df.drop(columns=target), df[target]
		except KeyError:
			# Only a missing target column is an expected situation (e.g. an
			# unlabeled test set); any other error is a real bug and propagates.
			return df, None
	############################################
	# Read the train/test splits (paths are relative to the working directory).
	df_train = pd.read_csv("./train.csv")
	df_test = pd.read_csv("./test.csv")

	# Separate the label column from the features in each split.
	target = "Survived"
	X_train, y_train = get_X_and_y(df_train, target)
	X_test, y_test = get_X_and_y(df_test, target)

	# get_X_and_y hands back None for y when the label column is absent.
	if y_test is None:
		print("Groundtruth for y_test is not provied.")

	#############################################
	# Discard columns that are not used as features. As an exercise, this
	# step could instead be embedded in the preprocessing pipeline.

	garbage_cols = ['PassengerId', 'Name', 'Ticket']
	X_train.drop(columns=garbage_cols, inplace=True)
	X_test.drop(columns=garbage_cols, inplace=True)
	##############################################
	# pipelines

	class extract_first_letter(BaseEstimator, TransformerMixin):
		"""Transformer that reduces every string cell to its first character.

		Non-string cells pass through unchanged.
		"""

		def __init__(self):
			# No hyper-parameters yet; add them here as explicit keyword arguments.
			pass

		def fit(self, X, y=None):
			"""Learn nothing from X (the transformer is stateless); return self."""
			return self

		def transform(self, X, y=None):
			"""Return a copy of X with each string value cut to its first character.

			X must expose ``.columns`` and column indexing (as a pandas
			DataFrame does). The input object itself is not modified.
			"""
			# Work on a copy so the caller's frame is left untouched.
			X = X.copy()
			for col in X.columns:
				# Series.apply returns a new Series; it has to be assigned back,
				# otherwise the extraction is silently discarded.
				# x[:1] equals x[0] for non-empty strings and tolerates "".
				X[col] = X[col].apply(lambda x: x[:1] if isinstance(x, str) else x)

			return X

	# One small pipeline per raw column; ct_preprocessor below routes each
	# column to its pipeline.

	# Sex: one-hot encoding only.
	pipe_sex = Pipeline(steps=[
		('ohe', OneHotEncoder(drop='if_binary')),
	])

	# Age: median imputation plus a missing-value indicator column.
	pipe_age = Pipeline(steps=[
		('imputer', SimpleImputer(strategy='median', add_indicator=True)),
	])

	# Cabin: custom transformer, then constant-fill imputation, then one-hot.
	pipe_cabin = Pipeline(steps=[
		('extract_first_letter', extract_first_letter()),
		('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
		('ohe', OneHotEncoder(handle_unknown='ignore')),
	])

	# Embarked: most-frequent imputation, then one-hot.
	pipe_Embarked = Pipeline(steps=[
		('imputer', SimpleImputer(strategy='most_frequent')),
		('ohe', OneHotEncoder(handle_unknown='ignore')),
	])

	##############################################
	# Columns not listed here are passed through untouched.
	ct_preprocessor = ColumnTransformer(
		transformers=[
			('pipe_sex', pipe_sex, ['Sex']),
			('pipe_age', pipe_age, ['Age']),
			('pipe_cabine', pipe_cabin, ['Cabin']),
			('pipe_Embarked', pipe_Embarked, ['Embarked']),
		],
		remainder='passthrough',
	)

	preprocessor = Pipeline(steps=[
		('ct_preprocessor', ct_preprocessor),
		# Safety net for missing values that may only show up in the test set.
		('imputer', SimpleImputer(strategy="mean")),
		('scaler', StandardScaler(with_mean=False)),
	])

	# Named (step, estimator) pairs, assembled into the final pipeline.
	pipe_precessor = ('preprocessor', preprocessor)
	pipe_estimator = ('classifier', RandomForestClassifier())

	pipe = Pipeline(steps=[pipe_precessor, pipe_estimator])

	####################################

	# Cross-validate the full pipeline on the training split, scored by accuracy.
	cv = 5
	cv_scores = cross_val_score(
		estimator=pipe,
		X=X_train,
		y=y_train,
		scoring='accuracy',
		cv=cv,
	)

	print(f'cv_scores for cv={cv}: \n', cv_scores)
	from sklearn.base import BaseEstimator, TransformerMixin

	class extract_first_letter(BaseEstimator, TransformerMixin):
		"""Transformer that reduces every string cell to its first character.

		Non-string cells pass through unchanged.
		"""

		def __init__(self):
			# No hyper-parameters to store.
			pass

		def fit(self, X, y=None):
			"""Learn nothing from X (the transformer is stateless); return self."""
			return self

		def transform(self, X, y=None):
			"""Return a copy of X with each string value cut to its first character.

			X must expose ``.columns`` and column indexing (as a pandas
			DataFrame does). The input object itself is not modified.
			"""
			# Work on a copy so the caller's frame is left untouched.
			X = X.copy()
			for col in X.columns:
				# Series.apply returns a new Series; it has to be assigned back,
				# otherwise the extraction is silently discarded.
				# x[:1] equals x[0] for non-empty strings and tolerates "".
				X[col] = X[col].apply(lambda x: x[:1] if isinstance(x, str) else x)

			return X