Skip to content

Instantly share code, notes, and snippets.

@JarrydWannenburg
Last active August 26, 2022 18:44
Show Gist options
  • Save JarrydWannenburg/2996ecb99ec824f065e7688c7742aa81 to your computer and use it in GitHub Desktop.
Save JarrydWannenburg/2996ecb99ec824f065e7688c7742aa81 to your computer and use it in GitHub Desktop.
Scalable_Pipeline_Article
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    """Engineer categorical features from a passenger DataFrame.

    The transformer is stateless (``fit`` learns nothing). ``transform``
    optionally:

    * drops the columns named in ``excluded_features``;
    * splits ``Cabin`` ("deck/num/side") into ``Deck``, ``Deck2`` (a copy
      of ``Deck``) and ``Side`` -- the cabin number is discarded;
    * splits ``PassengerId`` ("group_passenger") into ``Group`` and
      ``Passenger``;
    * adds 'Yes'/'No' string versions of ``CryoSleep`` and ``VIP`` as
      ``CryoSleep_Yes`` and ``VIP_Yes`` (the source columns are kept);

    and finally fills every remaining missing value with the mode of its
    column.

    Parameters
    ----------
    sep_passengerId : bool, default True
        Split ``PassengerId`` into ``Group`` / ``Passenger`` and drop it.
    sep_cabin : bool, default True
        Split ``Cabin`` into ``Deck`` / ``Deck2`` / ``Side`` and drop it.
    convert_bools : bool, default True
        Add the ``CryoSleep_Yes`` and ``VIP_Yes`` columns.
    excluded_features : str or iterable of str, default ('Name',)
        Column label(s) to drop before any other processing. An empty
        iterable (or ``None``) drops nothing.
    """

    # Placeholder written into missing Cabin / PassengerId parts so the
    # split helpers always receive a string; turned back into NaN afterwards.
    _PLACEHOLDER = 'NONE'

    def __init__(
        self,
        sep_passengerId=True,
        sep_cabin=True,
        convert_bools=True,
        excluded_features=('Name',),
    ):
        # Parameters are stored untouched; any normalisation happens in
        # transform() so get_params()/set_params() round-trip cleanly.
        self.sep_passengerId = sep_passengerId
        self.sep_cabin = sep_cabin
        self.convert_bools = convert_bools
        self.excluded_features = excluded_features

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no state."""
        return self

    def get_deck(self, obj):
        """Return the first '/'-separated part of a cabin string."""
        return obj.split('/')[0]

    def get_num(self, obj):
        """Return the second '/'-separated part of a cabin string."""
        return obj.split('/')[1]

    def get_side(self, obj):
        """Return the third '/'-separated part of a cabin string."""
        return obj.split('/')[2]

    def get_group(self, obj):
        """Return the part of a passenger id before the first '_'."""
        return obj.split('_')[0]

    def get_passenger(self, obj):
        """Return the part of a passenger id after the first '_'."""
        return obj.split('_')[1]

    def get_dummy(self, obj):
        """Map a truthy flag to 'Yes', a falsy flag to 'No', else NaN.

        Equality (not identity) is deliberate: it also matches numpy
        booleans and the integers 1 / 0. NaN compares unequal to both
        and therefore stays missing.
        """
        if obj == True:  # noqa: E712
            return 'Yes'
        if obj == False:  # noqa: E712
            return 'No'
        return np.nan

    def _placeholder_to_nan(self, series):
        """Return ``series`` with the placeholder string replaced by NaN."""
        return series.mask(series == self._PLACEHOLDER)

    def _excluded_columns(self):
        """Return ``excluded_features`` normalised to a list of labels."""
        excluded = self.excluded_features
        if excluded is None:
            return []
        if isinstance(excluded, str):
            # A bare string is one column label, not an iterable of chars.
            return [excluded]
        return list(excluded)

    def transform(self, X, y=None):
        """Return a transformed copy of ``X``; ``X`` itself is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column required by the enabled options
            (``Cabin``, ``PassengerId``, ``CryoSleep``, ``VIP``) and every
            column listed in ``excluded_features``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            New frame with the engineered columns and mode-imputed values.

        Raises
        ------
        KeyError
            If a required or excluded column is missing from ``X``.
        IndexError
            If a non-missing ``Cabin`` / ``PassengerId`` value has fewer
            separator-delimited parts than expected.
        """
        # Work on a copy: mutating the caller's frame makes a second
        # transform() call fail on the columns dropped by the first one.
        X = X.copy()

        excluded = self._excluded_columns()
        if excluded:
            X = X.drop(columns=excluded)

        if self.sep_cabin:
            cabin = X['Cabin'].fillna('/'.join([self._PLACEHOLDER] * 3))
            deck = self._placeholder_to_nan(cabin.apply(self.get_deck))
            X['Deck'] = deck
            # Deck2 duplicates Deck; kept so the output columns are unchanged.
            X['Deck2'] = deck
            X['Side'] = self._placeholder_to_nan(cabin.apply(self.get_side))
            X = X.drop(columns=['Cabin'])

        if self.sep_passengerId:
            passenger_id = X['PassengerId'].fillna(
                '_'.join([self._PLACEHOLDER] * 2)
            )
            X['Group'] = self._placeholder_to_nan(
                passenger_id.apply(self.get_group)
            )
            X['Passenger'] = self._placeholder_to_nan(
                passenger_id.apply(self.get_passenger)
            )
            X = X.drop(columns=['PassengerId'])

        if self.convert_bools:
            X['CryoSleep_Yes'] = X['CryoSleep'].apply(self.get_dummy)
            X['VIP_Yes'] = X['VIP'].apply(self.get_dummy)

        # Impute by explicit assignment: a chained `X[col].fillna(...,
        # inplace=True)` only updates a temporary under pandas Copy-on-Write.
        for col in X.columns:
            modes = X[col].mode()
            if modes.empty:
                # Column is entirely missing: there is no mode to fill with.
                continue
            X[col] = X[col].fillna(modes.iloc[0])

        return X
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment