celik-muhammed/GetDummies.py

## GetDummies.py
# get_dummies.py
# Save your custom class in a Python script (.py file), then import it
# so that it can be used with pickle.load().
# This is a common approach for storing and reusing custom classes across different scripts or projects.
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class GetDummies(BaseEstimator, TransformerMixin):
    """
    Transformer that creates dummy variables from categorical columns with a separator.

    ``fit`` selects every object-dtype column in which at least one value contains
    ``data_sep`` and records the resulting output column layout. ``transform`` adds one
    indicator column per category, named ``<prefix><col_name_sep><category>``, and
    reindexes the result to the layout learned in ``fit``.

    Parameters:
        - data_sep (str): Default ','. Separator used to split categorical values into multiple dummy variables.
        - col_name_sep (str): Default '_'. Separator placed between the prefix and the category in the output column names.
    Attributes:
        - columns (list): Output column names, set by ``fit``.
        - dummy_cols (list): Names of the input columns that get expanded, set by ``fit``.
        - dummy_prefix (list): Prefix for each entry of ``dummy_cols``, set by ``fit``.
    """

    def __init__(self, data_sep=',', col_name_sep='_'):
        self.data_sep     = data_sep
        self.col_name_sep = col_name_sep
        self.columns      = []
        self.dummy_cols   = []
        self.dummy_prefix = []

    def _prefix_for(self, col):
        """Return the prefix for ``col``: its first two characters, or the initials of its ``col_name_sep``-separated parts."""
        if self.col_name_sep not in col:
            return col[:2]
        # Skip empty parts (leading, trailing or doubled separators) so part[0] cannot fail
        return ''.join(part[0] for part in col.split(self.col_name_sep) if part)

    def _make_dummies(self, X):
        """Build the prefixed indicator columns for every column listed in ``dummy_cols``."""
        frames = [
            X[col].str.get_dummies(sep=self.data_sep).add_prefix(pre + self.col_name_sep)
            for pre, col in zip(self.dummy_prefix, self.dummy_cols)
        ]
        return pd.concat(frames, axis=1)

    def fit(self, X, y=None):
        """
        Fit the transformer to the data.
        Parameters:
            - X (pandas.DataFrame): Input data with categorical columns.
            - y (array-like): Target variable (ignored).
        Returns:
            - self: Returns the transformer object.
        """
        object_cols = X.select_dtypes(include="O").columns
        # regex=False: str.get_dummies treats the separator as a literal string, so the
        # detection must do the same (a regex such as '|' would match every value).
        # na=False: missing or non-string values count as "no separator found".
        self.dummy_cols = [
            col for col in object_cols
            if X[col].str.contains(self.data_sep, regex=False, na=False).any()
        ]
        self.dummy_prefix = [self._prefix_for(col) for col in self.dummy_cols]

        if self.dummy_cols:
            # Apply dummy to train data to learn the output column layout
            dummy_frames_df = self._make_dummies(X)
            self.columns    = X.join(dummy_frames_df).drop(columns=self.dummy_cols).columns.tolist()
        else:
            self.columns = X.columns.tolist()
        return self

    def transform(self, X, y=None):
        """
        Transform the input data by creating dummy variables.
        Parameters:
            - X (pandas.DataFrame): Input data with categorical columns.
            - y (array-like): Target variable (ignored).
        Returns:
            - X_transformed (pandas.DataFrame): Transformed data with dummy variables.
              When no column was selected in ``fit``, X is returned unchanged.
        """
        if self.dummy_cols:
            dummy_frames_df = self._make_dummies(X)
            # reindex drops the original separator columns and any category unseen in fit,
            # and adds a 0-filled column for every fitted column missing from X
            X = X.join(dummy_frames_df).reindex(columns=self.columns, fill_value=0)
        return X

    def get_feature_names_out(self, input_features=None):
        """
        Get the names of the transformed features.
        Parameters:
            - input_features (array-like): Names of the input features (ignored).
        Returns:
            - output_features (list): Names of the transformed features.
        """
        return self.columns

## json_normalizer.py
def json_normalizer(df, col, train_data, sep=',', pre='ToL_'):
    """
    Encode a separator-delimited string column as 0/1 indicator columns.
    Parameters:
        - df (pandas.DataFrame): Input data.
        - col (str): Name of the column whose values are split on ``sep``.
        - train_data (container): Categories to keep; any other token is ignored.
        - sep (str): Default ','. Separator between the categories of one value.
        - pre (str): Default 'ToL_'. Prefix added to every output column name.
    Returns:
        - pandas.DataFrame: Integer indicator columns named ``pre + category``,
          one for each kept category that occurs in ``df[col]``.
    """
    def _flags(cell):
        # Map every kept category found in this cell to 1
        found = {}
        for token in cell.split(sep):
            if token in train_data:
                found[token] = 1
        return found

    encoded = df[col].apply(_flags)
    table = pd.json_normalize(encoded)
    # Categories absent from a row come back as NaN; turn them into 0
    table = table.fillna(0)
    return table.astype(int).add_prefix(pre)

# sample: encode only the loan types listed in train_data
train_data = [
    'credit-builder loan',
    'home equity loan',
    'mortgage loan',
    'personal loan',
    'student loan',
]
json_normalizer(X_val, 'Type_of_Loan', train_data).head()

## sample_syntax_GetDummies.py
# more: https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/preprocessing/_encoders.py#L195
from sklearn.preprocessing import OneHotEncoder

# Define an object of class OneHotEncoder.
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Calling methods on our OneHotEncoder (ohe) object
ohe.fit(data)  # learns the categories and returns the fitted encoder itself
data_ohe = ohe.transform(data)  # returns the transformed data

## sample_use_GetDummies.py
# get_dummies.py
# Save your custom function in a Python script (.py file) then import it
from get_dummies import GetDummies

# Fit on the training split only, then encode both splits with the same fitted object
dummy = GetDummies()
dummy.fit(X_train)
X_train_dummy = dummy.transform(X_train)
X_val_dummy = dummy.transform(X_val)

(X_train_dummy.shape, X_val_dummy.shape)


# or pipeline or whatever else you want
from sklearn.pipeline import Pipeline

pipe_dummy    = Pipeline([('GetDummies', GetDummies())])
X_train_dummy = pipe_dummy.fit_transform(X_train)
# Only transform the test set: calling fit_transform here would refit the
# pipeline on the test data and replace the column layout learned from X_train.
X_test_dummy  = pipe_dummy.transform(X_test)
	# get_dummies.py
	# Save your custom class in a Python script (.py file), then import it
	# so that it can be used with pickle.load().
	# This is a common approach for storing and reusing custom classes across different scripts or projects.
	import numpy as np
	import pandas as pd
	from sklearn.base import BaseEstimator, TransformerMixin

	class GetDummies(BaseEstimator, TransformerMixin):
	    """
	    Transformer that creates dummy variables from categorical columns with a separator.

	    ``fit`` selects every object-dtype column in which at least one value contains
	    ``data_sep`` and records the resulting output column layout. ``transform`` adds one
	    indicator column per category, named ``<prefix><col_name_sep><category>``, and
	    reindexes the result to the layout learned in ``fit``.

	    Parameters:
	        - data_sep (str): Default ','. Separator used to split categorical values into multiple dummy variables.
	        - col_name_sep (str): Default '_'. Separator placed between the prefix and the category in the output column names.
	    Attributes:
	        - columns (list): Output column names, set by ``fit``.
	        - dummy_cols (list): Names of the input columns that get expanded, set by ``fit``.
	        - dummy_prefix (list): Prefix for each entry of ``dummy_cols``, set by ``fit``.
	    """

	    def __init__(self, data_sep=',', col_name_sep='_'):
	        self.data_sep = data_sep
	        self.col_name_sep = col_name_sep
	        self.columns = []
	        self.dummy_cols = []
	        self.dummy_prefix = []

	    def _prefix_for(self, col):
	        """Return the prefix for ``col``: its first two characters, or the initials of its ``col_name_sep``-separated parts."""
	        if self.col_name_sep not in col:
	            return col[:2]
	        # Skip empty parts (leading, trailing or doubled separators) so part[0] cannot fail
	        return ''.join(part[0] for part in col.split(self.col_name_sep) if part)

	    def _make_dummies(self, X):
	        """Build the prefixed indicator columns for every column listed in ``dummy_cols``."""
	        frames = [
	            X[col].str.get_dummies(sep=self.data_sep).add_prefix(pre + self.col_name_sep)
	            for pre, col in zip(self.dummy_prefix, self.dummy_cols)
	        ]
	        return pd.concat(frames, axis=1)

	    def fit(self, X, y=None):
	        """
	        Fit the transformer to the data.
	        Parameters:
	            - X (pandas.DataFrame): Input data with categorical columns.
	            - y (array-like): Target variable (ignored).
	        Returns:
	            - self: Returns the transformer object.
	        """
	        object_cols = X.select_dtypes(include="O").columns
	        # regex=False: str.get_dummies treats the separator as a literal string, so the
	        # detection must do the same (a regex such as '|' would match every value).
	        # na=False: missing or non-string values count as "no separator found".
	        self.dummy_cols = [
	            col for col in object_cols
	            if X[col].str.contains(self.data_sep, regex=False, na=False).any()
	        ]
	        self.dummy_prefix = [self._prefix_for(col) for col in self.dummy_cols]

	        if self.dummy_cols:
	            # Apply dummy to train data to learn the output column layout
	            dummy_frames_df = self._make_dummies(X)
	            self.columns = X.join(dummy_frames_df).drop(columns=self.dummy_cols).columns.tolist()
	        else:
	            self.columns = X.columns.tolist()
	        return self

	    def transform(self, X, y=None):
	        """
	        Transform the input data by creating dummy variables.
	        Parameters:
	            - X (pandas.DataFrame): Input data with categorical columns.
	            - y (array-like): Target variable (ignored).
	        Returns:
	            - X_transformed (pandas.DataFrame): Transformed data with dummy variables.
	              When no column was selected in ``fit``, X is returned unchanged.
	        """
	        if self.dummy_cols:
	            dummy_frames_df = self._make_dummies(X)
	            # reindex drops the original separator columns and any category unseen in fit,
	            # and adds a 0-filled column for every fitted column missing from X
	            X = X.join(dummy_frames_df).reindex(columns=self.columns, fill_value=0)
	        return X

	    def get_feature_names_out(self, input_features=None):
	        """
	        Get the names of the transformed features.
	        Parameters:
	            - input_features (array-like): Names of the input features (ignored).
	        Returns:
	            - output_features (list): Names of the transformed features.
	        """
	        return self.columns
	def json_normalizer(df, col, train_data, sep=',', pre='ToL_'):
	    """
	    Encode a separator-delimited string column as 0/1 indicator columns.
	    Parameters:
	        - df (pandas.DataFrame): Input data.
	        - col (str): Name of the column whose values are split on ``sep``.
	        - train_data (container): Categories to keep; any other token is ignored.
	        - sep (str): Default ','. Separator between the categories of one value.
	        - pre (str): Default 'ToL_'. Prefix added to every output column name.
	    Returns:
	        - pandas.DataFrame: Integer indicator columns named ``pre + category``,
	          one for each kept category that occurs in ``df[col]``.
	    """
	    # One dict per row: every kept category found in the cell maps to 1
	    encoded = df[col].apply(lambda x: {k: 1 for k in x.split(sep) if k in train_data})
	    # Categories absent from a row come back as NaN; turn them into 0
	    table = pd.json_normalize(encoded).fillna(0).astype(int)
	    return table.add_prefix(pre)

	# sample: encode only the loan types listed in train_data
	train_data = [
	    'credit-builder loan',
	    'home equity loan',
	    'mortgage loan',
	    'personal loan',
	    'student loan',
	]
	json_normalizer(X_val, 'Type_of_Loan', train_data).head()
	# more: https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/preprocessing/_encoders.py#L195
	from sklearn.preprocessing import OneHotEncoder

	# Define an object of class OneHotEncoder.
	# `sparse_output` replaces the `sparse` argument, which was deprecated in
	# scikit-learn 1.2 and removed in 1.4.
	ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

	# Calling methods on our OneHotEncoder (ohe) object
	ohe.fit(data)  # learns the categories and returns the fitted encoder itself
	data_ohe = ohe.transform(data)  # returns the transformed data
	# get_dummies.py
	# Save your custom function in a Python script (.py file) then import it
	from get_dummies import GetDummies

	# Fit on the training split only, then encode both splits with the same fitted object
	dummy = GetDummies()
	dummy.fit(X_train)
	X_train_dummy = dummy.transform(X_train)
	X_val_dummy = dummy.transform(X_val)

	(X_train_dummy.shape, X_val_dummy.shape)


	# or pipeline or whatever else you want
	from sklearn.pipeline import Pipeline

	pipe_dummy = Pipeline([('GetDummies', GetDummies())])
	X_train_dummy = pipe_dummy.fit_transform(X_train)
	# Only transform the test set: calling fit_transform here would refit the
	# pipeline on the test data and replace the column layout learned from X_train.
	X_test_dummy = pipe_dummy.transform(X_test)