Skip to content

Instantly share code, notes, and snippets.

@celik-muhammed
Last active December 9, 2023 00:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save celik-muhammed/91f73a56e654dff47f5111227c020667 to your computer and use it in GitHub Desktop.
Save celik-muhammed/91f73a56e654dff47f5111227c020667 to your computer and use it in GitHub Desktop.
How to Convert a Pandas Column of Comma-Separated Strings Into Dummy Variables
# get_dummies.py
# Save your custom class in a Python script (.py file), then import it
# wherever you need to use it with pickle.load().
# This is a common approach to store and reuse custom functions in different scripts or projects.
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
class GetDummies(BaseEstimator, TransformerMixin):
    """
    Transformer that expands separator-delimited string columns into dummy variables.

    ``fit`` selects every object-dtype column in which at least one value
    contains ``data_sep`` and records the resulting output column layout.
    ``transform`` appends the dummy columns and reindexes the result to that
    layout: columns missing from the new data are filled with 0 and columns
    not recorded during ``fit`` are dropped.
    """

    def __init__(self, data_sep=',', col_name_sep='_'):
        """
        Parameters:
        - data_sep (str): Default ','. Literal separator used to split categorical values into multiple dummy variables.
        - col_name_sep (str): Default '_'. Separator placed between the column prefix and the category in the output column names.
        """
        self.data_sep = data_sep
        self.col_name_sep = col_name_sep
        # Learned state; initialised empty so that transform() on an unfitted
        # instance returns its input unchanged.
        self.columns = []
        self.dummy_cols = []
        self.dummy_prefix = []

    def _make_prefix(self, col):
        """Return the short prefix used for the dummy columns derived from `col`."""
        name = str(col)
        if not self.col_name_sep or self.col_name_sep not in name:
            return name[:2]
        # Initials of the name parts, e.g. 'Type_of_Loan' -> 'ToL'. Empty parts
        # (leading/trailing/doubled separators) are skipped instead of raising.
        initials = ''.join(part[0] for part in name.split(self.col_name_sep) if part)
        return initials or name[:2]

    def _build_dummies(self, X):
        """Return one frame holding the prefixed dummy columns of every selected column."""
        frames = [
            X[col].str.get_dummies(sep=self.data_sep).add_prefix(prefix + self.col_name_sep)
            for prefix, col in zip(self.dummy_prefix, self.dummy_cols)
        ]
        return pd.concat(frames, axis=1)

    def fit(self, X, y=None):
        """
        Fit the transformer to the data.

        Parameters:
        - X (pandas.DataFrame): Input data with categorical columns.
        - y (array-like): Target variable (ignored).

        Returns:
        - self: Returns the transformer object.
        """
        object_cols = X.select_dtypes(include="O").columns
        # Match the separator literally, exactly as str.get_dummies splits on it;
        # a regex match would treat separators such as '|' or '.' as patterns
        # and select every column. Missing values count as "no separator".
        self.dummy_cols = [
            col for col in object_cols
            if X[col].str.contains(self.data_sep, regex=False, na=False).any()
        ]
        self.dummy_prefix = [self._make_prefix(col) for col in self.dummy_cols]

        if self.dummy_cols:
            # Build the dummies once on the training data to record the output layout.
            dummy_frames_df = self._build_dummies(X)
            self.columns = X.join(dummy_frames_df).drop(columns=self.dummy_cols).columns.tolist()
        else:
            self.columns = X.columns.tolist()
        return self

    def transform(self, X, y=None):
        """
        Transform the input data by creating dummy variables.

        Parameters:
        - X (pandas.DataFrame): Input data with categorical columns.
        - y (array-like): Target variable (ignored).

        Returns:
        - X_transformed (pandas.DataFrame): Transformed data with dummy variables.
          Returned unchanged when no column was selected during fit.
        """
        if not (self.dummy_cols and self.dummy_prefix):
            return X
        dummy_frames_df = self._build_dummies(X)
        # Align to the layout recorded in fit(): absent columns become 0,
        # columns that were not recorded are dropped.
        return X.join(dummy_frames_df).reindex(columns=self.columns, fill_value=0)

    def get_feature_names_out(self, input_features=None):
        """
        Get the names of the transformed features.

        Parameters:
        - input_features (array-like): Names of the input features (ignored).

        Returns:
        - output_features (list): Names of the transformed features.
        """
        return self.columns
def json_normalizer(df, col, train_data, sep=',', pre='ToL_'):
    """
    One-hot encode a column of separator-delimited strings, keeping known categories only.

    Parameters:
    - df (pandas.DataFrame): Input data.
    - col (str): Name of the column holding the delimited strings.
    - train_data (container): Categories to keep; tokens not in it are ignored.
    - sep (str): Separator used to split each value.
    - pre (str): Prefix added to every output column name.

    Returns:
    - pandas.DataFrame: One integer 0/1 column per kept category that occurs
      in `df[col]`, in order of first appearance.
    """
    def _to_flags(value):
        # Missing or non-string cells carry no categories; without this guard
        # `.split` raises AttributeError on NaN/None.
        if not isinstance(value, str):
            return {}
        return {k: 1 for k in value.split(sep) if k in train_data}

    flags = df[col].apply(_to_flags)
    return pd.json_normalize(flags).fillna(0).astype(int).add_prefix(pre)
# Sample usage: the loan types to keep as dummy columns.
train_data = [
    'credit-builder loan',
    'home equity loan',
    'mortgage loan',
    'personal loan',
    'student loan',
]
# Preview the encoded 'Type_of_Loan' column of X_val (X_val must already be defined).
json_normalizer(X_val, 'Type_of_Loan', train_data).head()
# more: https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/preprocessing/_encoders.py#L195
from sklearn.preprocessing import OneHotEncoder

# Define a OneHotEncoder that returns a dense array and ignores categories
# unseen during fit instead of raising.
# NOTE: `sparse_output` replaced the `sparse` argument in scikit-learn 1.2
# (`sparse` was removed in 1.4).
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Learn the categories of each column; fit() returns the fitted encoder itself.
ohe.fit(data)
# Encode `data` with the learned categories.
data_ohe = ohe.transform(data)
# get_dummies.py
# Save your custom class in a Python script (.py file), then import it.
from sklearn.pipeline import Pipeline

from get_dummies import GetDummies

# Fit on the training data, then reuse the learned column layout on validation data.
dummy = GetDummies()
X_train_dummy = dummy.fit_transform(X_train)
X_val_dummy = dummy.transform(X_val)
X_train_dummy.shape, X_val_dummy.shape

# Or wrap it in a pipeline (or whatever else you want).
pipe_dummy = Pipeline([('GetDummies', GetDummies())])
X_train_dummy = pipe_dummy.fit_transform(X_train)
# Only transform the test data: refitting here would relearn the columns from
# X_test and could produce a layout that differs from the training one.
X_test_dummy = pipe_dummy.transform(X_test)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment