Last active
December 9, 2023 00:20
-
-
Save celik-muhammed/91f73a56e654dff47f5111227c020667 to your computer and use it in GitHub Desktop.
How to Convert a Pandas Column of Comma-Separated Strings Into Dummy Variables
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# get_dummies.py | |
# Save your custom class in a Python script (.py file), then import it
# so that it can be used with pickle.load().
# This is a common approach to store and reuse custom functions in different scripts or projects. | |
import numpy as np | |
import pandas as pd | |
from sklearn.base import BaseEstimator, TransformerMixin | |
class GetDummies(BaseEstimator, TransformerMixin):
    """Expand separator-delimited string columns into 0/1 dummy columns.

    During ``fit`` every object-dtype column that contains ``data_sep`` in at
    least one value is selected. Each selected column is replaced by one
    indicator column per distinct token, named
    ``<prefix><col_name_sep><token>``. The output column layout learned in
    ``fit`` is re-applied in ``transform``.
    """

    def __init__(self, data_sep=',', col_name_sep='_'):
        """
        Transformer that creates dummy variables from categorical columns with a separator.
        Parameters:
        - data_sep (str): Default ','. Separator used to split categorical values into multiple dummy variables.
          It is always treated as a literal string, never as a regular expression.
        - col_name_sep (str): Default '_'. Separator placed between the prefix and the token in the output
          column names; it is also used to split the source column name when building the prefix.
        """
        self.data_sep = data_sep
        self.col_name_sep = col_name_sep
        # State filled in by fit():
        self.columns = []       # output column names, in output order
        self.dummy_cols = []    # input columns that get expanded
        self.dummy_prefix = []  # prefix for each entry of dummy_cols

    def _make_prefix(self, col):
        """Return the short prefix used for the dummy columns derived from ``col``."""
        sep = self.col_name_sep
        if not sep or sep not in col:
            # No separator in the name: use its first two characters.
            return col[:2]
        # Initials of the separator-delimited parts, e.g. 'Type_of_Loan' -> 'ToL'.
        # Empty parts (leading/trailing/doubled separators) are skipped so that
        # indexing the first character cannot fail.
        return ''.join(part[0] for part in col.split(sep) if part)

    def _encode(self, X):
        """Build the prefixed dummy frame for every selected column of ``X``."""
        frames = [
            X[col].str.get_dummies(sep=self.data_sep).add_prefix(prefix + self.col_name_sep)
            for prefix, col in zip(self.dummy_prefix, self.dummy_cols)
        ]
        return pd.concat(frames, axis=1)

    def fit(self, X, y=None):
        """
        Fit the transformer to the data.
        Parameters:
        - X (pandas.DataFrame): Input data with categorical columns.
        - y (array-like): Target variable (ignored).
        Returns:
        - self: Returns the transformer object.
        """
        object_cols = X.select_dtypes(include="O").columns
        # regex=False keeps this check consistent with str.get_dummies, which
        # splits on the literal separator; a separator such as '|' or '.' would
        # otherwise be interpreted as a pattern. na=False ignores missing values.
        self.dummy_cols = [
            col for col in object_cols
            if X[col].str.contains(self.data_sep, regex=False, na=False).any()
        ]
        self.dummy_prefix = [self._make_prefix(col) for col in self.dummy_cols]

        if self.dummy_cols:
            # Encode the training data only to learn the resulting column names.
            encoded = self._encode(X)
            self.columns = X.join(encoded).drop(columns=self.dummy_cols).columns.tolist()
        else:
            self.columns = X.columns.tolist()
        return self

    def transform(self, X, y=None):
        """
        Transform the input data by creating dummy variables.
        Parameters:
        - X (pandas.DataFrame): Input data with categorical columns.
        - y (array-like): Target variable (ignored).
        Returns:
        - X_transformed (pandas.DataFrame): Transformed data with dummy variables.
          When no column was selected during fit, X is returned unchanged.
        """
        if self.dummy_cols:
            # Reindexing to the fitted layout drops the source columns and any
            # token not seen during fit, and adds fitted columns that are
            # missing here filled with 0.
            X = X.join(self._encode(X)).reindex(columns=self.columns, fill_value=0)
        return X

    def get_feature_names_out(self, input_features=None):
        """
        Get the names of the transformed features.
        Parameters:
        - input_features (array-like): Names of the input features (ignored).
        Returns:
        - output_features (list): Names of the transformed features.
        """
        return self.columns
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def json_normalizer(df, col, train_data, sep=',', pre='ToL_'):
    """Encode a separator-delimited string column as 0/1 indicator columns.

    Each value of ``df[col]`` is split on ``sep``; tokens that are members of
    ``train_data`` become indicator columns named ``pre + token``. Tokens that
    are not in ``train_data`` are ignored. Only tokens that actually occur in
    ``df[col]`` produce a column.

    Parameters:
    - df (pandas.DataFrame): Input data.
    - col (str): Name of the column holding the delimited strings.
    - train_data (container of str): Tokens that are allowed to become columns.
    - sep (str): Default ','. Separator between tokens inside a value.
    - pre (str): Default 'ToL_'. Prefix added to every output column name.
    Returns:
    - pandas.DataFrame: Integer indicator frame carrying the same index as ``df``.
    """
    # Non-string cells (NaN/None) contribute no tokens instead of raising on .split().
    records = [
        {token: 1 for token in value.split(sep) if token in train_data}
        if isinstance(value, str) else {}
        for value in df[col]
    ]
    encoded = pd.json_normalize(records).fillna(0).astype(int).add_prefix(pre)
    # json_normalize on a plain list yields a fresh RangeIndex; restore the
    # caller's index so the result lines up with df when joined back.
    encoded.index = df.index
    return encoded
# Sample usage: allowed tokens are listed explicitly, then the first rows of
# the encoded frame are displayed.
train_data = [
    'credit-builder loan',
    'home equity loan',
    'mortgage loan',
    'personal loan',
    'student loan',
]
json_normalizer(df=X_val, col='Type_of_Loan', train_data=train_data).head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# more: https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/preprocessing/_encoders.py#L195
from sklearn.preprocessing import OneHotEncoder

# Define an object of class OneHotEncoder.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back to the old one on older releases.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Calling methods on our OneHotEncoder (ohe) object
ohe.fit(data)  # learns the categories; returns the fitted encoder itself
data_ohe = ohe.transform(data)  # returns the transformed (encoded) result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# get_dummies.py
# Save your custom class in a Python script (.py file), then import it.
from sklearn.pipeline import Pipeline

from get_dummies import GetDummies

# Fit on the training split only, then reuse the fitted transformer on the
# validation split so both frames share the same columns.
dummy = GetDummies()
X_train_dummy = dummy.fit_transform(X_train)
X_val_dummy = dummy.transform(X_val)
X_train_dummy.shape, X_val_dummy.shape

# Or use a pipeline, or whatever else you want.
pipe_dummy = Pipeline([('GetDummies', GetDummies())])
X_train_dummy = pipe_dummy.fit_transform(X_train)
# transform (not fit_transform): refitting on the test split would relearn the
# column layout from test data instead of applying the one learned in training.
X_test_dummy = pipe_dummy.transform(X_test)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment