Created
March 24, 2023 19:06
-
-
Save ceaksan/a95835e619b7c8ae745c51798683904a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import string | |
import pandas as pd | |
def sanitize_column_names(df: pd.DataFrame, remove_punct: bool = True) -> pd.DataFrame:
    """
    Return a copy of ``df`` whose column names are normalized to snake_case.

    Every column label is converted to ``str``, lowercased, stripped of
    leading/trailing whitespace, and each run of whitespace is replaced by a
    single underscore. When ``remove_punct`` is true, every character in
    ``string.punctuation`` (including ``_``) is treated as whitespace first,
    so punctuation ends up as a separator and never survives at the edges of
    a name.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with non-standardized column names. It is not modified.
    remove_punct : bool, optional
        If True (default), punctuation characters are replaced by
        underscores (collapsed with neighbouring whitespace). If False,
        punctuation is left untouched and only whitespace is replaced.

    Returns
    -------
    pandas.DataFrame
        New DataFrame with the sanitized column names.

    Notes
    -----
    Distinct original labels may collapse to the same sanitized name
    (e.g. ``'A B'`` and ``'a-b'``); no de-duplication is attempted.

    Example
    -------
    >>> df = pd.DataFrame({
    ...     'Column With Spaces': [1, 2, 3, 4, 5],
    ...     'Column-With-Hyphens&Others/': [6, 7, 8, 9, 10],
    ...     'Too   Many   Spaces': [11, 12, 13, 14, 15],
    ... })
    >>> df = sanitize_column_names(df)
    >>> print(list(df.columns))
    ['column_with_spaces', 'column_with_hyphens_others', 'too_many_spaces']
    """
    whitespace_run = re.compile(r'\s+')
    # Map each punctuation character to a space so that it is folded into the
    # surrounding whitespace run and becomes (part of) a single underscore.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    new_columns = []
    for label in df.columns:
        # str() lets non-string labels (e.g. default integer column names)
        # pass through instead of failing on .lower().
        name = str(label).lower()
        if remove_punct:
            name = name.translate(punct_to_space)
        # Trim before substituting so edge whitespace (and, when
        # remove_punct is set, edge punctuation) never yields a leading or
        # trailing underscore.
        new_columns.append(whitespace_run.sub('_', name.strip()))

    # rename() returns a new DataFrame, leaving the caller's object unchanged.
    return df.rename(columns=dict(zip(df.columns, new_columns)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment