Skip to content

Instantly share code, notes, and snippets.

@ceaksan
Created March 24, 2023 19:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ceaksan/a95835e619b7c8ae745c51798683904a to your computer and use it in GitHub Desktop.
Save ceaksan/a95835e619b7c8ae745c51798683904a to your computer and use it in GitHub Desktop.
import re
import string
import pandas as pd
def sanitize_column_names(df: pd.DataFrame, remove_punct: bool = True) -> pd.DataFrame:
    """
    Return a copy of ``df`` whose column names are lowercase and
    underscore-separated.

    Each column label is converted to ``str``, lowercased, optionally has
    its punctuation replaced by separators, and then has every run of
    whitespace/underscores collapsed into a single ``_``. Leading and
    trailing underscores are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with non-standardized column names. It is not modified.
    remove_punct : bool, optional
        If True (default), every character in ``string.punctuation``
        (which includes ``_``) is treated as a separator and ends up as an
        underscore. If False, punctuation is kept as-is and only
        whitespace is replaced.

    Returns
    -------
    pandas.DataFrame
        New DataFrame with standardized column names.

    Notes
    -----
    No de-duplication is performed: distinct original names that sanitize
    to the same string (e.g. ``'A B'`` and ``'a-b'``) produce duplicate
    column labels in the result.

    Example
    -------
    >>> df = pd.DataFrame({
    ...     'Column With Spaces': [1, 2, 3, 4, 5],
    ...     'Column-With-Hyphens&Others/': [6, 7, 8, 9, 10],
    ...     'Too   Many   Spaces': [11, 12, 13, 14, 15],
    ... })
    >>> list(sanitize_column_names(df).columns)
    ['column_with_spaces', 'column_with_hyphens_others', 'too_many_spaces']
    >>> list(sanitize_column_names(df, remove_punct=False).columns)
    ['column_with_spaces', 'column-with-hyphens&others/', 'too_many_spaces']
    """
    # Map every punctuation character to a space so that the whitespace
    # collapsing step below turns it into a single underscore.
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def _clean(label) -> str:
        # str() so non-string labels (e.g. integer column names) do not
        # fail on .lower().
        name = str(label).lower()
        if remove_punct:
            name = name.translate(translator)
        name = re.sub(r'\s+', '_', name)
        # Only relevant when punctuation is kept: pre-existing underscores
        # next to whitespace would otherwise leave runs like '___'.
        name = re.sub(r'_+', '_', name)
        return name.strip('_')

    # rename() returns a new DataFrame, leaving the caller's object untouched.
    return df.rename(columns={label: _clean(label) for label in df.columns})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment