ceaksan/sanitize_column_names.py

## sanitize_column_names.py
import re
import string
import pandas as pd

def sanitize_column_names(df: pd.DataFrame, remove_punct: bool = True) -> pd.DataFrame:
    """
    Return a copy of ``df`` with lowercase, underscore-separated column names.

    Each column name is lowercased, every run of whitespace becomes a single
    underscore, repeated underscores are collapsed, and leading/trailing
    underscores are stripped. The input DataFrame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with non-standardized column names. Non-string labels
        (e.g. integers) are converted with ``str()`` before sanitizing.
    remove_punct : bool, optional
        If True (the default), every character in ``string.punctuation``
        is treated as a separator and therefore ends up as an underscore.
        If False, punctuation is kept and only whitespace is replaced.

    Returns
    -------
    pandas.DataFrame
        New DataFrame with standardized column names.

    Notes
    -----
    Distinct original names may collapse to the same sanitized name
    (e.g. ``'A B'`` and ``'a-b'``); no de-duplication is performed.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'Column With Spaces': [1, 2, 3, 4, 5],
    ...     'Column-With-Hyphens&Others/': [6, 7, 8, 9, 10],
    ...     'Too    Many Spaces': [11, 12, 13, 14, 15],
    ... })
    >>> list(sanitize_column_names(df).columns)
    ['column_with_spaces', 'column_with_hyphens_others', 'too_many_spaces']
    >>> list(sanitize_column_names(df, remove_punct=False).columns)
    ['column_with_spaces', 'column-with-hyphens&others/', 'too_many_spaces']
    """

    # Punctuation is mapped to spaces (not deleted) so it is folded into the
    # whitespace -> underscore step below. Note that "_" is itself part of
    # string.punctuation. The table is only built when the caller asks for it.
    translator = (
        str.maketrans(string.punctuation, ' ' * len(string.punctuation))
        if remove_punct
        else None
    )

    new_columns = []
    for col in df.columns:
        # str() keeps non-string labels from failing on .lower().
        name = str(col).lower()
        if translator is not None:
            name = name.translate(translator)
        name = re.sub(r'\s+', '_', name)
        # Collapse underscore runs (possible when punctuation is kept),
        # then trim the separators left at either end.
        name = re.sub(r'_+', '_', name).strip('_')
        new_columns.append(name)

    # rename() without inplace returns a new DataFrame.
    return df.rename(columns=dict(zip(df.columns, new_columns)))
	import re
	import string
	import pandas as pd

	def sanitize_column_names(df: pd.DataFrame, remove_punct: bool = True) -> pd.DataFrame:
	    """
	    Return a copy of ``df`` with lowercase, underscore-separated column names.

	    Each column name is lowercased, every run of whitespace becomes a single
	    underscore, repeated underscores are collapsed, and leading/trailing
	    underscores are stripped. The input DataFrame is left untouched.

	    Parameters
	    ----------
	    df : pandas.DataFrame
	        DataFrame with non-standardized column names. Non-string labels
	        (e.g. integers) are converted with ``str()`` before sanitizing.
	    remove_punct : bool, optional
	        If True (the default), every character in ``string.punctuation``
	        is treated as a separator and therefore ends up as an underscore.
	        If False, punctuation is kept and only whitespace is replaced.

	    Returns
	    -------
	    pandas.DataFrame
	        New DataFrame with standardized column names.

	    Notes
	    -----
	    Distinct original names may collapse to the same sanitized name
	    (e.g. ``'A B'`` and ``'a-b'``); no de-duplication is performed.

	    Examples
	    --------
	    >>> df = pd.DataFrame({
	    ...     'Column With Spaces': [1, 2, 3, 4, 5],
	    ...     'Column-With-Hyphens&Others/': [6, 7, 8, 9, 10],
	    ...     'Too Many Spaces': [11, 12, 13, 14, 15],
	    ... })
	    >>> list(sanitize_column_names(df).columns)
	    ['column_with_spaces', 'column_with_hyphens_others', 'too_many_spaces']
	    >>> list(sanitize_column_names(df, remove_punct=False).columns)
	    ['column_with_spaces', 'column-with-hyphens&others/', 'too_many_spaces']
	    """

	    # Punctuation is mapped to spaces (not deleted) so it is folded into the
	    # whitespace -> underscore step below. Note that "_" is itself part of
	    # string.punctuation. The table is only built when the caller asks for it.
	    translator = (
	        str.maketrans(string.punctuation, ' ' * len(string.punctuation))
	        if remove_punct
	        else None
	    )

	    new_columns = []
	    for col in df.columns:
	        # str() keeps non-string labels from failing on .lower().
	        name = str(col).lower()
	        if translator is not None:
	            name = name.translate(translator)
	        name = re.sub(r'\s+', '_', name)
	        # Collapse underscore runs (possible when punctuation is kept),
	        # then trim the separators left at either end.
	        name = re.sub(r'_+', '_', name).strip('_')
	        new_columns.append(name)

	    # rename() without inplace returns a new DataFrame.
	    return df.rename(columns=dict(zip(df.columns, new_columns)))