@malcolmgreaves
Last active February 10, 2024 02:18
Conceptual framework for writing Pandas DataFrame code where required columns are not only documented, but parameterized. This establishes an interface between the name of a column as it's referenced in code and the name of the column as it exists in the data. The split lets one write code that is agnostic to actual column names, and thus adapt it to semantically equivalent but differently named columns.
from abc import ABC
from dataclasses import dataclass
from typing import List, NamedTuple, Sequence, Type, TypeVar

import pandas as pd

__all__: Sequence[str] = (
    # main abstraction & utilities for columns required in a dataframe
    "Columns",
    "columns",
    "raise_value_error_if_missing_columns",
    "ColumnSchema",
    "missing_columns",
)

@dataclass(frozen=True)
class Columns(ABC):
    """Used by a function to explicitly declare the columns it requires in input dataframes.

    ## Motivation

    Consider two dataframes that describe similar data but come from different systems. Their column naming
    schemes may not align with one another. In this case, it's important to be able to write code that is
    agnostic to the specific, actual names of the columns. Rather, the code should refer to each column
    conceptually, and then use a mapping from concept to actual column name for each specific data source.

    If we write code like:

        def failed_only(df: pd.DataFrame) -> pd.Series:
            return df['status'] == 'failed'

    then we can't use it if one of our dataframes names this column e.g. `"outcome"`. To work around this,
    we could write:

        def failed_only(df: pd.DataFrame, col_status: str = 'status') -> pd.Series:
            return df[col_status] == 'failed'

    and use `failed_only(df0, 'status')` and `failed_only(df1, 'outcome')` for our two different dataframes.

    However, we have to repeat ourselves in every function that needs to refer to our `status` column:
    copying-and-pasting both the column name in our code (`col_status`) and its default value (`"status"`).
    This quickly becomes unmanageable when we get to supporting sets of columns: it turns our code into a
    keyword-argument soup! It also means that we need to keep repeating documentation of what the columns mean.

    Thus, it would be better if we could refer to a group of columns together. This is precisely what a
    `Columns` class is!

    ## Use

    One must extend this class and define fields with default values that correspond to columns in a
    dataframe. The field name is the conceptual name of the column -- it's stable and can be used in code.
    The field value is the actual column name as it appears in data (i.e. in a `pd.DataFrame`).

    NOTE: Extending classes **MUST** only have fields that correspond to column names, and **MUST** be
    decorated with `@dataclass(frozen=True)` so that their fields are registered in `__dataclass_fields__`.

    ## Example

    Going back to our `failed_only` function example, we would refactor it to use `Columns` as:

        @dataclass(frozen=True)
        class Status(Columns):
            status: str = 'status'

        def failed_only(df: pd.DataFrame, cols: Status = Status()) -> pd.Series:
            return df[cols.status] == 'failed'

    And thus we'd use this as:

        df0: pd.DataFrame = ...
        df1: pd.DataFrame = ...

        failed_only(df0)
        failed_only(df1, cols=Status(status='outcome'))

    If we had a dataframe with another column:

        @dataclass(frozen=True)
        class IsMain(Columns):
            is_main: str = 'is_main'

    then we can use multiple inheritance to easily create a group of required columns without needing
    to re-define the columns or their default values:

        @dataclass(frozen=True)
        class Group(IsMain, Status):
            pass

    And use it in a function:

        def successful_in_group(df: pd.DataFrame, cols: Group = Group()) -> pd.Series:
            idx = df[cols.is_main]
            return df.loc[idx, cols.status] == 'success'
    """

Minimum = TypeVar("Minimum", bound=Columns)
"""A subset of `Specific`'s set of required columns."""

Specific = TypeVar("Specific", bound=Columns)
"""A superset of `Minimum`'s set of required columns.

NOTE: The `bound` should be `Minimum`. However, mypy does not support bounding one TypeVar by another yet:
https://github.com/python/typing/issues/548
"""

def columns(contract: Type[Minimum], cols: Specific) -> List[str]:
    """Obtains the actual column names from `cols`, using `contract` to control the subset of selected columns.

    The `contract` makes it possible for `cols` to have more columns than are necessary
    for a particular application.
    """
    if not isinstance(cols, contract):
        raise TypeError(f"Expecting {type(cols)} to be a subtype of {contract}")
    return [getattr(cols, c) for c in contract.__dataclass_fields__.keys()]

def raise_value_error_if_missing_columns(
    contract: Type[Minimum], cols: Specific, df: pd.DataFrame
) -> None:
    """Raises an error if the dataframe is missing any required columns.

    Uses `missing_columns` (see its docs for details on `contract` and `cols`) to determine
    whether the input Pandas DataFrame (`df`) is missing any columns from the `contract`.
    If so, the resulting `ValueError`'s message will contain the number of missing columns, the
    specific missing columns (their schema name and their expected dataframe name), and the
    dataframe's actual columns.

    Here's a prototypical example:

        @dataclass(frozen=True)
        class MySchema(Columns):
            ...

        def foo(df: pd.DataFrame, cols: MySchema = MySchema()):
            raise_value_error_if_missing_columns(MySchema, cols, df)
            # Guaranteed after this point that `df` has all of the columns defined in `MySchema`.
            # Additionally, guaranteed that `cols` is a superset of the columns in `MySchema`.
            ...
    """
    validation_check = missing_columns(contract, cols, df)
    if len(validation_check) > 0:
        missing_msg = "\n".join(
            f"- missing '{col}' (set as {field})" for field, col in validation_check
        )
        raise ValueError(
            f"Missing {len(validation_check)} required columns from dataframe:\n"
            f"{missing_msg}\n"
            f"Dataframe columns ({len(df.columns)}): {df.columns}"
        )

class ColumnSchema(NamedTuple):
    """A reference to a column: maps the name used for a column in code to the actual column name as it appears in data."""

    column_generic_name: str
    """The name of a field of some `Columns`-extending `class`."""

    actual_column_name: str
    """The actual column name in a Pandas DataFrame."""


def missing_columns(contract: Type[Minimum], cols: Specific, df: pd.DataFrame) -> List[ColumnSchema]:
    """Returns a list of columns missing from the dataframe.

    The required columns are specified by `contract`, while the actual column names are in `cols`.
    Each element of the returned list pairs a field name from `contract` with the actual column
    name (taken from `cols`) that was not found in `df`.
    """
    if not isinstance(cols, contract):
        raise TypeError(f"Expecting {type(cols)} to be a subtype of {contract}")
    return [
        ColumnSchema(column_generic_name, getattr(cols, column_generic_name))
        for column_generic_name in contract.__dataclass_fields__.keys()
        if getattr(cols, column_generic_name) not in df.columns
    ]