Skip to content

Instantly share code, notes, and snippets.

@MikeDacre
Last active May 25, 2017 16:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MikeDacre/bce7c7a4fd0d9457db47a3c199c2a45a to your computer and use it in GitHub Desktop.
A python 3 function to make a pandas DataFrame unique on multiple columns while preserving other columns and filtering by multiple columns
def run_filter(df, group_cols: list, merge_col: str, sort_col: str, cutoff: float = 0.6):
    """Make a DataFrame unique on group_cols based on boolean merge_col and float sort_col.

    Picks the most frequent value of the boolean merge_col if that value
    represents a portion of the group total greater than or equal to
    ``cutoff``. The surviving rows are then sorted ascending by sort_col
    and only the first row is kept.

    If the filter fails (i.e. the top value is less than ``cutoff`` of the
    total) then the entire group is dropped.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
    group_cols : list_of_str
        A list of the column names to group by
    merge_col : str
        The name of the boolean column to filter
    sort_col : str
        The name of the column to sort by when picking the top hit
    cutoff : float, optional
        Minimum fraction of the group that the winning merge_col value
        must represent for the group to be kept (default 0.6)

    Returns
    -------
    df : pandas.core.frame.DataFrame
        DataFrame with the same columns as the starting df, but with rows
        unique on group_cols.
    """
    def filter_group(d):
        """Use with groupby().apply(): keep the single best row of a group.

        Returns an empty (zero-row) DataFrame when no merge_col value
        represents at least ``cutoff`` of the group.
        """
        counts = d[merge_col].value_counts().to_dict()
        if len(counts) == 1:
            # Only one value present: it trivially wins
            match_result = next(iter(counts))
        else:
            total = counts[True] + counts[False]
            if max(counts[True], counts[False]) / total < cutoff:
                # No value dominates -- drop the whole group. Keep the
                # column set so the final column selection still works
                # even if every group is dropped.
                return pd.DataFrame(columns=d.columns)
            # Ties (True == False counts) resolve to False, matching the
            # original comparison semantics.
            match_result = counts[True] > counts[False]
        # Non-inplace sort avoids SettingWithCopyWarning on the mask slice
        best = d[d[merge_col] == match_result].sort_values(sort_col, ascending=True)
        # Return the one row with the lowest sort_col value as a DataFrame
        return best.iloc[0:1]

    cols = df.columns
    res = df.groupby(group_cols).apply(filter_group)
    # groupby().apply() prepends one index level per grouping column; drop
    # exactly those levels (the original hard-coded two reset_index calls,
    # which only worked when len(group_cols) == 2).
    res = res.reset_index(level=list(range(len(group_cols))), drop=True)
    return res[cols]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment