Skip to content

Instantly share code, notes, and snippets.

@MikeDacre
Last active May 25, 2017 16:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MikeDacre/bce7c7a4fd0d9457db47a3c199c2a45a to your computer and use it in GitHub Desktop.
A python 3 function to make a pandas DataFrame unique on multiple columns while preserving other columns and filtering by multiple columns
def run_filter(df, group_cols: list, merge_col: str, sort_col: str, cutoff: float = 0.6):
    """Make a DataFrame unique on group_cols based on boolean merge_col and float sort_col.

    Picks the most frequent value of the boolean merge_col if that value
    represents a portion of the group total greater than or equal to
    ``cutoff``. The surviving rows are then sorted ascending by sort_col
    and only the first row is kept.

    If the filter fails (i.e. the top value is less than ``cutoff`` of the
    total) then the entire group is dropped.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
    group_cols : list_of_str
        A list of the column names to group by
    merge_col : str
        The name of the boolean column to filter
    sort_col : str
        The name of the column to sort by when picking the top hit
    cutoff : float, optional
        Minimum fraction of the group that the winning merge_col value
        must represent for the group to be kept (default 0.6)

    Returns
    -------
    df : pandas.core.frame.DataFrame
        DataFrame with the same columns as the starting df, but with rows
        unique on group_cols.
    """
    def filter_group(d):
        """Use with groupby().apply(): keep the single best row of a group.

        Returns an empty (zero-row) DataFrame when no merge_col value
        represents at least ``cutoff`` of the group.
        """
        counts = d[merge_col].value_counts().to_dict()
        if len(counts) == 1:
            # Only one value present: it trivially wins
            match_result = next(iter(counts))
        else:
            total = counts[True] + counts[False]
            if max(counts[True], counts[False]) / total < cutoff:
                # No value dominates -- drop the whole group. Keep the
                # column set so the final column selection still works
                # even if every group is dropped.
                return pd.DataFrame(columns=d.columns)
            # Ties (True == False counts) resolve to False, matching the
            # original comparison semantics.
            match_result = counts[True] > counts[False]
        # Non-inplace sort avoids SettingWithCopyWarning on the mask slice
        best = d[d[merge_col] == match_result].sort_values(sort_col, ascending=True)
        # Return the one row with the lowest sort_col value as a DataFrame
        return best.iloc[0:1]

    cols = df.columns
    res = df.groupby(group_cols).apply(filter_group)
    # groupby().apply() prepends one index level per grouping column; drop
    # exactly those levels (the original hard-coded two reset_index calls,
    # which only worked when len(group_cols) == 2).
    res = res.reset_index(level=list(range(len(group_cols))), drop=True)
    return res[cols]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment