Skip to content

Instantly share code, notes, and snippets.

@fulibacsi
Last active March 22, 2023 06:51
Show Gist options
  • Save fulibacsi/5251a2520f2a2d87e22dcccb4654d238 to your computer and use it in GitHub Desktop.
Save fulibacsi/5251a2520f2a2d87e22dcccb4654d238 to your computer and use it in GitHub Desktop.
deduplication on pandas dataframe with custom function
import pandas as pd
from tqdm import tqdm
def jaccard_sim(doc1, doc2, thres=0.9):
    """Decide whether two token sets are near-duplicates of each other.

    The Jaccard similarity (size of the intersection divided by the size
    of the union) is compared against a threshold.

    Parameters:
    -----------
    doc1 : set
        tokens of the first document
    doc2 : set
        tokens of the second document
    thres : float
        similarity that has to be strictly exceeded for the two
        documents to count as the same

    Returns:
    --------
    is_similar : bool
        True if the Jaccard similarity is greater than thres
    """
    union = doc1 | doc2
    if not union:
        # Both documents are empty: they are identical, so treat the
        # similarity as 1.0 instead of dividing by zero.
        similarity = 1.0
    else:
        similarity = len(doc1 & doc2) / len(union)
    return similarity > thres
def duplicated(df, textcol, func, **kwargs):
    """Similarly to pd.DataFrame.duplicated, find the duplicated rows
    based on a text column using a custom comparison function.

    Every text is split on whitespace and turned into a set of raw
    tokens. Rows whose token sets are exactly equal to an earlier row
    are flagged first; after that every still unflagged row is compared
    with func against all later unflagged rows. The first row of a
    group of duplicates is never flagged, only the later ones are.

    The input dataframe is not modified.

    Parameters:
    -----------
    df : pd.DataFrame
        dataframe to find duplicates in
    textcol : str
        name of the single column containing the textual data; the
        values are tokenized with their .split() method, so they are
        expected to be strings
    func : function with signature (doc1, doc2) -> {True|False}
        comparison function receiving two token sets, assumed to
        return True if the two documents are the same or
        False otherwise
    kwargs : dict
        keyword arguments to pass to the comparison function

    Returns:
    --------
    duplicates : pd.Series
        Series of boolean values with the same index as df
    """
    token_sets = [set(text.split()) for text in df[textcol]]

    # Sets are not hashable and their iteration order is not a reliable
    # key, so exact matches are detected on the sorted token tuples.
    exact_keys = pd.Series(
        [tuple(sorted(tokens)) for tokens in token_sets],
        index=df.index,
        dtype=object,
    )
    # Work on a plain positional array: this keeps the pairwise pass
    # independent of the labels (or repeated labels) in df.index.
    flags = exact_keys.duplicated().to_numpy(copy=True)

    n_rows = len(token_sets)
    for i, base in enumerate(tqdm(token_sets[:-1])):
        if flags[i]:
            # Already a duplicate of an earlier row, which has been
            # compared against the remaining rows itself.
            continue
        for j in range(i + 1, n_rows):
            if not flags[j] and func(base, token_sets[j], **kwargs):
                flags[j] = True

    return pd.Series(flags, index=df.index)
def drop_duplicates(df, textcol, func, **kwargs):
    """Drops the duplicated rows from a dataframe based on
    a textual column using a custom function.

    Parameters:
    -----------
    df : pd.DataFrame
        dataframe to deduplicate
    textcol : str
        name of the column containing the textual data
    func : function with signature (doc1, doc2) -> {True|False}
        comparison function handed over to duplicated()
    kwargs : dict
        keyword arguments to pass to the comparison function

    Returns:
    --------
    df : pd.DataFrame
        The rows of df that were not flagged by duplicated()
    """
    # Only the text column is needed to find the duplicates. Handing over
    # a private single-column copy keeps the caller's dataframe (and the
    # returned rows) free of anything the helper writes into its input.
    duplicates = duplicated(df[[textcol]].copy(), textcol, func, **kwargs)
    return df.loc[~duplicates]
if __name__ == '__main__':
    # Example run: read a CSV, print its shape, drop the near-duplicate
    # rows of its text column and print the shape of what is left.
    frame = pd.read_csv('random_csv_with_text_col.csv')
    print(frame.shape)
    unique_rows = drop_duplicates(
        frame,
        textcol='textcol',
        func=jaccard_sim,
        thres=0.9,
    )
    print(unique_rows.shape)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment