Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
def vectorized_approach(df, key='person_id', start='start', end='end'):
df = df.copy()
# start_end is an indicator for start or end
startdf = pd.DataFrame({key: df[key], 'time': df[start], 'start_end': 1})
enddf = pd.DataFrame({key: df[key], 'time': df[end], 'start_end': -1})
# concat and sort the whole thing by key and time
mergedf = pd.concat([startdf, enddf]).sort_values([key, 'time'])
# use cumsum to create gaps and islands
mergedf['cumsum'] = mergedf.groupby(key)['start_end'].cumsum()
# assign new start date
mergedf['new_start'] = mergedf['cumsum'].eq(1) & mergedf['start_end'].eq(1)
# use cumsum to assign group id
mergedf['group'] = mergedf.groupby(key)['new_start'].cumsum()
# group_id by choosing the start_date row
df['group_id'] = mergedf['group'].loc[mergedf['start_end'].eq(1)]
return df.groupby([key, 'group_id']).aggregate({start: min, end: max}).reset_index()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment