Skip to content

Instantly share code, notes, and snippets.

@daskol
Created May 5, 2022 17:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save daskol/190d747a684ebd5edc4d50f9d2f50a26 to your computer and use it in GitHub Desktop.
Save daskol/190d747a684ebd5edc4d50f9d2f50a26 to your computer and use it in GitHub Desktop.
Repartitioning of LSapp dataset
import pandas as pd
def pick_uniques(df: pd.DataFrame, size: int = 100):
    """Split ``df`` into rows with pairwise-distinct ``uid`` and the rest.

    Rows are scanned in index order. The first row seen for each new ``uid``
    is selected, until ``size`` distinct values have been collected or the
    frame is exhausted.

    Args:
        df: Frame with a ``uid`` column and a monotonically increasing index.
        size: Maximum number of distinct ``uid`` values to select.

    Returns:
        A ``(head, rest)`` pair. ``head`` holds the selected rows; ``rest``
        holds the rows whose index labels were not selected.

    Raises:
        AssertionError: If the index is not monotonically increasing or the
            ``uid`` column is missing.
    """
    # `Index.difference` sorts its result by default, so `rest` keeps the
    # original row order only when the index is already sorted.
    assert df.index.is_monotonic_increasing, 'index must be sorted'
    assert 'uid' in df, "missing 'uid' column"

    picked = []
    seen = set()
    for label, uid in df['uid'].items():
        if uid in seen:
            continue
        seen.add(uid)
        picked.append(label)
        if len(seen) == size:
            break

    head = df.loc[picked]
    rest = df.loc[df.index.difference(picked)]
    return head, rest
def convolve(frame: pd.DataFrame, size: int = 50):
    """Repartition ``frame`` into groups of rows with distinct ``uid``.

    Repeatedly peels off up to ``size`` rows with pairwise-distinct ``uid``
    values (via ``pick_uniques``) until no rows remain. Each peeled group is
    tagged with its ordinal number in a ``subkey`` column.

    Args:
        frame: Frame with a ``uid`` column and a monotonically increasing
            index, as required by ``pick_uniques``.
        size: Maximum number of distinct ``uid`` values per group.

    Returns:
        All groups concatenated in the order they were peeled off, with the
        extra integer ``subkey`` column. An empty input yields an empty frame
        that still carries the ``subkey`` column.
    """
    parts = []
    while not frame.empty:
        head, frame = pick_uniques(frame, size)
        # `assign` returns a new frame, so we never write into a selection of
        # the caller's data (avoids SettingWithCopyWarning).
        parts.append(head.assign(subkey=len(parts)))

    if not parts:
        # `pd.concat([])` raises ValueError; keep the output schema instead.
        return frame.assign(subkey=pd.Series(dtype='int64'))
    return pd.concat(parts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment