Skip to content

Instantly share code, notes, and snippets.

@paretech
Created March 20, 2022 19:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save paretech/63e36af4c249d49d1573a6c057d9e2a3 to your computer and use it in GitHub Desktop.
Save paretech/63e36af4c249d49d1573a6c057d9e2a3 to your computer and use it in GitHub Desktop.
Shuffle nested blocks of rows in a pandas DataFrame by the unique values of each key column.
# %% [markdown]
# # Data Generation and Sequencing
# %%
import pandas as pd
import numpy as np
import random
# %%
# Each factor lives in its own one-column frame.
primary = pd.DataFrame({'primary': list('ABC')})
secondary = pd.DataFrame({'secondary': list(range(1, 4))})
other = pd.DataFrame({'other': list(range(1, 5))})

# Cross-join the factors to enumerate every combination (3 * 3 * 4 = 36 rows),
# then attach a constant placeholder column.
df = primary.merge(secondary, how='cross')
df = df.merge(other, how='cross')
df = df.assign(more_data=0)
# %%
df
# %%
def shuffle_by_unique_values(df, key):
    """Reorder the rows of ``df`` so that blocks sharing a ``key`` value come out in random order.

    The distinct values of ``df[key]`` are shuffled with ``random.shuffle``
    (so ``random.seed`` controls the outcome) and the rows are then emitted
    one block per value, in that shuffled order.  Rows inside a block keep
    their original relative order.

    Args:
        df: DataFrame containing a column named ``key``.
        key: Label of the column whose unique values define the blocks.

    Returns:
        A new DataFrame with a fresh ``RangeIndex``.  ``key`` becomes the
        first column and the original index of ``df`` is discarded.
    """
    # Shuffle a plain list copy rather than the array handed back by
    # ``unique()`` so that no pandas-owned buffer is mutated in place.
    block_order = list(df[key].unique())
    random.shuffle(block_order)
    # ``.loc`` with a list of labels returns all rows for each label, in the
    # order the labels are listed.
    return df.set_index(key).loc[block_order].reset_index()
def shuffle_blocks(df, columns):
    """Shuffle the rows of ``df`` hierarchically, one nesting level per entry of ``columns``.

    The blocks defined by ``columns[0]`` are put in random order; within each
    of them the blocks defined by ``columns[1]`` are put in random order, and
    so on down to the last column.  Each level is shuffled with
    ``shuffle_by_unique_values``.

    Args:
        df: DataFrame containing every column named in ``columns``.
        columns: Ordered sequence of column labels, outermost level first.
            The sequence is not modified.

    Returns:
        The DataFrame returned by ``shuffle_by_unique_values`` for the
        outermost level, i.e. all levels shuffled.

    Raises:
        ValueError: If ``columns`` is empty.
    """
    # Work on a copy so the caller's list is left untouched.
    levels = list(columns)
    if not levels:
        raise ValueError("columns must name at least one column to shuffle by")

    shuffled = df
    # Deepest level first: each later (shallower) pass moves whole blocks
    # around while keeping the row order inside them, so the ordering produced
    # by the deeper passes survives.
    for depth in range(len(levels) - 1, 0, -1):
        parent_keys = levels[:depth]
        shuffle_key = levels[depth]
        # Iterating the groups (instead of ``groupby(...).apply``) hands each
        # block to the helper with its grouping columns intact; pandas 2.2
        # deprecated ``apply`` operating on the grouping columns.
        pieces = [
            shuffle_by_unique_values(block, shuffle_key)
            for _, block in shuffled.groupby(parent_keys)
        ]
        if pieces:
            shuffled = pd.concat(pieces, ignore_index=True)
        else:
            # ``pd.concat`` rejects an empty list; nothing to shuffle here.
            shuffled = shuffled.iloc[0:0]

    # Outermost level: use the first requested column rather than a fixed name.
    return shuffle_by_unique_values(shuffled.reset_index(drop=True), levels[0])
# %%
# Shuffle the demo frame at three nested levels: 'primary' blocks, then
# 'secondary' within each primary value, then 'other' within each pair.
shuffle_blocks(df, ['primary', 'secondary', 'other'])
# %%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment