Skip to content

Instantly share code, notes, and snippets.

@CaselIT
Created December 18, 2023 18:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save CaselIT/969f1da4f75c91a416b1d51538f53ffb to your computer and use it in GitHub Desktop.
Save CaselIT/969f1da4f75c91a416b1d51538f53ffb to your computer and use it in GitHub Desktop.
Create a random dataframe with a specified number of partitions of random size
# License: MIT
from argparse import ArgumentParser
from itertools import product
import numpy as np
import pandas as pd
# Base shape: 8750 rows x 15 columns of 8-byte values is roughly 1 MB, which
# is what the size labels below are scaled from.
_r = 8750
_c = 15
# Maps a size label to the (n_rows, n_cols) shape of the generated frame.
_sizes = {
    "1m": (_r, _c),
    "10m": (_r * 10, _c),
    "100m": (_r * 100, _c),
    "500m": (_r * 500, _c),
    "1g": (_r * 1000, _c),
    "1g-w": (_r * 100, _c * 10),  # same number of cells as "1g", but wide
    "4g": (_r * 1000, _c * 4 + 1),
}
# Not used by make_df; kept so existing references to SEED keep working.
SEED = None


def make_df(
    size: str,
    n_groups: int,
    n_group_cols: int = 1,
    *,
    log: bool = False,
    seed: "int | None" = 42,
    ordered_by_partitions: bool = False,
) -> "tuple[pd.DataFrame, list[str]]":
    """Build a random dataframe split into ``n_groups`` partitions.

    The frame has the shape registered for ``size`` in ``_sizes``. The first
    ``n_cols - n_group_cols`` columns (``c0``, ``c1``, ...) hold uniform
    random floats; the remaining ``n_group_cols`` columns (``A``, ``B``, ...)
    hold integer labels identifying the partition each row belongs to.
    Partition sizes are random and every partition has at least one row.

    Args:
        size: Key of ``_sizes`` selecting the frame shape.
        n_groups: Number of distinct partitions to create.
        n_group_cols: Number of label columns (1 to 4). With more than one
            column, each partition is identified by a distinct tuple of
            labels across those columns.
        log: If true, print ``df.info(verbose=False)`` before returning.
        seed: Seed passed to ``numpy.random.default_rng``.
        ordered_by_partitions: If true, rows of the same partition are
            contiguous and partitions appear in increasing order; otherwise
            the labels are shuffled across rows.

    Returns:
        A ``(df, gb_cols)`` tuple: the generated frame and the names of its
        label columns.

    Raises:
        KeyError: If ``size`` is not a key of ``_sizes``.
        ValueError: If ``n_groups`` or ``n_group_cols`` is out of range for
            the selected size.
    """
    n_rows, n_cols = _sizes[size]

    # Validate with real exceptions: asserts vanish under ``python -O``.
    if n_groups < 1:
        raise ValueError(f"n_groups must be at least 1, got {n_groups}")
    if n_rows < n_groups * 2:
        raise ValueError(
            f"size {size!r} has {n_rows} rows, too few for {n_groups} groups "
            f"(need at least {n_groups * 2})"
        )
    if not 1 <= n_group_cols < 5:
        raise ValueError(
            f"n_group_cols must be between 1 and 4, got {n_group_cols}"
        )
    if n_cols < n_group_cols:
        raise ValueError(
            f"size {size!r} has only {n_cols} columns, "
            f"cannot hold {n_group_cols} group columns"
        )

    rng = np.random.default_rng(seed=seed)
    random_cols = n_cols - n_group_cols
    df = pd.DataFrame(
        rng.uniform(size=(n_rows, random_cols)),
        columns=[f"c{i}" for i in range(random_cols)],
    )
    gb_names = "ABCDE"
    gb_cols = list(gb_names[:n_group_cols])

    # Pick n_groups - 1 distinct interior cut points; together with n_rows
    # they delimit n_groups non-empty partitions. The points are drawn one at
    # a time on purpose: drawing them differently could change the random
    # stream and therefore the frame produced for a given seed.
    splits = set()
    while len(splits) < n_groups - 1:
        splits.add(int(rng.integers(1, n_rows)))
    splits.add(n_rows)

    # Partition i covers rows [bounds[i - 1], bounds[i]), so consecutive
    # differences of the sorted bounds are the partition sizes.
    bounds = np.array(sorted(splits), dtype=np.int64)
    counts = np.diff(bounds, prepend=0)
    values = np.repeat(np.arange(n_groups), counts)
    if not ordered_by_partitions:
        rng.shuffle(values)
    # Internal sanity check, not input validation.
    assert len(np.unique(values)) == n_groups, (len(np.unique(values)), n_groups)

    if n_group_cols == 1:
        df[gb_names[0]] = values.astype(np.int64)
    else:
        # Smallest per-column cardinality whose combinations cover n_groups.
        num = int(np.ceil(np.power(n_groups, 1 / n_group_cols)))
        # Guard against floating-point rounding of the root.
        while num**n_group_cols < n_groups:
            num += 1
        # Row k of comb is the label tuple assigned to partition k.
        comb = np.array(
            list(product(range(num), repeat=n_group_cols)), dtype=np.int64
        )
        df[gb_cols] = comb[values]

    if log:
        df.info(verbose=False)
    return df, gb_cols
if __name__ == "__main__":
    # Command-line entry point: build one frame and print its summary.
    parser = ArgumentParser(
        description=(
            "Create a random dataframe with a given number of partitions "
            "of random size."
        )
    )
    # Restricting to the known labels turns a typo into a usage error
    # instead of a KeyError traceback.
    parser.add_argument(
        "--size",
        default="1g",
        choices=list(_sizes),
        help="size label selecting the frame shape (default: %(default)s)",
    )
    # ``--n_groups`` is kept as an alias of the hyphenated spelling so that
    # existing invocations keep working; both store into ``n_groups``.
    parser.add_argument(
        "--n-groups",
        "--n_groups",
        type=int,
        default=100,
        help="number of partitions (default: %(default)s)",
    )
    parser.add_argument(
        "--n-group-cols",
        type=int,
        help="number of label columns identifying a partition",
    )
    parser.add_argument("--seed", type=int, help="seed for the random generator")
    parser.add_argument(
        "--ordered-by-partitions",
        action="store_true",
        help="keep rows of the same partition contiguous instead of shuffled",
    )
    args = parser.parse_args()
    # Options left unset parse as None and are dropped here so that the
    # defaults declared by make_df apply.
    overrides = {k: v for k, v in vars(args).items() if v is not None}
    make_df(**overrides, log=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment