Skip to content

Instantly share code, notes, and snippets.

@JakeColor
Created August 27, 2021 21:14
Show Gist options
  • Save JakeColor/9d04d14bd0148ac7ee4bf331ceb21c27 to your computer and use it in GitHub Desktop.
Save JakeColor/9d04d14bd0148ac7ee4bf331ceb21c27 to your computer and use it in GitHub Desktop.
generate_sample_datasets
""" Generates a random dataset. """
import argparse
import os
import shutil
import numpy as np
import pandas as pd
import torch
# Width (in digits) to which the batch index is zero-padded when building
# output file names.
_FILE_CT_DIGITS = 4
def generate_random_dataset(shape=(30_000, 87)):
    """Return an array of uniform random samples drawn from [0, 1).

    Args:
        shape: Sequence of dimension sizes for the generated array. Any
            number of dimensions is accepted; defaults to ``(30_000, 87)``.

    Returns:
        A ``numpy.ndarray`` of floats with the requested shape.
    """
    # Unpack the whole shape instead of indexing only its first two entries,
    # so 1-D and N-D shapes are honoured rather than failing or being
    # silently truncated to two dimensions.
    return np.random.rand(*shape)
def write_arr_as_numpy(arr, file_path):
    """Serialize ``arr`` to ``file_path`` in NumPy's binary ``.npy`` format.

    The destination is opened in binary write mode, so an existing file at
    that path is overwritten. Because an open file object (not a path) is
    handed to ``np.save``, the path is used exactly as given.

    Args:
        arr: Array to store.
        file_path: Path of the file to write.
    """
    with open(file_path, mode='wb') as handle:
        np.save(file=handle, arr=arr)
if __name__ == "__main__":
    # Script entry point: for every requested batch, generate one random
    # array and store it once per floating-point format, each format in its
    # own sub-directory of --data-dir.
    parser = argparse.ArgumentParser(description='Generate Datasets')
    # Both options are mandatory; without them the script previously failed
    # later with an opaque TypeError on a None value.
    parser.add_argument('-d', '--data-dir', type=str, required=True,
                        help='Where to save generated data')
    parser.add_argument('-b', '--batches', type=int, required=True,
                        help='How many batches to generate')
    args = parser.parse_args()

    # Sub-directory name -> dtype stored in that sub-directory.
    dtypes = {"f32": np.float32, "f64": np.float64}

    sub_dirs = {}
    for file_format in dtypes:
        sub_dir = os.path.join(args.data_dir, file_format)
        # Start from an empty directory. Only remove it when it is actually
        # there, so the very first run does not die with FileNotFoundError.
        if os.path.exists(sub_dir):
            shutil.rmtree(sub_dir)
        os.makedirs(sub_dir)
        sub_dirs[file_format] = sub_dir

    for i in range(args.batches):
        arr = generate_random_dataset()
        file_name = "10000" + "-" + str(i).zfill(_FILE_CT_DIGITS)
        # Iterate over (format, path) pairs: iterating the dict alone yields
        # only the format names, which sent files to the wrong location.
        for file_format, sub_dir in sub_dirs.items():
            # Convert from the untouched source array each time so the f64
            # output is not derived from already-truncated f32 values. The
            # dtype is looked up by format name, not by inspecting the path.
            converted = arr.astype(dtypes[file_format])
            file_path = os.path.join(sub_dir, file_name + ".npy")
            write_arr_as_numpy(converted, file_path)
# python ~/app/src/generate_datasets_dtype.py -d /mnt/data/a100-slowness-debug -b 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment