Skip to content

Instantly share code, notes, and snippets.

Last active Dec 12, 2019
What would you like to do?
import pandas as pd
import numpy as np
import multiprocessing as mp
from functools import partial
def _gen_df(per_chunk, step, out, i):
print("generating df " + str(i) + " with " + str(per_chunk) + " rows..")
start_time = np.datetime64('1990-01-01', 'ns') + np.timedelta64(i * per_chunk, step)
idx = np.array([(start_time + np.timedelta64(i, step))
for i in range(per_chunk)]).astype('datetime64[ns]')
df = pd.DataFrame(index=idx,
data={'col1': np.random.randint(0,1e9,size=per_chunk),
'col2': np.random.uniform(0,1e9,size=per_chunk)})
outfile = out + str(i) + ".csv"
df.to_csv(outfile, header=False)
# --- configuration --------------------------------------------------------
total = 1e10                      # total number of rows across all chunks
chunks = 1000                     # number of CSV files to produce
per_chunk = int(total / chunks)   # rows per chunk (10M)
step = 'us'                       # each new row moves ahead 1 microsecond
output_dir = 'out/'

if __name__ == '__main__':
    # Guard is required for spawn-based start methods (Windows/macOS);
    # without it, each worker would re-execute this module top-level code.
    with mp.Pool(mp.cpu_count()) as pool:
        # BUGFIX: the original used map_async() and never called .get()/.wait()
        # on the AsyncResult; Pool.__exit__ then terminate()s the pool
        # immediately, killing the workers before the chunks are written.
        # Blocking map() waits for all chunks and re-raises worker errors.
        results = pool.map(partial(_gen_df, per_chunk, step, output_dir),
                           range(chunks), chunksize=1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment