Skip to content

Instantly share code, notes, and snippets.

@edouarda edouarda/

Last active Dec 12, 2019
What would you like to do?
import pandas as pd
import numpy as np
import multiprocessing as mp
from functools import partial
def _gen_df(per_chunk, step, out, i):
print("generating df " + str(i) + " with " + str(per_chunk) + " rows..")
start_time = np.datetime64('1990-01-01', 'ns') + np.timedelta64(i * per_chunk, step)
idx = np.array([(start_time + np.timedelta64(i, step))
for i in range(per_chunk)]).astype('datetime64[ns]')
df = pd.DataFrame(index=idx,
data={'col1': np.random.randint(0,1e9,size=per_chunk),
'col2': np.random.uniform(0,1e9,size=per_chunk)})
outfile = out + str(i) + ".csv"
df.to_csv(outfile, header=False)
# Driver: split `total` rows across `chunks` CSV files, one worker per chunk.
total = 1e10
chunks = 1000
per_chunk = int(total / chunks)
step = 'us'  # each new row moves ahead 1 microsecond
output_dir = 'out/'

if __name__ == '__main__':
    # The __main__ guard is required: multiprocessing re-imports this module
    # in each worker process (always under the spawn start method), and an
    # unguarded Pool would recurse into spawning more pools.
    with mp.Pool(mp.cpu_count()) as pool:
        # Blocking map() instead of map_async(): the original never called
        # .get() on the async result, so leaving the `with` block terminated
        # the pool while chunks were still being generated.
        pool.map(partial(_gen_df, per_chunk, step, output_dir), range(chunks), 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.