Skip to content

Instantly share code, notes, and snippets.

Last active Dec 12, 2019
What would you like to do?
import pandas as pd
import numpy as np
import multiprocessing as mp
from functools import partial
def _gen_df(per_chunk, step, out, i):
print("generating df " + str(i) + " with " + str(per_chunk) + " rows..")
start_time = np.datetime64('1990-01-01', 'ns') + np.timedelta64(i * per_chunk, step)
idx = np.array([(start_time + np.timedelta64(i, step))
for i in range(per_chunk)]).astype('datetime64[ns]')
df = pd.DataFrame(index=idx,
data={'col1': np.random.randint(0,1e9,size=per_chunk),
'col2': np.random.uniform(0,1e9,size=per_chunk)})
outfile = out + str(i) + ".csv"
df.to_csv(outfile, header=False)
# --- configuration --------------------------------------------------------
total = 1e10                      # total number of rows across all chunks
chunks = 1000                     # number of CSV files to produce
per_chunk = int(total / chunks)   # rows per chunk (10M)
step = 'us'                       # each new row moves ahead 1 microsecond
output_dir = 'out/'

if __name__ == '__main__':
    # Guard is required for spawn-based start methods (Windows/macOS);
    # without it, each worker would re-execute this module top-level code.
    with mp.Pool(mp.cpu_count()) as pool:
        # BUGFIX: the original used map_async() and never called .get()/.wait()
        # on the AsyncResult; Pool.__exit__ then terminate()s the pool
        # immediately, killing the workers before the chunks are written.
        # Blocking map() waits for all chunks and re-raises worker errors.
        results = pool.map(partial(_gen_df, per_chunk, step, output_dir),
                           range(chunks), chunksize=1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment