Skip to content

Instantly share code, notes, and snippets.

@JohnEmhoff
Created January 26, 2020 01:16
Show Gist options
  • Save JohnEmhoff/274f6e05cba3f17a16683eb394bfe6b5 to your computer and use it in GitHub Desktop.
Save JohnEmhoff/274f6e05cba3f17a16683eb394bfe6b5 to your computer and use it in GitHub Desktop.
import contextlib
import random

import pyorc
# Test schema: 25 pairs of columns, a0/b0 through a24/b24, where every
# "a" column is a string and every "b" column is a float.
fields = ",".join(f"a{col}:string,b{col}:float" for col in range(25))
spec = "struct<" + fields + ">"
def gen_row(pairs=25):
    """Build one random test row of alternating string and float values.

    Args:
        pairs: Number of (string, float) value pairs in the row. The default
            of 25 matches the 25 a/b column pairs of the struct schema.

    Returns:
        A tuple of ``2 * pairs`` values: each even position holds the text
        of a random float repeated 10 times, each odd position holds a
        random float from ``random.random()``.
    """
    row = []
    for _ in range(pairs):
        # Repeating the digits 10 times makes the string value longer.
        row.append(str(random.random()) * 10)
        row.append(random.random())
    return tuple(row)
# Pre-generate a pool of 10000 rows once; the write loop below only samples
# from it instead of building a fresh row per write.
data = [gen_row() for _ in range(10000)]

# Run the same workload 30 times. Every pass reopens out-0.orc .. out-99.orc
# in "wb" mode, so the files from the previous pass are overwritten.
for iteration in range(30):
    print(iteration)
    # ExitStack unwinds in reverse registration order, so each writer is
    # closed before the file it wraps, and both are released even if a
    # write fails part-way through.
    with contextlib.ExitStack() as stack:
        writers = []
        # create 100 writers at a time
        for n in range(100):
            fp = stack.enter_context(open(f"out-{n}.orc", "wb"))
            # note the small stripe size
            writer = pyorc.Writer(fp, spec, stripe_size=1 * 1024 * 1024)
            stack.callback(writer.close)
            writers.append(writer)
        # write 1 million rows, each to a random writer
        for _ in range(1000000):
            random.choice(writers).write(random.choice(data))
    # Leaving the with-block has closed all of our writers and their files.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment