@bivald
Created September 21, 2020 09:11
import time
import random
import string
import uuid
import os

import pyarrow as pa
pa.jemalloc_set_decay_ms(100)  # shorten jemalloc's dirty-page decay so freed memory is returned sooner
import pyarrow.parquet as pq

# Benchmark: write a very wide table (9000 string columns) to Parquet,
# one small row group at a time, timing each write_table call.
columns = 9000
rows = 100

# Build an Arrow schema with one string field per column.
fields = []
for index in range(0, columns):
    fields.append(pa.field(f"column{index}", pa.string()))
arrow_schema = pa.schema(fields)

print("Creating a test dataset to write: this takes about a minute")

# Fill every column with random UUID strings.
rows_dataframe = {}
for index in range(0, columns):
    values = []
    for row_index in range(0, rows):
        values.append(str(uuid.uuid4()))
    rows_dataframe[f"column{index}"] = values

print("Finished creating a test dataset")

output_file = 'test.parq'

pyarrow_table = pa.Table.from_pydict(
    rows_dataframe,
    arrow_schema
)

# Append the same table over and over so the file grows one row group per
# iteration; row_group_size=99 keeps each row group just under the table size.
with pq.ParquetWriter(
    output_file,
    arrow_schema,
) as writer:
    for i in range(0, 50000):
        start_time = time.time()
        writer.write_table(
            pyarrow_table,
            row_group_size=99
        )
        print("write_table", time.time() - start_time)
        print("file size in mb: ", os.stat('test.parq').st_size / 1024 / 1024)
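To sanity-check what the writer produced, a minimal inspection sketch like the one below can be run against the generated test.parq (the file name comes from the script above; the row-group and row counts will depend on how long the write loop was left running):

import pyarrow.parquet as pq

# Open the file lazily and inspect its metadata without reading the data pages.
parquet_file = pq.ParquetFile('test.parq')
meta = parquet_file.metadata
print("row groups:", meta.num_row_groups)
print("columns:   ", meta.num_columns)
print("total rows:", meta.num_rows)

# Read back only a couple of the 9000 columns to keep memory usage small.
subset = pq.read_table('test.parq', columns=['column0', 'column1'])
print(subset.num_rows, subset.num_columns)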