Skip to content

Instantly share code, notes, and snippets.

@bivald
Last active June 14, 2022 12:10
Show Gist options
  • Save bivald/f93448eaf25808284c4029c691a58a6a to your computer and use it in GitHub Desktop.
# Repro script: a Parquet file produced with pyarrow==2.0.0 using these exact
# writer settings cannot be read back by pyarrow==8.0.0.
import pyarrow as pa
import pyarrow.parquet as pq

# Table layout: an int, a plain string, a float, and a dictionary-encoded
# string column (the dictionary column is what gets special writer handling).
file_schema = pa.schema([
    ("col1", pa.int8()),
    ("col2", pa.string()),
    ("col3", pa.float64()),
    ("col4", pa.dictionary(pa.int32(), pa.string(), ordered=False)),
])

columns = [
    [1, 2, 3, 4, 5],
    ["a", "b", "c", "d", "e"],
    [1.0, 2.0, 3.0, 4.0, 5.0],
    ["a", "a", "a", "b", "b"],
]
tbl = pa.table(columns, schema=file_schema)

output_file = 'test2.parq'

# Columns to dictionary-encode must be passed as *bytes* names, as seen on
# https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python
dictionary_columns = [name.encode('utf-8') for name in ['col4']]

with pq.ParquetWriter(
    output_file,
    file_schema,
    compression='snappy',
    allow_truncated_timestamps=True,
    version='2.0',  # highest format version offered by pyarrow 2.0.0
    data_page_version='2.0',  # highest data-page version offered by pyarrow 2.0.0
    use_dictionary=dictionary_columns,
) as writer:
    writer.write_table(tbl, row_group_size=10000)
@bivald
Copy link
Author

bivald commented Jun 14, 2022

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment