Skip to content

Instantly share code, notes, and snippets.

@bivald
Last active June 14, 2022 12:10
Show Gist options
  • Save bivald/f93448eaf25808284c4029c691a58a6a to your computer and use it in GitHub Desktop.
# Repro script: a Parquet file produced with pyarrow==2.0.0 using these exact
# writer settings cannot be read back by pyarrow==8.0.0.
import pyarrow as pa
import pyarrow.parquet as pq

# Table layout: an int, a plain string, a float, and a dictionary-encoded
# string column (the dictionary column is what gets special writer handling).
file_schema = pa.schema([
    ("col1", pa.int8()),
    ("col2", pa.string()),
    ("col3", pa.float64()),
    ("col4", pa.dictionary(pa.int32(), pa.string(), ordered=False)),
])

columns = [
    [1, 2, 3, 4, 5],
    ["a", "b", "c", "d", "e"],
    [1.0, 2.0, 3.0, 4.0, 5.0],
    ["a", "a", "a", "b", "b"],
]
tbl = pa.table(columns, schema=file_schema)

output_file = 'test2.parq'

# Columns to dictionary-encode must be passed as *bytes* names, as seen on
# https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python
dictionary_columns = [name.encode('utf-8') for name in ['col4']]

with pq.ParquetWriter(
    output_file,
    file_schema,
    compression='snappy',
    allow_truncated_timestamps=True,
    version='2.0',  # highest format version offered by pyarrow 2.0.0
    data_page_version='2.0',  # highest data-page version offered by pyarrow 2.0.0
    use_dictionary=dictionary_columns,
) as writer:
    writer.write_table(tbl, row_group_size=10000)
@bivald
Copy link
Author

bivald commented Jun 14, 2022

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment