Skip to content

Instantly share code, notes, and snippets.

@grantmwilliams
Created January 28, 2022 17:02
Show Gist options
  • Save grantmwilliams/1ceb490312c59e4fb6e4bc15b57e9707 to your computer and use it in GitHub Desktop.
Save grantmwilliams/1ceb490312c59e4fb6e4bc15b57e9707 to your computer and use it in GitHub Desktop.
Pyarrow uint32() -> int64() bug?
import os

import pyarrow as pa
import pyarrow.parquet as pq
# Label used in the output file name for each Arrow integer type under test.
file_name_mapping = {
    pa.int32(): "int32",
    pa.uint32(): "uint32",
    pa.int64(): "int64",
    pa.uint64(): "uint64",
}

# The types to round-trip, in the same order as the mapping above
# (dicts iterate in insertion order).
int_types = list(file_name_mapping)
def write_file(pa_type, file_name):
    """Write a small two-column table to *file_name* as snappy-compressed Parquet.

    The table has a string column ``idx`` and a column ``val`` whose Arrow
    type is *pa_type*; both hold three fixed rows.

    Args:
        pa_type: Arrow data type to use for the ``val`` column.
        file_name: Destination passed straight to ``pq.write_table``.

    NOTE(review): the writer's default Parquet format version is used here;
    presumably that is what decides how unsigned types are stored -- confirm
    against the ``pq.write_table`` ``version`` documentation.
    """
    columns = {
        'idx': ["A", "B", "C"],
        'val': [4, 5, 6],
    }
    fields = [
        pa.field("idx", pa.string()),
        pa.field("val", pa_type),
    ]
    table = pa.Table.from_pydict(columns, schema=pa.schema(fields))
    pq.write_table(table, file_name, compression='snappy')
def read_file(pa_type, file_name):
    """Open the Parquet file at *file_name* and print how ``val`` was stored.

    Prints one line comparing *pa_type* (the type the caller says it wrote)
    with the Arrow type reported for the ``val`` field on read, followed by
    the Parquet-level schema of the column at index 1.

    Args:
        pa_type: Arrow type the file was written with; only used for display.
        file_name: Path of the Parquet file to inspect.
    """
    with open(file_name, "rb") as handle:
        reader = pq.ParquetFile(handle)

        # Arrow-side view of the schema: the type a reader would get back.
        arrow_schema = reader.schema_arrow
        col_type = arrow_schema.field("val").type
        print(f"pa_type: {pa_type} -- schema_type: {col_type}")

        # Parquet-side view: physical/logical type as recorded in the file.
        parquet_column = reader.schema.column(1)
        print(parquet_column)
# Round-trip every integer type through a Parquet file and report the type
# seen on read, with a separator line before the first and after each entry.

# The files are written into a nested relative directory; create it up front
# so the script also works from a location where it does not exist yet.
OUTPUT_DIR = "pyarrow_bug/data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("-" * 40)
for pa_type in int_types:
    file_name = f"{OUTPUT_DIR}/{file_name_mapping[pa_type]}_file.snappy.parquet"
    write_file(pa_type, file_name)
    read_file(pa_type, file_name)
    print("-" * 40)
@grantmwilliams
Copy link
Author

The output from running the code looks like:

----------------------------------------
pa_type: int32 -- schema_type: int32
<ParquetColumnSchema>
  name: val
  path: val
  max_definition_level: 1
  max_repetition_level: 0
  physical_type: INT32
  logical_type: None
  converted_type (legacy): NONE
----------------------------------------
pa_type: uint32 -- schema_type: int64
<ParquetColumnSchema>
  name: val
  path: val
  max_definition_level: 1
  max_repetition_level: 0
  physical_type: INT64
  logical_type: None
  converted_type (legacy): NONE
----------------------------------------
pa_type: int64 -- schema_type: int64
<ParquetColumnSchema>
  name: val
  path: val
  max_definition_level: 1
  max_repetition_level: 0
  physical_type: INT64
  logical_type: None
  converted_type (legacy): NONE
----------------------------------------
pa_type: uint64 -- schema_type: uint64
<ParquetColumnSchema>
  name: val
  path: val
  max_definition_level: 1
  max_repetition_level: 0
  physical_type: INT64
  logical_type: Int(bitWidth=64, isSigned=false)
  converted_type (legacy): UINT_64
----------------------------------------

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment