Skip to content

Instantly share code, notes, and snippets.

@erelinz
Created August 7, 2023 20:28
Show Gist options
  • Save erelinz/b4dbd83522ee36bc52a34c89edace948 to your computer and use it in GitHub Desktop.
Save erelinz/b4dbd83522ee36bc52a34c89edace948 to your computer and use it in GitHub Desktop.
Read metadata and schema from parquet file and show the compression ratio and type for each column.
#!/usr/bin/python3
import sys
import pyarrow.parquet as pq
import os
def inspect_parquet(file_name):
    """Print per-column compression details for a parquet file.

    Reads the file's metadata with ``pq.read_metadata`` and prints:

    * a summary line with the row count and number of schema columns,
    * for every column and every row group, the compression codec and the
      compressed/uncompressed size ratio of that column chunk,
    * for every column, the share of the on-disk file size taken by the
      column's compressed data across all row groups.

    Args:
        file_name: Path to the parquet file to inspect.
    """
    metadata = pq.read_metadata(file_name)
    schema = metadata.schema
    print(f"{metadata.num_rows} rows, {len(schema)} columns.")

    # Size of the file on disk; the denominator for the per-column share.
    total_size = os.path.getsize(file_name)

    # Fetch each row group's metadata once instead of once per column.
    row_groups = [metadata.row_group(j) for j in range(metadata.num_row_groups)]

    for i, column_schema in enumerate(schema):
        # Compressed bytes of this column summed over all row groups.
        total_compressed_size_column = 0

        for j, row_group in enumerate(row_groups):
            column = row_group.column(i)
            compression = column.compression
            compressed_size = column.total_compressed_size
            uncompressed_size = column.total_uncompressed_size
            total_compressed_size_column += compressed_size

            # A chunk reporting zero uncompressed bytes has no meaningful
            # ratio; report "n/a" rather than raising ZeroDivisionError.
            if uncompressed_size:
                ratio = f"{compressed_size/uncompressed_size:.1%}"
            else:
                ratio = "n/a"

            print(
                f"Column {i+1} - {column_schema.name}, type: {column_schema.physical_type}, row group {j+1}: {compression}, {ratio} compression ratio"
            )

        # Share of the whole file occupied by this column's compressed data.
        print(
            f"Percentage of total file size by column {i+1} - {column_schema.name}: {total_compressed_size_column/total_size*100:.2f}%"
        )
def main():
    """Command-line entry point.

    Expects exactly one argument, the path of the parquet file to inspect.
    With any other argument count, prints a usage line and exits with
    status 1; otherwise passes the path to ``inspect_parquet``.
    """
    arguments = sys.argv[1:]

    # Exactly one positional argument is required.
    if len(arguments) != 1:
        print("Usage: python inspect_parquet.py <parquet_file>")
        sys.exit(1)

    (file_name,) = arguments
    inspect_parquet(file_name)
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment