Created
August 7, 2023 20:28
-
-
Save erelinz/b4dbd83522ee36bc52a34c89edace948 to your computer and use it in GitHub Desktop.
Read the metadata and schema from a Parquet file and show the compression type and compression ratio for each column.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import sys | |
import pyarrow.parquet as pq | |
import os | |
def inspect_parquet(file_name):
    """Print per-column compression details for a parquet file.

    For every column and every row group, prints the column's name, physical
    type, compression setting and the compressed/uncompressed size ratio.
    After a column's row groups have been listed, prints that column's summed
    compressed size as a percentage of the file's on-disk size.

    Args:
        file_name: Path of the parquet file to inspect.
    """
    # Only the file metadata is read here; sizes below come from it.
    metadata = pq.read_metadata(file_name)
    schema = metadata.schema
    print(f"{metadata.num_rows} rows, {len(schema)} columns.")

    # On-disk size of the whole file, used as the denominator for the
    # per-column share printed after each column's row groups.
    total_size = os.path.getsize(file_name)

    for i, column_schema in enumerate(schema):
        total_compressed_size_column = 0

        for j in range(metadata.num_row_groups):
            column = metadata.row_group(j).column(i)
            compression = column.compression
            compressed_size = column.total_compressed_size
            uncompressed_size = column.total_uncompressed_size
            total_compressed_size_column += compressed_size

            # A column chunk may report an uncompressed size of 0; avoid a
            # ZeroDivisionError and report the ratio as unavailable instead.
            if uncompressed_size:
                ratio = f"{compressed_size/uncompressed_size:.1%}"
            else:
                ratio = "n/a"

            print(
                f"Column {i+1} - {column_schema.name}, type: {column_schema.physical_type}, row group {j+1}: {compression}, {ratio} compression ratio"
            )

        print(
            f"Percentage of total file size by column {i+1} - {column_schema.name}: {total_compressed_size_column/total_size*100:.2f}%"
        )
def main():
    """Run the inspector on the single command-line argument.

    Expects exactly one argument, the parquet file path. With any other
    argument count, prints a usage line and exits with status 1.
    """
    cli_args = sys.argv[1:]

    # Happy path first: exactly one positional argument was supplied.
    if len(cli_args) == 1:
        inspect_parquet(cli_args[0])
        return

    print("Usage: python inspect_parquet.py <parquet_file>")
    sys.exit(1)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment