Skip to content

Instantly share code, notes, and snippets.

@jlinoff
Last active April 22, 2020 14:33
Show Gist options
  • Save jlinoff/29d7f5f0de4c2c99be256a7e8c9873fa to your computer and use it in GitHub Desktop.
Save jlinoff/29d7f5f0de4c2c99be256a7e8c9873fa to your computer and use it in GitHub Desktop.
PyArrow-based Python script that checks a parquet file to see if it can be read
#!/usr/bin/env python3
'''
This script accepts a list of parquet files and then runs through them
to see which column entries in the schema can be read. It is useful
for determining where pyarrow does not yet support a particular
schema type.
'''
import os
import sys
import pyarrow.parquet as pq
import pyarrow.lib
# Colors
# ANSI escape sequences wrapped around the per-entry result text.
# check() resets all three to '' when color is disabled via NC.
_R = '\x1b[31m'  # red foreground, used for FAIL results
_G = '\x1b[32m'  # green foreground, used for OK results
_0 = '\x1b[0m'  # reset attributes after the colored text
def print_entry(entry: str, arg: str, i: int):
    '''
    Check a single schema entry by reading its column from the file.

    entry is one "<column path>: <type>" line taken from the printed
    parquet schema, arg is the parquet file name and i is the entry
    number shown at the start of the report line.

    Prints one report line (number, entry, then OK or FAIL with the
    reason) and returns True if the column could be read, False if
    pyarrow reported it as unsupported, invalid or unreadable.
    '''
    # Split on the first colon only. Unpacking entry.split(':') raised
    # ValueError whenever the text after the column path held another colon.
    path = entry.partition(':')[0].strip()
    print(f'{i:>5} {entry} ', end='')
    # pylint: disable=c-extension-no-member
    read_errors = (
        pyarrow.lib.ArrowNotImplementedError,
        pyarrow.lib.ArrowInvalid,
        OSError,
    )
    try:
        # Only success or failure of the read matters; the table is discarded.
        pq.read_table(arg, columns=[path])
    except read_errors as exc:
        print(f'{_R}FAIL: {exc}{_0}')
        return False
    print(f'{_G}OK{_0}')
    return True
def check(file_names: list):
    '''
    Check each parquet file on the command line.

    For every file name in file_names this prints the file name, the
    file's parquet metadata, one OK/FAIL line per schema entry (see
    print_entry) and a per-file summary of how many entries passed and
    how many failed.

    Color is on by default. Setting the NC environment variable to
    anything other than 'color' turns off every escape sequence.
    '''
    global _R, _G, _0  # pylint: disable=global-statement
    # Allow the user to disable color by setting NC=<anything> (except 'color').
    color = os.getenv('NC', 'color') == 'color'
    if color:
        bold, reset = '\x1b[1m', '\x1b[0m'
    else:
        # The file-name heading used to stay bold even with NC set, so its
        # escape codes are dropped together with the module-level colors.
        _R = _G = _0 = ''
        bold = reset = ''

    for arg in file_names:
        print(f'{bold}{arg}{reset}')
        parquet_file = pq.ParquetFile(arg)
        print(f'metadata: {parquet_file.metadata}')

        # The printed schema has one line per entry; only lines that hold
        # a colon are treated as entries to check.
        schema_lines = str(parquet_file.schema).strip().split('\n')
        entries = [line.strip() for line in schema_lines if ':' in line]

        num_passed = 0
        for i, entry in enumerate(entries, start=1):
            if print_entry(entry, arg, i):
                num_passed += 1
        total = len(entries)
        num_failed = total - num_passed
        print(f'SUMMARY: total={total}, num_passed={num_passed}, num_failed={num_failed}')
# Script entry point: every command-line argument is passed to check()
# as the name of a parquet file.
if __name__ == '__main__':
    parquet_paths = sys.argv[1:]
    check(parquet_paths)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment