Skip to content

Instantly share code, notes, and snippets.

@yymao
Created June 3, 2022 21:54
Show Gist options
  • Save yymao/6a938713f8cfe84d01940d1f118daac1 to your computer and use it in GitHub Desktop.
Save yymao/6a938713f8cfe84d01940d1f118daac1 to your computer and use it in GitHub Desktop.
Convert hlist to parquet format
#!/usr/bin/env python
import sys
import pyarrow as pa
import pyarrow.parquet as pq
from helpers.io_utils import hlist2pandas
import tqdm
def main(input_filename):
    """Convert an hlist file to a snappy-compressed Parquet file.

    The input is read through ``hlist2pandas`` in chunks of 1,000,000 rows
    and every chunk is appended to one Parquet file.

    The output path is derived from ``input_filename``: a trailing ``.list``
    is replaced by ``.parquet``; any other name gets ``.parquet`` appended.
    The output path therefore never equals the input path, and ``.list``
    occurring elsewhere in the path (e.g. in a directory name) is left alone.

    Args:
        input_filename: Path of the hlist file to convert.

    Raises:
        StopIteration: If the reader yields no chunks at all.
    """
    list_suffix = ".list"
    if input_filename.endswith(list_suffix):
        output_filename = input_filename[: -len(list_suffix)] + ".parquet"
    else:
        # Never write on top of the file that is being read.
        output_filename = input_filename + ".parquet"

    reader = hlist2pandas(input_filename, sanitize_column_names=True, chunksize=1000000)

    # The first chunk fixes the schema used for the whole output file.
    first_table = pa.Table.from_pandas(next(reader), preserve_index=False)
    schema = first_table.schema

    with pq.ParquetWriter(output_filename, schema, compression="snappy") as writer:
        writer.write_table(first_table)
        for chunk in tqdm.tqdm(reader):
            # Convert with the pinned schema so that per-chunk dtype inference
            # cannot produce a table the writer would reject.
            writer.write_table(
                pa.Table.from_pandas(chunk, schema=schema, preserve_index=False)
            )
if __name__ == "__main__":
    # The script needs the path of the hlist file as its first argument;
    # exit with a usage message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("Usage: {} <hlist_file>".format(sys.argv[0]))
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment