Skip to content

Instantly share code, notes, and snippets.

@edsu
Created July 19, 2024 12:47
Show Gist options
  • Save edsu/f3231a5b2d6540aae4b6f9113659ef0d to your computer and use it in GitHub Desktop.
import csv
import sys
from itertools import batched
import pyarrow
from pyarrow.parquet import ParquetWriter
# Raise the csv module's per-field size cap to the maximum accepted value so
# rows with very large cells don't raise csv.Error during parsing.
csv.field_size_limit(sys.maxsize)
def csv_to_parquet(csv_file, parquet_file, batch_size=10_000):
    """Convert a CSV file to a Parquet file, streaming in batches.

    Every column is written as a Parquet string type (the CSV header row
    supplies the column names); no type inference is attempted.

    Args:
        csv_file: path to the input CSV file (first row must be a header).
        parquet_file: path where the Parquet output is written.
        batch_size: number of CSV rows converted per Parquet row group.

    Raises:
        csv.Error: if a row cannot be parsed.
        OSError: if either file cannot be opened.
    """
    # newline='' is required by the csv module so embedded newlines inside
    # quoted fields survive; the context manager fixes the original's leak
    # of the never-closed input handle.
    with open(csv_file, newline="") as csv_input:
        reader = csv.DictReader(csv_input)
        # Naively assume all columns are strings.
        schema = pyarrow.schema(
            [(name, pyarrow.string()) for name in reader.fieldnames]
        )
        # Pass the path directly: ParquetWriter then owns (and closes) the
        # underlying file, unlike the original's dangling open(..., 'wb').
        with ParquetWriter(parquet_file, schema, compression="SNAPPY") as writer:
            for rows in batched(reader, batch_size):
                # batched() yields tuples; from_pylist documents a list of
                # mappings, so convert explicitly.
                table = pyarrow.Table.from_pylist(list(rows), schema)
                writer.write_table(table)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment