Skip to content

Instantly share code, notes, and snippets.

@bioinfornatics
Last active March 17, 2020 16:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bioinfornatics/c82398fa22339d34f41b3580c988c308 to your computer and use it in GitHub Desktop.
Save bioinfornatics/c82398fa22339d34f41b3580c988c308 to your computer and use it in GitHub Desktop.
PyArrow: write a 2D list (a list of rows) into a table
#!/usr/bin/env python3
import pyarrow.parquet as pq
import pyarrow as pa
import random
import string
from uuid import uuid4
def random_str(min_length: int = 40, max_length: int = 50) -> str:
    """Return a random string of lowercase ASCII letters.

    The length is drawn uniformly from ``min_length`` to ``max_length``
    (both bounds inclusive). The defaults reproduce the original
    40-50 character range, so existing ``random_str()`` calls are unaffected.

    Uses the ``random`` module, so the result is not suitable for secrets.

    Args:
        min_length: Smallest length the returned string may have.
        max_length: Largest length the returned string may have.

    Returns:
        A string made only of characters from ``string.ascii_lowercase``.

    Raises:
        ValueError: If ``min_length`` is greater than ``max_length``
            (raised by ``random.randint``).
    """
    length = random.randint(min_length, max_length)
    # One call draws all letters (with replacement) instead of one
    # random.choice() call per character.
    return ''.join(random.choices(string.ascii_lowercase, k=length))
if __name__ == '__main__':
    # Target schema: five string columns, in the same order as the items
    # of each generated row.
    my_schema = pa.schema([
        ('txt1', pa.string()),
        ('txt2', pa.string()),
        ('txt3', pa.string()),
        ('txt4', pa.string()),
        ('txt5', pa.string()),
    ])

    # Row-oriented sample data (a 2D list): one UUID string followed by
    # four random strings per row.
    nb_rows = 100
    rows = [
        [str(uuid4())] + [random_str() for _ in range(4)]
        for _ in range(nb_rows)
    ]

    # A table is assembled from per-column arrays, so transpose the rows.
    # The column count and types come from the schema rather than from
    # rows[0], so an empty `rows` list still yields one (empty, correctly
    # typed) array per schema field instead of raising IndexError.
    arrays = [
        pa.array([row[i] for row in rows], type=column_type)
        for i, column_type in enumerate(my_schema.types)
    ]
    table = pa.Table.from_arrays(arrays, schema=my_schema)

    with pq.ParquetWriter(
        'example.parquet',
        table.schema,
        use_dictionary=True,
        compression='snappy',
    ) as writer:
        writer.write_table(table)  # write one row_group
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment