Generate sample partitioned parquet data using pyarrow
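The script below writes Hive-style partition directories, with one Snappy-compressed Parquet file per `submission_date_s3`/`sample_id` pair, e.g. main_summary/v4/submission_date_s3=20170101/sample_id=1/data1.snappy.parquet. A sketch of reading the data back follows the script.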
# Generate a sample dataset with two partitioning fields,
# `submission_date_s3` and `sample_id`, and one or more
# actual parquet fields.
#
# Arrow and Parquet reference material at
# https://arrow.apache.org/docs/python/parquet.html
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import random
dataset_name = "main_summary"
version = "v4"
compression = "snappy"
rows_per_partition = 10
columns = ["document_id", "a", "b"]
current_id = 1
for d in range(1, 4):
    day = "submission_date_s3=2017010{}".format(d)
    for p in range(1, 4):
        part = "sample_id={}".format(p)
        path = os.path.join(dataset_name, version, day, part)
        # Create the partition directory if it doesn't already exist.
        os.makedirs(path, exist_ok=True)
        output_file = "data{}.{}.parquet".format(current_id, compression)
        output_path = os.path.join(path, output_file)
        # Generate some data for this partition.
        # Each row will have `d` columns. The `document_id` column will be
        # a serial sequence of ints, while the other values will be
        # random ints between 1 and 10.
        # Each `submission_date_s3` partition has an increasing number of
        # columns to help test schema evolution.
        data = {k: [random.randint(1, 10) for _ in range(rows_per_partition)]
                for k in columns[0:d]}
        data["document_id"] = list(range(current_id, current_id + rows_per_partition))
        current_id += rows_per_partition
        df = pd.DataFrame(data)
        # `preserve_index=False` avoids creating the extra
        # `__index_level_0__` field.
        table = pa.Table.from_pandas(df, preserve_index=False)
        # Write one Parquet file into this partition's directory.
        pq.write_table(table, output_path, compression=compression)
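
To sanity-check the output, the data can be read back with pyarrow's `ParquetDataset` (a minimal sketch; the path comes from the settings above). Note that the days deliberately have different schemas, so reading the whole tree at once may need a reader that merges schemas; a single day, whose `sample_id` partitions share one schema, reads back directly:

import pyarrow.parquet as pq

# Read back one day's partitions. Within a day, every `sample_id`
# directory shares the same schema, so no schema merging is needed,
# and `sample_id` is recovered as a column from the directory names.
day = pq.ParquetDataset("main_summary/v4/submission_date_s3=20170101")
df = day.read().to_pandas()
print(df.head())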