Generate sample partitioned parquet data using pyarrow
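The script below writes Hive-style partition directories, with one Snappy-compressed Parquet file per `submission_date_s3`/`sample_id` pair, e.g. main_summary/v4/submission_date_s3=20170101/sample_id=1/data1.snappy.parquet. A sketch of reading the data back follows the script.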
# Generate a sample dataset with two partitioning fields,
# `submission_date_s3` and `sample_id`, and one or more
# actual parquet fields.
#
# Arrow and Parquet reference material at
# https://arrow.apache.org/docs/python/parquet.html
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import random
dataset_name = "main_summary"
version = "v4"
compression = "snappy"
rows_per_partition = 10
columns = ["document_id", "a", "b"]
current_id = 1
for d in range(1, 4):
    day = "submission_date_s3=2017010{}".format(d)
    for p in range(1, 4):
        part = "sample_id={}".format(p)
        path = os.path.join(dataset_name, version, day, part)
        # Create the partition directory if it doesn't already exist.
        os.makedirs(path, exist_ok=True)
        output_file = "data{}.{}.parquet".format(current_id, compression)
        output_path = os.path.join(path, output_file)
        # Generate some data for this partition.
        # Each row will have `d` columns. The `document_id` column will be
        # a serial sequence of ints, while the other values will be
        # random ints between 1 and 10.
        # Each `submission_date_s3` partition has an increasing number of
        # columns to help test schema evolution.
        data = {k: [random.randint(1, 10) for _ in range(rows_per_partition)]
                for k in columns[0:d]}
        data["document_id"] = list(range(current_id, current_id + rows_per_partition))
        current_id += rows_per_partition
        df = pd.DataFrame(data)
        # `preserve_index=False` avoids creating the extra
        # `__index_level_0__` field.
        table = pa.Table.from_pandas(df, preserve_index=False)
        # Write one Parquet file into this partition's directory.
        pq.write_table(table, output_path, compression=compression)
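
To sanity-check the output, the data can be read back with pyarrow's `ParquetDataset` (a minimal sketch; the path comes from the settings above). Note that the days deliberately have different schemas, so reading the whole tree at once may need a reader that merges schemas; a single day, whose `sample_id` partitions share one schema, reads back directly:

import pyarrow.parquet as pq

# Read back one day's partitions. Within a day, every `sample_id`
# directory shares the same schema, so no schema merging is needed,
# and `sample_id` is recovered as a column from the directory names.
day = pq.ParquetDataset("main_summary/v4/submission_date_s3=20170101")
df = day.read().to_pandas()
print(df.head())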