tswast/generate_avro.py

## generate_avro.py
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import random

import fastavro
import fastavro.write


# Number of Avro shard files written by this script.
TOTAL_FILES = 5
# Probability that nullable() returns None instead of a generated value.
NULL_PROBABILITY = 0.01
# Number of rows written to each shard file.
ROWS = 10000
# Characters that group keys are drawn from.
ALPHABET = "abcdefghijklmnopqrstuvwxyz"
# Inclusive upper bound on the length of generated strings and byte strings.
MAX_STRING_LENGTH = 1000
# Bounds handed to random.uniform() for float_col values.
MIN_FLOAT = -1e12
MAX_FLOAT = 1e12
# (first, last) code point pairs. generate_string() picks one pair and then a
# code point inside it; random.randint() includes both ends of the pair.
CHARACTER_RANGES = [
    (0x20, 0x7F),  # Basic Latin: printable ASCII, plus DEL (0x7F)
    (0xA0, 0xFF),  # Latin-1 Supplement, without the C1 control characters
    (0x100, 0x17F),  # Latin Extended-A
    (0x180, 0x24F),  # Latin Extended-B
    (0x3040, 0x309F),  # Hiragana
    (0x30A0, 0x30FF),  # Katakana
]


def nullable(func):
    """Return None with probability NULL_PROBABILITY, otherwise func().

    func is a zero-argument callable; it is only invoked when a value is
    actually going to be returned.
    """
    roll = random.random()
    return None if roll < NULL_PROBABILITY else func()


def generate_group_key():
    """Return a key of three characters drawn independently from ALPHABET."""
    return "".join(random.choice(ALPHABET) for _ in range(3))


def generate_string():
    """Return a random string of 0 to MAX_STRING_LENGTH characters.

    For every character, one entry of CHARACTER_RANGES is picked uniformly
    and then a code point is picked uniformly inside that range (both ends
    included).
    """
    length = random.randint(0, MAX_STRING_LENGTH)
    return "".join(
        chr(random.randint(*random.choice(CHARACTER_RANGES)))
        for _ in range(length)
    )


def generate_bytes():
    """Return a bytes object holding 0 to MAX_STRING_LENGTH random bytes."""
    byte_count = random.randint(0, MAX_STRING_LENGTH)
    # bytes() accepts any iterable of ints in range(256).
    return bytes(random.randint(0, 255) for _ in range(byte_count))


def generate_date():
    """Return a random calendar date as an ISO 8601 string (YYYY-MM-DD).

    The year lies in 1970..2070 and the day is capped at 28 so that every
    year/month combination is a valid date.

    The result is always a string.  Null values are injected by the caller
    through nullable(), so no NULL_PROBABILITY roll is done here: a second
    roll would make date_col null roughly twice as often as the other
    nullable columns.
    """
    year = random.randint(1970, 2070)
    month = random.randint(1, 12)
    day = random.randint(1, 28)
    return datetime.date(year, month, day).isoformat()


def generate_timestamp():
    """Return a random UTC timestamp as an ISO 8601 string.

    The year lies in 1970..2070 and the day is capped at 28 so that every
    year/month combination is valid.  The string uses "T" as the date/time
    separator and carries the UTC offset.
    """
    # Inclusive bounds in the positional order datetime.datetime() expects:
    # year, month, day, hour, minute, second, microsecond.
    bounds = (
        (1970, 2070),
        (1, 12),
        (1, 28),
        (0, 23),
        (0, 59),
        (0, 59),
        (0, 999999),
    )
    parts = [random.randint(low, high) for low, high in bounds]
    moment = datetime.datetime(*parts, tzinfo=datetime.timezone.utc)
    return moment.isoformat("T")


def generate_row():
    """Build one random record with the column names used by json_schema.

    group_key is always populated; every other column is produced through
    nullable() and may therefore be None.
    """

    def random_bool():
        return random.choice((True, False))

    def random_float():
        return random.uniform(MIN_FLOAT, MAX_FLOAT)

    def random_int():
        # Full signed 64-bit range.
        return random.randint(-(2 ** 63), 2 ** 63 - 1)

    # Column order matters: it fixes both the key order of the record and the
    # order in which random numbers are consumed.
    nullable_columns = (
        ("bool_col", random_bool),
        ("bytes_col", generate_bytes),
        ("date_col", generate_date),
        ("float_col", random_float),
        ("int_col", random_int),
        ("string_col", generate_string),
        ("timestamp_col", generate_timestamp),
    )
    row = {"group_key": generate_group_key()}
    for column, generator in nullable_columns:
        row[column] = nullable(generator)
    return row


# Avro record schema for the rows produced by generate_row().  Every column is
# declared as a ["null", T] union so that None values are accepted; dates and
# timestamps are carried as ISO 8601 strings.
# NOTE(review): "sqlType", "googleStorage", "spannerPrimaryKey*" and
# "googleFormatVersion" are not standard Avro attributes; presumably they
# mirror the metadata found in a Cloud Spanner export -- confirm against the
# Spanner import/export format.
# NOTE(review): int_col is declared as the primary key, yet generate_row() can
# emit None for it and its values are random, so uniqueness is not
# guaranteed -- confirm this is acceptable for the intended use.
json_schema = {
    "type": "record",
    "name": "randomized_data",
    "namespace": "spannerexport",
    "fields": [
        {"name": "group_key", "type": ["null", "string"], "sqlType": "STRING(MAX)"},
        {"name": "bool_col", "type": ["null", "boolean"], "sqlType": "BOOL"},
        {"name": "bytes_col", "type": ["null", "bytes"], "sqlType": "BYTES(MAX)"},
        {"name": "date_col", "type": ["null", "string"], "sqlType": "DATE"},
        {"name": "float_col", "type": ["null", "double"], "sqlType": "FLOAT64"},
        {"name": "int_col", "type": ["null", "long"], "sqlType": "INT64"},
        {"name": "string_col", "type": ["null", "string"], "sqlType": "STRING(MAX)"},
        {"name": "timestamp_col", "type": ["null", "string"], "sqlType": "TIMESTAMP"},
    ],
    "googleStorage": "CloudSpanner",
    "spannerPrimaryKey": "`int_col` ASC",
    "spannerPrimaryKey_0": "`int_col` ASC",
    "googleFormatVersion": "1.0.0",
}
# Parse once; the parsed schema is reused for every file written below.
parsed_schema = fastavro.parse_schema(json_schema)

# Write TOTAL_FILES shard files into the current working directory, each one
# holding ROWS freshly generated records.
for filenum in range(TOTAL_FILES):
    # All rows of a shard are generated before its file is opened.
    rows = []
    for _ in range(ROWS):
        rows.append(generate_row())
    shard_name = f"randomized_data.avro-{filenum:05}-of-{TOTAL_FILES:05}"
    with open(shard_name, "wb") as out:
        fastavro.write.writer(out, parsed_schema, rows)

## generate_manifest.py
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import base64
import json
import pathlib
import hashlib


# Manifest entries: one {"name", "md5"} record per generated Avro shard found
# in the directory that contains this script.
files = []

data_dir = pathlib.Path(__file__).parent
# glob() yields paths in arbitrary, filesystem-dependent order; sorting makes
# the manifest identical from run to run for the same set of shard files.
for filepath in sorted(data_dir.glob("randomized_data.avro-*")):
    # The MD5 digest is stored base64-encoded (not as a hex string).
    digest = hashlib.md5(filepath.read_bytes()).digest()
    files.append(
        {
            "name": filepath.name,
            "md5": base64.b64encode(digest).decode("utf-8"),
        }
    )

# NOTE(review): shards are looked up next to this script, but the manifest is
# written to the current working directory -- presumably the script is meant
# to be run from its own directory; confirm.
with open("randomized_data-manifest.json", "w") as f:
    json.dump({"files": files}, f, indent=2)

## spanner-export.json
{
  "tables": [{
    "name": "randomized_data",
    "manifestFile": "randomized_data-manifest.json"
  }]
}
	# Copyright 2020 Google LLC
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# https://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import datetime
	import random

	import fastavro
	import fastavro.write


	# Number of Avro shard files written by this script.
	TOTAL_FILES = 5
	# Probability that nullable() returns None instead of a generated value.
	NULL_PROBABILITY = 0.01
	# Number of rows written to each shard file.
	ROWS = 10000
	# Characters that group keys are drawn from.
	ALPHABET = "abcdefghijklmnopqrstuvwxyz"
	# Inclusive upper bound on the length of generated strings and byte strings.
	MAX_STRING_LENGTH = 1000
	# Bounds handed to random.uniform() for float_col values.
	MIN_FLOAT = -1e12
	MAX_FLOAT = 1e12
	# (first, last) code point pairs. generate_string() picks one pair and then
	# a code point inside it; random.randint() includes both ends of the pair.
	CHARACTER_RANGES = [
	(0x20, 0x7F),  # Basic Latin: printable ASCII, plus DEL (0x7F)
	(0xA0, 0xFF),  # Latin-1 Supplement, without the C1 control characters
	(0x100, 0x17F),  # Latin Extended-A
	(0x180, 0x24F),  # Latin Extended-B
	(0x3040, 0x309F),  # Hiragana
	(0x30A0, 0x30FF),  # Katakana
	]


	def nullable(func):
	    """Return None with probability NULL_PROBABILITY, otherwise func().

	    func is a zero-argument callable; it is only invoked when a value is
	    actually going to be returned.
	    """
	    if random.random() < NULL_PROBABILITY:
	        return None
	    return func()


	def generate_group_key():
	    """Return a key of three characters drawn independently from ALPHABET."""
	    return random.choice(ALPHABET) + random.choice(ALPHABET) + random.choice(ALPHABET)


	def generate_string():
	    """Return a random string of 0 to MAX_STRING_LENGTH characters.

	    For every character, one entry of CHARACTER_RANGES is picked uniformly
	    and then a code point is picked uniformly inside that range (both ends
	    included).
	    """
	    output = []
	    string_length = random.randint(0, MAX_STRING_LENGTH)
	    for _ in range(string_length):
	        charrange = random.choice(CHARACTER_RANGES)
	        charvalue = random.randint(*charrange)
	        output.append(chr(charvalue))
	    return "".join(output)


	def generate_bytes():
	    """Return a bytes object holding 0 to MAX_STRING_LENGTH random bytes."""
	    output = []
	    byte_count = random.randint(0, MAX_STRING_LENGTH)
	    for _ in range(byte_count):
	        output.append(random.randint(0, 255))
	    return bytes(output)


	def generate_date():
	    """Return a random calendar date as an ISO 8601 string (YYYY-MM-DD).

	    The year lies in 1970..2070 and the day is capped at 28 so that every
	    year/month combination is a valid date.

	    The result is always a string.  Null values are injected by the caller
	    through nullable(), so no NULL_PROBABILITY roll is done here: a second
	    roll would make date_col null roughly twice as often as the other
	    nullable columns.
	    """
	    return datetime.date(
	        random.randint(1970, 2070), random.randint(1, 12), random.randint(1, 28),
	    ).isoformat()


	def generate_timestamp():
	    """Return a random UTC timestamp as an ISO 8601 string.

	    The year lies in 1970..2070 and the day is capped at 28 so that every
	    year/month combination is valid.  The string uses "T" as the date/time
	    separator and carries the UTC offset.
	    """
	    return datetime.datetime(
	        random.randint(1970, 2070),
	        random.randint(1, 12),
	        random.randint(1, 28),
	        random.randint(0, 23),
	        random.randint(0, 59),
	        random.randint(0, 59),
	        random.randint(0, 999999),
	        tzinfo=datetime.timezone.utc,
	    ).isoformat("T")


	def generate_row():
	    """Build one random record with the column names used by json_schema.

	    group_key is always populated; every other column is produced through
	    nullable() and may therefore be None.
	    """
	    return {
	        "group_key": generate_group_key(),
	        "bool_col": nullable(lambda: random.choice((True, False))),
	        "bytes_col": nullable(generate_bytes),
	        "date_col": nullable(generate_date),
	        "float_col": nullable(lambda: random.uniform(MIN_FLOAT, MAX_FLOAT)),
	        # Full signed 64-bit range; the exponent operator had been lost here.
	        "int_col": nullable(lambda: random.randint(-(2 ** 63), 2 ** 63 - 1)),
	        "string_col": nullable(generate_string),
	        "timestamp_col": nullable(generate_timestamp),
	    }


	# Avro record schema for the rows produced by generate_row().  Every column
	# is declared as a ["null", T] union so that None values are accepted; dates
	# and timestamps are carried as ISO 8601 strings.
	# NOTE(review): "sqlType", "googleStorage", "spannerPrimaryKey*" and
	# "googleFormatVersion" are not standard Avro attributes; presumably they
	# mirror the metadata found in a Cloud Spanner export -- confirm against the
	# Spanner import/export format.
	json_schema = {
	"type": "record",
	"name": "randomized_data",
	"namespace": "spannerexport",
	"fields": [
	{"name": "group_key", "type": ["null", "string"], "sqlType": "STRING(MAX)"},
	{"name": "bool_col", "type": ["null", "boolean"], "sqlType": "BOOL"},
	{"name": "bytes_col", "type": ["null", "bytes"], "sqlType": "BYTES(MAX)"},
	{"name": "date_col", "type": ["null", "string"], "sqlType": "DATE"},
	{"name": "float_col", "type": ["null", "double"], "sqlType": "FLOAT64"},
	{"name": "int_col", "type": ["null", "long"], "sqlType": "INT64"},
	{"name": "string_col", "type": ["null", "string"], "sqlType": "STRING(MAX)"},
	{"name": "timestamp_col", "type": ["null", "string"], "sqlType": "TIMESTAMP"},
	],
	"googleStorage": "CloudSpanner",
	"spannerPrimaryKey": "`int_col` ASC",
	"spannerPrimaryKey_0": "`int_col` ASC",
	"googleFormatVersion": "1.0.0",
	}
	# Parse once; the parsed schema is reused for every file written below.
	parsed_schema = fastavro.parse_schema(json_schema)

	# Write TOTAL_FILES shard files into the current working directory, each one
	# holding ROWS freshly generated records.
	for filenum in range(TOTAL_FILES):
	    rows = [generate_row() for _ in range(ROWS)]
	    with open(f"randomized_data.avro-{filenum:05}-of-{TOTAL_FILES:05}", "wb") as out:
	        fastavro.write.writer(out, parsed_schema, rows)
	{
	"tables": [{
	"name": "randomized_data",
	"manifestFile": "randomized_data-manifest.json"
	}]
	}