Skip to content

Instantly share code, notes, and snippets.

@tswast
Created July 23, 2020 21:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tswast/13210a1e47081114489e8bfbf8db9081 to your computer and use it in GitHub Desktop.
Save tswast/13210a1e47081114489e8bfbf8db9081 to your computer and use it in GitHub Desktop.
Generate Random Data for Google Cloud Spanner Import
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import random
import fastavro
import fastavro.write
# Output sizing: TOTAL_FILES Avro shards are written, each with ROWS rows.
TOTAL_FILES = 5
ROWS = 10_000

# Chance that nullable() returns None instead of calling its generator.
NULL_PROBABILITY = 0.01

# Letters that group keys are drawn from.
ALPHABET = "abcdefghijklmnopqrstuvwxyz"

# Inclusive upper bound on the length of generated strings and byte blobs.
MAX_STRING_LENGTH = 1000

# Symmetric bounds for generated floating point values.
MAX_FLOAT = 1e12
MIN_FLOAT = -MAX_FLOAT

# Inclusive (first, last) code point pairs that string characters are drawn
# from. A range is picked uniformly first, then a code point within it.
CHARACTER_RANGES = [
    (0x0020, 0x007F),  # ASCII from space upward (the inclusive end is DEL)
    (0x00A0, 0x00FF),  # Latin-1 Supplement, without the C1 controls
    (0x0100, 0x017F),  # Latin Extended-A
    (0x0180, 0x024F),  # Latin Extended-B
    (0x3040, 0x309F),  # Hiragana block
    (0x30A0, 0x30FF),  # Katakana block
]
def nullable(func):
    """Return ``func()``, or None with probability NULL_PROBABILITY.

    A single random draw decides the outcome. When the draw selects None,
    ``func`` is not called at all.
    """
    roll = random.random()
    return None if roll < NULL_PROBABILITY else func()
def generate_group_key():
    """Return a three-character key of letters drawn independently from ALPHABET."""
    return "".join(random.choice(ALPHABET) for _ in range(3))
def generate_string():
    """Return a random string of 0 to MAX_STRING_LENGTH characters.

    Each character is produced by first choosing one of CHARACTER_RANGES and
    then choosing a code point inside that (inclusive) range.
    """
    length = random.randint(0, MAX_STRING_LENGTH)
    return "".join(
        chr(random.randint(*random.choice(CHARACTER_RANGES)))
        for _ in range(length)
    )
def generate_bytes():
    """Return a ``bytes`` object of 0 to MAX_STRING_LENGTH random byte values."""
    length = random.randint(0, MAX_STRING_LENGTH)
    return bytes(random.randint(0, 255) for _ in range(length))
def generate_date():
    """Return a random calendar date as an ISO 8601 string (``YYYY-MM-DD``).

    The year lies in 1970..2070 and the day of month is capped at 28 so the
    combination is valid for every month.

    This function always returns a string. Null injection is the job of
    ``nullable()``, which ``generate_row()`` already wraps around this
    generator; doing a second NULL_PROBABILITY check here made date values
    null roughly twice as often as every other column.
    """
    year = random.randint(1970, 2070)
    month = random.randint(1, 12)
    day = random.randint(1, 28)
    return datetime.date(year, month, day).isoformat()
def generate_timestamp():
    """Return a random UTC timestamp as an ISO 8601 string.

    Every component is drawn independently: year 1970..2070, day capped at 28
    so it is valid for every month, and microsecond precision. The result
    carries an explicit UTC offset because the datetime is timezone-aware.
    """
    year = random.randint(1970, 2070)
    month = random.randint(1, 12)
    day = random.randint(1, 28)
    hour = random.randint(0, 23)
    minute = random.randint(0, 59)
    second = random.randint(0, 59)
    microsecond = random.randint(0, 999999)
    moment = datetime.datetime(
        year,
        month,
        day,
        hour,
        minute,
        second,
        microsecond,
        tzinfo=datetime.timezone.utc,
    )
    return moment.isoformat("T")
def generate_row():
    """Return one randomized record as a dict keyed by column name.

    ``group_key`` is always populated; every other column goes through
    ``nullable()`` and may therefore be None.
    """

    def random_bool():
        return random.choice((True, False))

    def random_float():
        return random.uniform(MIN_FLOAT, MAX_FLOAT)

    def random_int64():
        # Full signed 64-bit range.
        return random.randint(-(2 ** 63), 2 ** 63 - 1)

    # NOTE(review): json_schema names int_col as the primary key, yet it can
    # be None here and is not checked for uniqueness -- confirm this is intended.
    row = {"group_key": generate_group_key()}
    row["bool_col"] = nullable(random_bool)
    row["bytes_col"] = nullable(generate_bytes)
    row["date_col"] = nullable(generate_date)
    row["float_col"] = nullable(random_float)
    row["int_col"] = nullable(random_int64)
    row["string_col"] = nullable(generate_string)
    row["timestamp_col"] = nullable(generate_timestamp)
    return row
# Avro record schema for the generated files. Every field is a union with
# "null" and carries a "sqlType" attribute alongside its Avro type.
json_schema = {
    "type": "record",
    "name": "randomized_data",
    "namespace": "spannerexport",
    "fields": [
        {"name": column, "type": ["null", avro_type], "sqlType": sql_type}
        # (column name, Avro type, sqlType attribute)
        for column, avro_type, sql_type in (
            ("group_key", "string", "STRING(MAX)"),
            ("bool_col", "boolean", "BOOL"),
            ("bytes_col", "bytes", "BYTES(MAX)"),
            ("date_col", "string", "DATE"),
            ("float_col", "double", "FLOAT64"),
            ("int_col", "long", "INT64"),
            ("string_col", "string", "STRING(MAX)"),
            ("timestamp_col", "string", "TIMESTAMP"),
        )
    ],
    "googleStorage": "CloudSpanner",
    "spannerPrimaryKey": "`int_col` ASC",
    "spannerPrimaryKey_0": "`int_col` ASC",
    "googleFormatVersion": "1.0.0",
}
# Parse the schema once up front; every output file reuses the parsed form.
parsed_schema = fastavro.parse_schema(json_schema)

# Write TOTAL_FILES shards named "<prefix>-NNNNN-of-NNNNN". The rows for a
# shard are fully generated before its file is opened.
for filenum in range(TOTAL_FILES):
    shard_name = f"randomized_data.avro-{filenum:05}-of-{TOTAL_FILES:05}"
    rows = [generate_row() for _ in range(ROWS)]
    with open(shard_name, "wb") as out:
        fastavro.write.writer(out, parsed_schema, rows)
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import base64
import json
import pathlib
import hashlib
# Build the manifest for the generated Avro shards: one entry per file with
# its name and the base64-encoded MD5 digest of its contents.
files = []

# NOTE(review): shards are looked up next to this script, while the manifest
# below is written to the current working directory -- run the script from
# its own directory, or confirm the split is intended.
#
# Path.glob() yields matches in arbitrary (filesystem) order, so sort them to
# keep the manifest reproducible from run to run.
for filepath in sorted(pathlib.Path(__file__).parent.glob("randomized_data.avro-*")):
    md5 = hashlib.md5()
    with open(filepath, "rb") as f:
        md5.update(f.read())
    files.append(
        {
            "name": filepath.name,
            # Raw digest bytes, base64-encoded (not the hex digest).
            "md5": base64.b64encode(md5.digest()).decode("utf-8"),
        }
    )

with open("randomized_data-manifest.json", "w") as f:
    json.dump({"files": files}, f, indent=2)
{
"tables": [{
"name": "randomized_data",
"manifestFile": "randomized_data-manifest.json"
}]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment