Skip to content

Instantly share code, notes, and snippets.

@boxysean
Last active March 27, 2023 14:17
Show Gist options
  • Save boxysean/1a502e431beb8ed8b35f9652987ceabc to your computer and use it in GitHub Desktop.
Save boxysean/1a502e431beb8ed8b35f9652987ceabc to your computer and use it in GitHub Desktop.
Synthetic binary-tree dbt DAG builder
import collections
import json
import logging
import os
import random
import re
# SQL body written for every model that has no upstream references.
SIMPLE_MODEL = """SELECT 1"""

# Opening of a generated properties file; model entries are appended after it.
SIMPLE_YAML_HEADER = """
version: 2
models:
"""

# One entry of the ``models:`` list. The nesting matters: ``columns`` must sit
# inside the model entry and ``tests`` inside the column entry, otherwise each
# rendered entry would add top-level ``columns:``/``tests:`` keys instead of
# describing the model. ``{model_name}`` is filled in with ``str.format``.
SIMPLE_YAML_MODEL_TEMPLATE = """
  - name: {model_name}
    columns:
      - name: not_a_real_column
        tests:
          - noop_test_1
          - noop_test_2
          - noop_test_3
          - noop_test_4
          - noop_test_5
"""
# Word pool for the first component of the aliases built by ``_hash``
# (``<adjective>_<color>_<animal>``). Order matters: entries are picked by index.
ADJECTIVES = [
    "evanescent",
    "sedate",
    "cold",
    "ajar",
    "flowery",
    "giddy",
    "longing",
    "squalid",
    "madly",
    "nebulous",
    "huge",
    "defeated",
    "smooth",
    "rural",
    "brawny",
    "swift",
    "unwieldy",
    "willing",
    "apathetic",
    "dangerous",
    "abashed",
    "taboo",
    "resonant",
    "kaput",
    "tidy",
    "warlike",
    "emotional",
    "animated",
    "arrogant",
    "gruesome",
    "pure",
    "popular",
    "fluttering",
    "mature",
    "aware",
    "ambitious",
    "obviously",
    "stale",
    "pale",
    "abrupt",
    "needy",
    "alleged",
    "lovely",
    "obscene",
    "common",
    "inquisitive",
    "jumbled",
]
# Word pool for the last component of the aliases built by ``_hash``
# (``<adjective>_<color>_<animal>``). Order matters: entries are picked by index.
ANIMALS = [
    "camel",
    "gemsbok",
    "porpoise",
    "kitten",
    "deer",
    "cow",
    "leopard",
    "hamster",
    "wildcat",
    "guineapig",
    "springbok",
    "platypus",
    "coati",
    "whale",
    "argali",
    "panda",
    "cougar",
    "wolf",
    "jaguar",
    "impala",
    "rat",
    "chipmunk",
    "quagga",
    "rhinoceros",
    "hippopotamus",
    "coyote",
    "mouse",
    "hartebeest",
    "aardvark",
    "goat",
    "tiger",
    "walrus",
    "anteater",
    "toad",
    "grizzly",
    "mandrill",
    "mule",
    "mink",
    "gazelle",
    "seal",
    "canary",
    "mustang",
    "horse",
    "ferret",
    "guanaco",
    "marten",
    "buffalo",
]
# Word pool for the middle component of the aliases built by ``_hash``
# (``<adjective>_<color>_<animal>``). Order matters: entries are picked by index.
COLORS = [
    "darkorchid",
    "lightsalmon",
    "whitesmoke",
    "pink",
    "lightcyan",
    "firebrick",
    "mediumpurple",
    "lightgreen",
    "violetred",
    "orange",
    "lavenderblush",
    "greenyellow",
    "azure",
    "lightcoral",
    "snow",
    "skyblue",
    "wheat",
    "tomato",
    "darkturquoise",
    "darkviolet",
    "plum",
    "darkslateblue",
    "limegreen",
    "darkgoldenrod",
    "brown",
    "turquoise",
    "mintcream",
    "darkslategray",
    "bisque",
    "blanchedalmond",
    "antiquewhite",
    "teal",
    "linen",
    "darkkhaki",
    "paleturquoise",
    "lightblue",
    "lightyellow",
    "orangered",
    "mediumblue",
    "slateblue",
    "darksalmon",
    "yellowgreen",
    "honeydew",
    "ivory",
    "slategray",
    "black",
    "seagreen",
]
# Process-wide cache filled by ``_hash``: original name -> generated alias.
# It is also what ``_hash`` consults to keep aliases unique.
MODEL_NAME_TO_VERB_ANIMAL_PAIR = {}
class UnknownNodeTypeException(Exception):
    """Signals a node id whose resource-type prefix is not recognised.

    Attributes:
        node_name: The full node id that could not be classified.
    """

    def __init__(self, node_name):
        # Passing the id through leaves ``args`` and ``str()`` exactly as
        # ``Exception`` derives them from the constructor argument.
        super().__init__(node_name)
        self.node_name = node_name
def dfs(node_number, max_layer):
    """Write the model file for ``node_number``, then do the same for its subtree.

    The children of node ``k`` are numbered ``2k`` and ``2k + 1``. A node whose
    number has bit ``max_layer - 1`` set is written as a leaf holding only
    ``SIMPLE_MODEL``; any other node gets one ``ref`` line per child followed
    by ``SELECT 1``, and both children are then generated, left first.

    Output goes to ``models/model_<node_number>.sql``; this function does not
    create the ``models`` directory itself.
    """
    model_path = f"models/model_{node_number}.sql"
    leaf_bit = 1 << (max_layer - 1)

    if node_number & leaf_bit:
        # Deepest layer: nothing upstream to reference.
        with open(model_path, "w") as sql_file:
            sql_file.write(SIMPLE_MODEL)
        return

    children = (2 * node_number, 2 * node_number + 1)
    # Each ref is emitted on its own ``--`` line, ahead of the real statement.
    ref_lines = "".join(
        f"-- {{{{ ref('model_{child}') }}}}\n" for child in children
    )
    with open(model_path, "w") as sql_file:
        sql_file.write("\n" + ref_lines + "SELECT 1\n")

    for child in children:
        dfs(child, max_layer)
def main_tree(max_layer):
    """Generate a full binary tree of models and a properties file for them.

    Ensures the ``models`` directory exists, lets ``dfs`` write one SQL file
    per tree node starting from node 1, then writes ``models/model.yml`` with
    one ``SIMPLE_YAML_MODEL_TEMPLATE`` entry per generated model.

    Args:
        max_layer: Number of layers in the tree. ``dfs`` produces the nodes
            numbered ``1 .. 2**max_layer - 1``.
    """
    os.makedirs('models', exist_ok=True)
    dfs(1, max_layer)
    with open("models/model.yml", "w") as f:
        f.write(SIMPLE_YAML_HEADER)
        # Node numbering starts at 1, so start the range there too: there is
        # no model_0.sql, and declaring it would describe a model that does
        # not exist.
        for i in range(1, 1 << max_layer):
            f.write(SIMPLE_YAML_MODEL_TEMPLATE.format(
                model_name=f"model_{i}"
            ))
def main_simple():
    """Write ten independent models and one properties file describing them.

    Creates ``models`` when it is missing, writes ``model_0.sql`` through
    ``model_9.sql`` (each containing ``SIMPLE_MODEL``), and finally
    ``models/model.yml`` made of the YAML header plus one templated entry per
    model.
    """
    model_names = [f"model_{index}" for index in range(10)]

    if not os.path.exists('models'):
        os.mkdir('models')

    for name in model_names:
        with open(f"models/{name}.sql", "w") as sql_file:
            sql_file.write(SIMPLE_MODEL)

    with open("models/model.yml", "w") as yaml_file:
        yaml_file.write(SIMPLE_YAML_HEADER)
        for name in model_names:
            yaml_file.write(SIMPLE_YAML_MODEL_TEMPLATE.format(model_name=name))
def _hash(s):
    """Return the ``adjective_color_animal`` alias for ``s``, creating it once.

    A previously seen ``s`` gets its cached alias back from
    ``MODEL_NAME_TO_VERB_ANIMAL_PAIR``. Otherwise three indices (into
    ``ADJECTIVES``, ``COLORS`` and ``ANIMALS``) are folded over the characters
    of ``s``. If the resulting alias is already assigned to another name, the
    indices are advanced with a random step until an unused alias is found,
    so colliding names may receive different aliases from run to run.
    """
    cached = MODEL_NAME_TO_VERB_ANIMAL_PAIR.get(s)
    if cached is not None:
        return cached

    # Per-component multiplier and modulus, in (adjective, color, animal) order.
    multipliers = (7, 13, 11)
    moduli = (len(ADJECTIVES), len(COLORS), len(ANIMALS))

    def advance(state, step):
        return tuple(
            (index * multiplier + step) % modulus
            for index, multiplier, modulus in zip(state, multipliers, moduli)
        )

    state = (0, 0, 0)
    for ch in s:
        state = advance(state, ord(ch))

    while True:
        adjective_at, color_at, animal_at = state
        alias = f"{ADJECTIVES[adjective_at]}_{COLORS[color_at]}_{ANIMALS[animal_at]}"
        if alias not in MODEL_NAME_TO_VERB_ANIMAL_PAIR.values():
            MODEL_NAME_TO_VERB_ANIMAL_PAIR[s] = alias
            return alias
        # Alias already taken by another name: nudge the indices and retry.
        state = advance(state, random.randint(0, 50))
def _deconstruct_node_name(node_name):
    """Split a manifest node id into ``(alias, resource_type)``.

    The id is expected to look like ``<resource_type>.<rest>``. The part after
    the prefix is passed to ``_hash`` and the resulting alias (with any dots
    replaced by underscores) is returned together with the resource type.

    Args:
        node_name: Node id, e.g. ``"model.my_project.my_model"``.

    Returns:
        A ``(alias, resource_type)`` tuple where ``resource_type`` is one of
        ``'model'``, ``'test'``, ``'source'``, ``'seed'``, ``'analysis'`` or
        ``'snapshot'``.

    Raises:
        UnknownNodeTypeException: If ``node_name`` starts with none of the
            recognised prefixes.
    """
    for resource_type in ('model', 'test', 'source', 'seed', 'analysis', 'snapshot'):
        prefix = resource_type + '.'
        if node_name.startswith(prefix):
            # Strip by the prefix's actual length. Hand-written offsets are
            # easy to get one short (as happened for seed/analysis/snapshot),
            # which leaves the separating dot on the name being hashed.
            alias = _hash(node_name[len(prefix):])
            return alias.replace('.', '_'), resource_type
    raise UnknownNodeTypeException(node_name)
def _model_parent_names(parent_nodes):
    """Return the aliases of those parents that are models, in input order.

    Every parent is passed through ``_deconstruct_node_name``; parents with an
    unrecognised resource type are logged and skipped rather than aborting the
    whole run.
    """
    names = []
    for parent_node in parent_nodes:
        try:
            parent_name, parent_type = _deconstruct_node_name(parent_node)
        except UnknownNodeTypeException as e:
            logging.warning("Skipping unknown node type from node: %s", e.node_name)
            continue
        if parent_type == 'model':
            names.append(parent_name)
    return names


def main_from_manifest_file(file_path):
    """Recreate the shape of a project's DAG from its manifest, under aliases.

    Reads the JSON manifest at ``file_path`` and writes into ``models/``:

    * ``<alias>.sql`` for every model node: one commented ``ref`` line per
      model it depends on, followed by ``SELECT 1``;
    * ``<alias>.yml`` for every model that has at least one test filed under
      it: one placeholder column per test, carrying one ``noop_test`` entry
      per model that test depends on.

    Aliases come from ``_deconstruct_node_name``. Nodes whose resource type is
    not recognised, whether as a child or as a parent, are logged and skipped.

    Args:
        file_path: Path of the manifest JSON file; it must contain a top-level
            ``"nodes"`` mapping.
    """
    os.makedirs('models', exist_ok=True)

    with open(file_path) as f:
        manifest = json.load(f)

    # A node without dependency information is treated as having no parents.
    edges = {
        node_name: (node_values.get("depends_on") or {}).get("nodes") or []
        for node_name, node_values in manifest["nodes"].items()
    }

    # owning model alias -> {(test alias, frozenset of model-dependency aliases)}
    tests = collections.defaultdict(set)
    for child_node, parent_nodes in edges.items():
        try:
            child_node_name, child_node_type = _deconstruct_node_name(child_node)
        except UnknownNodeTypeException as e:
            logging.warning("Skipping unknown node type from node: %s", e.node_name)
            continue

        if child_node_type == 'model':
            model_parents = _model_parent_names(parent_nodes)
            with open(f"models/{child_node_name}.sql", "w") as f:
                for parent_name in model_parents:
                    f.write(f'-- {{{{ ref("{parent_name}") }}}}\n')
                f.write('SELECT 1\n')
        elif child_node_type == 'test':
            model_parents = _model_parent_names(parent_nodes)
            if model_parents:
                # File the test under the last *model* it depends on. Using
                # the last parent of any type could key the properties file
                # on, say, a source alias for which no model file exists.
                tests[model_parents[-1]].add(
                    (child_node_name, frozenset(model_parents))
                )

    for model_name, test_details in tests.items():
        with open(f"models/{model_name}.yml", "w") as f:
            # Indentation is significant: columns belong to the model entry,
            # tests to the column entry.
            f.write(
                "\n"
                "version: 2\n"
                "models:\n"
                f'  - name: "{model_name}"\n'
                "    columns:\n"
            )
            # Test aliases are unique, so ordering by alias is a total order.
            for test_node_name, model_dependencies in sorted(
                test_details, key=lambda details: details[0]
            ):
                f.write(
                    "\n"
                    f"      - name: test_salt_{test_node_name}\n"
                    "        tests:\n"
                )
                for i, model_dependency in enumerate(sorted(model_dependencies)):
                    f.write(f'          - noop_test: {{model_{i}: "{model_dependency}"}}\n')
if __name__ == '__main__':
    # Local import: ``sys`` is only needed when run as a script.
    import sys

    # To build a synthetic binary tree instead, call e.g. main_tree(10).
    # An optional first command-line argument selects the manifest to read;
    # without it the script reads "manifest.json" from the working directory.
    main_from_manifest_file(sys.argv[1] if len(sys.argv) > 1 else "manifest.json")
{# No-op generic test: the query can never return a row because WHERE 1=0 is always false. #}
{# Neither `model` nor `column_name` is used in the query. #}
{# NOTE(review): dbt is understood to treat returned rows as failures, so this should always pass; confirm. #}
{% test noop_test_1(model, column_name) %}
SELECT 1 WHERE 1=0
{% endtest %}
@boxysean
Copy link
Author

@patkearns10 also built these routines here: https://github.com/patkearns10/mock-dag :-)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment