This gist was created to document work described in dbt-labs/dbt-core#6073
Last active
March 27, 2023 14:17
-
-
Save boxysean/1a502e431beb8ed8b35f9652987ceabc to your computer and use it in GitHub Desktop.
Synthetic binary-tree dbt DAG builder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections | |
import json | |
import logging | |
import os | |
import random | |
import re | |
# SQL body written for every model that has no upstream dependencies.
SIMPLE_MODEL = """SELECT 1"""

# Opening of the generated properties file; one rendered
# SIMPLE_YAML_MODEL_TEMPLATE per model is appended after it.
SIMPLE_YAML_HEADER = """
version: 2
models:
"""

# One entry of the `models:` list. The indentation is significant: `columns`
# must be nested under the model entry and `tests` under the column entry,
# otherwise they are parsed as unrelated top-level keys.
SIMPLE_YAML_MODEL_TEMPLATE = """
  - name: {model_name}
    columns:
      - name: not_a_real_column
        tests:
          - noop_test_1
          - noop_test_2
          - noop_test_3
          - noop_test_4
          - noop_test_5
"""
# First component of the generated "<adjective>_<color>_<animal>" aliases.
# The order matters: _hash() indexes into this list (47 entries).
ADJECTIVES = [
    "evanescent", "sedate", "cold", "ajar", "flowery", "giddy",
    "longing", "squalid", "madly", "nebulous", "huge", "defeated",
    "smooth", "rural", "brawny", "swift", "unwieldy", "willing",
    "apathetic", "dangerous", "abashed", "taboo", "resonant", "kaput",
    "tidy", "warlike", "emotional", "animated", "arrogant", "gruesome",
    "pure", "popular", "fluttering", "mature", "aware", "ambitious",
    "obviously", "stale", "pale", "abrupt", "needy", "alleged",
    "lovely", "obscene", "common", "inquisitive", "jumbled",
]
# Last component of the generated "<adjective>_<color>_<animal>" aliases.
# The order matters: _hash() indexes into this list (47 entries).
ANIMALS = [
    "camel", "gemsbok", "porpoise", "kitten", "deer", "cow",
    "leopard", "hamster", "wildcat", "guineapig", "springbok", "platypus",
    "coati", "whale", "argali", "panda", "cougar", "wolf",
    "jaguar", "impala", "rat", "chipmunk", "quagga", "rhinoceros",
    "hippopotamus", "coyote", "mouse", "hartebeest", "aardvark", "goat",
    "tiger", "walrus", "anteater", "toad", "grizzly", "mandrill",
    "mule", "mink", "gazelle", "seal", "canary", "mustang",
    "horse", "ferret", "guanaco", "marten", "buffalo",
]
# Middle component of the generated "<adjective>_<color>_<animal>" aliases.
# The order matters: _hash() indexes into this list (47 entries).
COLORS = [
    "darkorchid", "lightsalmon", "whitesmoke", "pink", "lightcyan",
    "firebrick", "mediumpurple", "lightgreen", "violetred", "orange",
    "lavenderblush", "greenyellow", "azure", "lightcoral", "snow",
    "skyblue", "wheat", "tomato", "darkturquoise", "darkviolet",
    "plum", "darkslateblue", "limegreen", "darkgoldenrod", "brown",
    "turquoise", "mintcream", "darkslategray", "bisque", "blanchedalmond",
    "antiquewhite", "teal", "linen", "darkkhaki", "paleturquoise",
    "lightblue", "lightyellow", "orangered", "mediumblue", "slateblue",
    "darksalmon", "yellowgreen", "honeydew", "ivory", "slategray",
    "black", "seagreen",
]
# Cache mapping an original node name to the alias generated for it.
# _hash() both reads and fills it, and uses its values to keep aliases unique.
MODEL_NAME_TO_VERB_ANIMAL_PAIR = {}
class UnknownNodeTypeException(Exception):
    """Raised for a node name whose type prefix is not recognised.

    Attributes:
        node_name: the full node name that could not be classified.
    """

    def __init__(self, node_name):
        super().__init__(node_name)
        self.node_name = node_name
def dfs(node_number, max_layer):
    """Write models/model_<node_number>.sql and, recursively, its subtree.

    The children of node ``k`` are nodes ``2k`` and ``2k + 1``. A node whose
    number has bit ``max_layer - 1`` set is written as a leaf (plain
    SIMPLE_MODEL); every other node references both of its children and then
    recurses into them.

    Args:
        node_number: number of the node to write (the root is 1).
        max_layer: number of layers in the tree.
    """
    model_path = f"models/model_{node_number}.sql"
    leaf_bit = 1 << (max_layer - 1)

    if node_number & leaf_bit:
        # Deepest layer: no upstream references.
        with open(model_path, "w") as model_file:
            model_file.write(SIMPLE_MODEL)
        return

    children = (2 * node_number, 2 * node_number + 1)
    with open(model_path, "w") as model_file:
        # The refs sit in SQL comments; the quadruple braces render as
        # literal "{{" / "}}" in the written file.
        model_file.write(f"""
-- {{{{ ref('model_{children[0]}') }}}}
-- {{{{ ref('model_{children[1]}') }}}}
SELECT 1
""")

    for child_number in children:
        dfs(child_number, max_layer)
def main_tree(max_layer):
    """Generate a binary-tree DAG of models plus a matching models/model.yml.

    Creates the ``models`` directory if needed, writes one SQL file per tree
    node via dfs(), then writes a single properties file with one entry per
    generated model.

    Args:
        max_layer: number of layers in the tree; must be at least 1.

    Raises:
        ValueError: if ``max_layer`` is smaller than 1.
    """
    if max_layer < 1:
        # Fail before touching the filesystem; dfs() would otherwise die on
        # a negative shift count after the directory was already created.
        raise ValueError(f"max_layer must be at least 1, got {max_layer}")

    os.makedirs('models', exist_ok=True)
    dfs(1, max_layer)

    with open("models/model.yml", "w") as yml_file:
        yml_file.write(SIMPLE_YAML_HEADER)
        # dfs() numbers nodes 1 .. 2**max_layer - 1, so start at 1: there is
        # no model_0 file to describe.
        for node_number in range(1, 1 << max_layer):
            yml_file.write(SIMPLE_YAML_MODEL_TEMPLATE.format(
                model_name=f"model_{node_number}"
            ))
def main_simple():
    """Generate ten independent models (model_0 .. model_9) and models/model.yml.

    Every model file contains SIMPLE_MODEL; the properties file is the
    SIMPLE_YAML_HEADER followed by one rendered template per model.
    """
    if not os.path.exists('models'):
        os.mkdir('models')

    model_names = [f"model_{i}" for i in range(10)]

    for model_name in model_names:
        with open(f"models/{model_name}.sql", "w") as sql_file:
            sql_file.write(SIMPLE_MODEL)

    with open("models/model.yml", "w") as yml_file:
        yml_file.write(SIMPLE_YAML_HEADER)
        yml_file.writelines(
            SIMPLE_YAML_MODEL_TEMPLATE.format(model_name=model_name)
            for model_name in model_names
        )
def _hash(s):
    """Return the "<adjective>_<color>_<animal>" alias for the string `s`.

    The alias is derived from the characters of `s` and cached in
    MODEL_NAME_TO_VERB_ANIMAL_PAIR, so repeated calls with the same `s`
    return the same alias. If the derived alias is already taken by another
    name, the indices are advanced by random steps until a free one is found.

    Args:
        s: the name to anonymise.

    Returns:
        The alias string assigned to `s`.
    """
    try:
        return MODEL_NAME_TO_VERB_ANIMAL_PAIR[s]
    except KeyError:
        pass

    def advance(state, step):
        # Each index uses its own multiplier so the three components diverge.
        adjective, color, animal = state
        return (
            (adjective * 7 + step) % len(ADJECTIVES),
            (color * 13 + step) % len(COLORS),
            (animal * 11 + step) % len(ANIMALS),
        )

    state = (0, 0, 0)
    for ch in s:
        state = advance(state, ord(ch))

    taken_aliases = MODEL_NAME_TO_VERB_ANIMAL_PAIR.values()
    while True:
        adjective, color, animal = state
        candidate = f"{ADJECTIVES[adjective]}_{COLORS[color]}_{ANIMALS[animal]}"
        if candidate not in taken_aliases:
            MODEL_NAME_TO_VERB_ANIMAL_PAIR[s] = candidate
            return candidate
        # Collision with an alias assigned to another name: perturb and retry.
        state = advance(state, random.randint(0, 50))
def _deconstruct_node_name(node_name):
    """Split a node name into its anonymised name and its node type.

    The node type is the recognised prefix before the first dot ("model",
    "test", "source", "seed", "analysis" or "snapshot"); everything after
    that prefix and its dot is passed through _hash().

    Args:
        node_name: full node name, e.g. "model.<rest>".

    Returns:
        A ``(hashed_name, node_type)`` tuple.

    Raises:
        UnknownNodeTypeException: if `node_name` has none of the known prefixes.
    """
    for node_type in ('model', 'test', 'source', 'seed', 'analysis', 'snapshot'):
        prefix = f"{node_type}."
        if node_name.startswith(prefix):
            # Slice by the real prefix length so the separating dot is never
            # left at the front of the hashed part.
            remainder = node_name[len(prefix):]
            return _hash(remainder).replace('.', '_'), node_type
    raise UnknownNodeTypeException(node_name)
def _model_parent_names(parent_nodes):
    """Return the anonymised names of the parents that are models, in input order."""
    model_names = []
    for parent_node in parent_nodes:
        try:
            parent_node_name, parent_node_type = _deconstruct_node_name(parent_node)
        except UnknownNodeTypeException as e:
            # Same policy as for child nodes: warn and ignore instead of aborting.
            logging.warning("Skipping unknown node type from node: %s", e.node_name)
            continue
        if parent_node_type == 'model':
            model_names.append(parent_node_name)
    return model_names


def main_from_manifest_file(file_path):
    """Rebuild an anonymised copy of a DAG from a manifest JSON file.

    Reads ``manifest["nodes"][<name>]["depends_on"]["nodes"]`` for every node
    and writes into the ``models`` directory:

    * one ``<alias>.sql`` per model node, holding a commented ref to each
      model parent followed by ``SELECT 1``;
    * one ``<alias>.yml`` per model that has tests attached, listing one
      placeholder column per test with a ``noop_test`` entry per model the
      test depends on.

    Nodes (children or parents) of an unrecognised type are skipped with a
    warning. Tests with no model parent are ignored.

    Args:
        file_path: path of the manifest JSON file to read.
    """
    os.makedirs('models', exist_ok=True)

    with open(file_path) as manifest_file:
        manifest = json.load(manifest_file)

    edges = {
        node_name: node_values["depends_on"]["nodes"]
        for node_name, node_values in manifest["nodes"].items()
    }

    # model alias -> {(test alias, frozenset of model aliases the test uses)}
    tests = collections.defaultdict(set)

    for child_node, parent_nodes in edges.items():
        try:
            child_node_name, child_node_type = _deconstruct_node_name(child_node)
        except UnknownNodeTypeException as e:
            logging.warning("Skipping unknown node type from node: %s", e.node_name)
            continue

        if child_node_type == 'model':
            model_parents = _model_parent_names(parent_nodes)
            with open(f"models/{child_node_name}.sql", "w") as sql_file:
                for parent_node_name in model_parents:
                    sql_file.write(f'-- {{{{ ref("{parent_node_name}") }}}}\n')
                sql_file.write('SELECT 1\n')
        elif child_node_type == 'test':
            test_dependencies = _model_parent_names(parent_nodes)
            if test_dependencies:
                # File the test under its last *model* parent, so the key is
                # always a model even when the final parent is e.g. a source.
                tests[test_dependencies[-1]].add(
                    (child_node_name, frozenset(test_dependencies))
                )

    for model_name, test_details in tests.items():
        with open(f"models/{model_name}.yml", "w") as yml_file:
            yml_file.write(f"""
version: 2
models:
  - name: "{model_name}"
    columns:
""")
            # Sorted so the generated file does not depend on set ordering.
            for test_node_name, model_dependencies in sorted(test_details):
                yml_file.write(f"""
      - name: test_salt_{test_node_name}
        tests:
""")
                for i, model_dependency in enumerate(sorted(model_dependencies)):
                    yml_file.write(
                        f'          - noop_test: {{model_{i}: "{model_dependency}"}}\n'
                    )
if __name__ == "__main__":
    # Default entry point: mirror the DAG described by ./manifest.json.
    # To build a synthetic binary tree instead, call main_tree(10) here.
    main_from_manifest_file("manifest.json")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{#
  Generic test whose query never returns a row: the WHERE clause is always
  false. dbt treats a test query that returns zero rows as passing.
  NOTE(review): the `model` and `column_name` arguments are accepted but unused.
#}
{% test noop_test_1(model, column_name) %}
SELECT 1 WHERE 1=0
{% endtest %}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@patkearns10 also built these routines here: https://github.com/patkearns10/mock-dag :-)