Skip to content

Instantly share code, notes, and snippets.

@HarlanH
Created August 3, 2017 20:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save HarlanH/277006989774372515c7130e63809315 to your computer and use it in GitHub Desktop.
Save HarlanH/277006989774372515c7130e63809315 to your computer and use it in GitHub Desktop.
extracts structured comments/documentation from a dbt directory tree
#!/usr/bin/env python
# This Q&D script scans through SQL files in the models directory and outputs a Markdown document
# with per-model comments.
import os
from os import path
import re
import warnings
import time
dirs = ['models/staging', 'models/analytics', 'models/reporting'] # in order
for dir in dirs:
sql_files = [file for file in os.listdir(dir) if file.lower().endswith('.sql')]
for sql_file in sql_files:
# iterate over the file
# save all rows that look like a block comment
# save all rows that look like a column comment
# save all table reference rows
# these should never happen overlapping, which makes this easy...
with open(path.join(dir, sql_file), 'r') as infile:
# header
model_name = sql_file[:len(sql_file)-4]
print("# %s\n\n" % model_name)
block_comments = []
column_comments = []
table_refs = []
for line in infile:
line = line.strip()
#print(line)
# block comments
if line.startswith('--@'):
block_comments.append(line[3:].strip())
# column comments
# <from_col>(::<cast>) (AS <to_col>)(,) -- @col <comment>
# using an iterative regex strategy because I'm not smart enough to write it
# all at once...
col = re.search(r'[\s,]*(.+?)[\s,]*--\s*@col\s*(.*)', line)
if col:
sql = col.group(1)
comment = col.group(2)
rename = re.search(r'^(.*?)\s+(?:AS|as)?\s*(\w+)$', sql)
if rename:
src = rename.group(1)
target = rename.group(2)
else:
src = sql
target = None
cast = re.search(r'(.*)::(.*)', src)
if cast:
src = cast.group(1)
cast = cast.group(2)
if len(src) >= 30:
src = "_complex_"
column_comments.append({"from_col": src,
"cast": cast,
"to_col": target,
"comment": comment})
# table refs
res = re.search(r"{{\s+ref\('(\w+)'\)\s+}}", line)
if res:
table_refs.append(res.group(1))
for bc in block_comments:
print(bc)
print("\nColumns:\n")
for cc in column_comments:
if cc['to_col']:
col_name = cc['to_col']
col_from = "from `%s`" % cc['from_col']
else:
col_name = cc['from_col']
col_from = None
paran_stuff = []
if cc['cast']:
paran_stuff.append("`%s`" % cc['cast'])
if col_from:
paran_stuff.append(col_from)
print("* `%s` (%s) -- %s" % (col_name,
', '.join(paran_stuff),
cc['comment']))
print("\nTable References:\n")
for tr in table_refs:
print("* [%s](#%s)" % (tr, tr))
print("\n")
print("\n\nGenerated %s" % time.ctime())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment