HarlanH/dbt_doc.py

## dbt_doc.py
#!/usr/bin/env python

# This Q&D script scans through SQL files in the models directory and outputs a Markdown document
# with per-model comments.

import os
from os import path
import re
import warnings
import time

dirs = ['models/staging', 'models/analytics', 'models/reporting'] # in order

for dir in dirs:
    sql_files = [file for file in os.listdir(dir) if file.lower().endswith('.sql')]
    for sql_file in sql_files:
         # iterate over the file
         # save all rows that look like a block comment
         # save all rows that look like a column comment
         # save all table reference rows
         # these should never happen overlapping, which makes this easy...

         with open(path.join(dir, sql_file), 'r') as infile:
            # header
            model_name = sql_file[:len(sql_file)-4]
            print("# %s\n\n" % model_name)

            block_comments = []
            column_comments = []
            table_refs = []

            for line in infile:
                line = line.strip()
                #print(line)
                # block comments
                if line.startswith('--@'):
                    block_comments.append(line[3:].strip())

                # column comments
                # <from_col>(::<cast>) (AS <to_col>)(,) -- @col <comment>
                # using an iterative regex strategy because I'm not smart enough to write it
                # all at once...
                col = re.search(r'[\s,]*(.+?)[\s,]*--\s*@col\s*(.*)', line)
                if col:
                    sql = col.group(1)
                    comment = col.group(2)
                    rename = re.search(r'^(.*?)\s+(?:AS|as)?\s*(\w+)$', sql)
                    if rename:
                        src = rename.group(1)
                        target = rename.group(2)
                    else:
                        src = sql
                        target = None
                    cast = re.search(r'(.*)::(.*)', src)
                    if cast:
                        src = cast.group(1)
                        cast = cast.group(2)
                    if len(src) >= 30:
                        src = "_complex_"
                    column_comments.append({"from_col": src,
                        "cast": cast,
                        "to_col": target,
                        "comment": comment})

                # table refs
                res = re.search(r"{{\s+ref\('(\w+)'\)\s+}}", line)
                if res:
                    table_refs.append(res.group(1))

            for bc in block_comments:
                print(bc)

            print("\nColumns:\n")
            for cc in column_comments:
                if cc['to_col']:
                    col_name = cc['to_col']
                    col_from = "from `%s`" % cc['from_col']
                else:
                    col_name = cc['from_col']
                    col_from = None
                paran_stuff = []
                if cc['cast']:
                    paran_stuff.append("`%s`" % cc['cast'])
                if col_from:
                    paran_stuff.append(col_from)
                print("* `%s` (%s) -- %s" % (col_name,
                                           ', '.join(paran_stuff),
                                            cc['comment']))

            print("\nTable References:\n")
            for tr in table_refs:
                print("* [%s](#%s)" % (tr, tr))
            print("\n")

print("\n\nGenerated %s" % time.ctime())
	#!/usr/bin/env python

	# This Q&D script scans through SQL files in the models directory and outputs a Markdown document
	# with per-model comments.

	import os
	from os import path
	import re
	import warnings
	import time

	dirs = ['models/staging', 'models/analytics', 'models/reporting'] # in order

	for dir in dirs:
	sql_files = [file for file in os.listdir(dir) if file.lower().endswith('.sql')]
	for sql_file in sql_files:
	# iterate over the file
	# save all rows that look like a block comment
	# save all rows that look like a column comment
	# save all table reference rows
	# these should never happen overlapping, which makes this easy...

	with open(path.join(dir, sql_file), 'r') as infile:
	# header
	model_name = sql_file[:len(sql_file)-4]
	print("# %s\n\n" % model_name)

	block_comments = []
	column_comments = []
	table_refs = []

	for line in infile:
	line = line.strip()
	#print(line)
	# block comments
	if line.startswith('--@'):
	block_comments.append(line[3:].strip())

	# column comments
	# <from_col>(::<cast>) (AS <to_col>)(,) -- @col <comment>
	# using an iterative regex strategy because I'm not smart enough to write it
	# all at once...
	col = re.search(r'[\s,](.+?)[\s,]--\s@col\s(.*)', line)
	if col:
	sql = col.group(1)
	comment = col.group(2)
	rename = re.search(r'^(.?)\s+(?:AS\|as)?\s(\w+)$', sql)
	if rename:
	src = rename.group(1)
	target = rename.group(2)
	else:
	src = sql
	target = None
	cast = re.search(r'(.)::(.)', src)
	if cast:
	src = cast.group(1)
	cast = cast.group(2)
	if len(src) >= 30:
	src = "_complex_"
	column_comments.append({"from_col": src,
	"cast": cast,
	"to_col": target,
	"comment": comment})

	# table refs
	res = re.search(r"{{\s+ref\('(\w+)'\)\s+}}", line)
	if res:
	table_refs.append(res.group(1))

	for bc in block_comments:
	print(bc)

	print("\nColumns:\n")
	for cc in column_comments:
	if cc['to_col']:
	col_name = cc['to_col']
	col_from = "from `%s`" % cc['from_col']
	else:
	col_name = cc['from_col']
	col_from = None
	paran_stuff = []
	if cc['cast']:
	paran_stuff.append("`%s`" % cc['cast'])
	if col_from:
	paran_stuff.append(col_from)
	print("* `%s` (%s) -- %s" % (col_name,
	', '.join(paran_stuff),
	cc['comment']))

	print("\nTable References:\n")
	for tr in table_refs:
	print("* [%s](#%s)" % (tr, tr))
	print("\n")

	print("\n\nGenerated %s" % time.ctime())