kylebarron/stata_desc.py

## stata_desc.py
#! /usr/bin/env python3
"""
---------------------------------------------------------------------
Program: stata_desc.py
Author:  Kyle Barron <barronk@mit.edu>
Created: 5/11/2018, 2:01:13 PM
Purpose: Generate markdown data codebooks for documentation

The command line input should be the path to the `.dta` file.

This program writes a Markdown file next to the `.dta` file, using the same
name with the extension `.md`. You can then use
[Pandoc](https://pandoc.org) to convert this Markdown file to PDF, docx, or
HTML.

The dependencies are Python 3.6 or later and pandas.
"""

import pandas as pd
from textwrap import dedent
from pathlib import Path
from sys import argv

# Sample dataset name kept at module level. `main` binds its own local
# `infile` from the command line, so this value is not read by it.
infile = "auto.dta"
def main():
    """Command-line entry point: write a Markdown codebook for a `.dta` file.

    The path of the Stata file is taken from the first command-line argument.
    The codebook produced by `generate_codebook` is written to a file with the
    same name as the input but with the extension `.md`.

    Raises:
        ValueError: If no argument was given, or if the given path does not
            end with `.dta`.
    """
    try:
        infile = argv[1]
    except IndexError:
        msg = 'Provide file path as command line argument'
        # `from None` keeps the unrelated IndexError out of the traceback.
        raise ValueError(msg) from None

    if not infile.endswith('.dta'):
        msg = 'Path must end with ".dta"'
        raise ValueError(msg)

    path = Path(infile)
    ncol, nobs, codebook = generate_codebook(str(path))

    # The path is known to end with ".dta", so `path.name` is exactly the
    # file name that the heading should show.
    header = (
        f"# `{path.name}`\n\n"
        f"This file has {ncol} columns and {nobs} observations.\n\n"
        "## Codebook\n\n"
    )

    # Write UTF-8 explicitly: variable labels may contain non-ASCII text and
    # the platform default encoding is not guaranteed to represent it.
    path.with_suffix('.md').write_text(header + codebook, encoding='utf-8')

def generate_codebook(dta_path):
    """Generate Markdown describing every variable of a Stata `.dta` file.

    Each variable gets a level-3 heading with its name, followed by a bullet
    list with its label, storage type, and display format. The file is opened
    with `pandas.read_stata(..., iterator=True)` and the data itself is never
    loaded into a DataFrame.

    Args:
        dta_path: Path of the `.dta` file, passed on to `pandas.read_stata`.

    Returns:
        A tuple `(ncol, nobs, text)`: the number of variables, the number of
        observations, and the Markdown text for all variables.
    """
    # Use the reader as a context manager so the file is closed again.
    with pd.read_stata(dta_path, iterator=True) as reader:
        # Call variable_labels() first so that the header has certainly been
        # parsed before the header fields are looked up.
        labels = reader.variable_labels()
        ncol = _reader_field(reader, 'nvar')
        nobs = _reader_field(reader, 'nobs')
        formats = _reader_field(reader, 'fmtlist')
        type_codes = _reader_field(reader, 'typlist')

    sections = [
        _variable_section(name, label, code, fmt)
        for (name, label), fmt, code in zip(labels.items(), formats, type_codes)
    ]
    return (ncol, nobs, ''.join(sections))


def _reader_field(reader, name):
    """Return the header field `name` of a pandas Stata reader.

    Some pandas releases expose these fields publicly (`nvar`), others only
    under an underscore-prefixed name (`_nvar`), so both spellings are tried.
    """
    try:
        return getattr(reader, name)
    except AttributeError:
        return getattr(reader, f'_{name}')


def _stata_type_name(code):
    """Translate a pandas type code into the Stata storage type name."""
    if isinstance(code, int):
        # An integer code is the width of a fixed-length string variable.
        return f"str{code}"
    names = {
        'b': 'byte',
        'h': 'int',
        'l': 'long',
        'f': 'float',
        'd': 'double',
        'Q': 'strL',
    }
    # Fall back to the raw code rather than reusing a previous variable's type.
    return names.get(code, str(code))


def _variable_section(name, label, code, fmt):
    """Build the Markdown section for a single variable."""
    return (
        f"### `{name}`\n\n"
        f"- Label: {label}\n"
        f"- Type: `{_stata_type_name(code)}`\n"
        f"- Format: `{fmt}`\n\n"
    )

# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
	#! /usr/bin/env python3
	"""
	---------------------------------------------------------------------
	Program: stata_desc.py
	Author: Kyle Barron <barronk@mit.edu>
	Created: 5/11/2018, 2:01:13 PM
	Purpose: Generate markdown data codebooks for documentation

	The command line input should be the path to the `.dta` file.

	This program outputs a Markdown file with extension `.md` with the same name as
	the `.dta` file in its same directory. You can then use
	[Pandoc](https://pandoc.org) to convert this Markdown file to PDF, docx, or
	HTML.

	The dependencies are Python 3.6 and pandas.
	"""

	import pandas as pd
	from textwrap import dedent
	from pathlib import Path
	from sys import argv

	# Sample dataset name; `main` assigns its own local `infile` from argv.
	infile = 'auto.dta'
	def main():
	    """Command-line entry point: write a Markdown codebook for a `.dta` file.

	    The path of the Stata file is taken from the first command-line argument.
	    The codebook produced by `generate_codebook` is written to a file with the
	    same name as the input but with the extension `.md`.

	    Raises:
	        ValueError: If no argument was given, or if the given path does not
	            end with `.dta`.
	    """
	    try:
	        infile = argv[1]
	    except IndexError:
	        msg = 'Provide file path as command line argument'
	        # `from None` keeps the unrelated IndexError out of the traceback.
	        raise ValueError(msg) from None

	    if not infile.endswith('.dta'):
	        msg = 'Path must end with ".dta"'
	        raise ValueError(msg)

	    path = Path(infile)
	    ncol, nobs, codebook = generate_codebook(str(path))

	    # The path is known to end with ".dta", so `path.name` is exactly the
	    # file name that the heading should show.
	    header = (
	        f"# `{path.name}`\n\n"
	        f"This file has {ncol} columns and {nobs} observations.\n\n"
	        "## Codebook\n\n"
	    )

	    # Write UTF-8 explicitly: variable labels may contain non-ASCII text and
	    # the platform default encoding is not guaranteed to represent it.
	    path.with_suffix('.md').write_text(header + codebook, encoding='utf-8')

	def generate_codebook(dta_path):
	    """Generate Markdown describing every variable of a Stata `.dta` file.

	    Each variable gets a level-3 heading with its name, followed by a bullet
	    list with its label, storage type, and display format. The file is opened
	    with `pandas.read_stata(..., iterator=True)` and the data itself is never
	    loaded into a DataFrame.

	    Args:
	        dta_path: Path of the `.dta` file, passed on to `pandas.read_stata`.

	    Returns:
	        A tuple `(ncol, nobs, text)`: the number of variables, the number of
	        observations, and the Markdown text for all variables.
	    """
	    # Use the reader as a context manager so the file is closed again.
	    with pd.read_stata(dta_path, iterator=True) as reader:
	        # Call variable_labels() first so that the header has certainly been
	        # parsed before the header fields are looked up.
	        labels = reader.variable_labels()
	        ncol = _reader_field(reader, 'nvar')
	        nobs = _reader_field(reader, 'nobs')
	        formats = _reader_field(reader, 'fmtlist')
	        type_codes = _reader_field(reader, 'typlist')

	    sections = [
	        _variable_section(name, label, code, fmt)
	        for (name, label), fmt, code in zip(labels.items(), formats, type_codes)
	    ]
	    return (ncol, nobs, ''.join(sections))


	def _reader_field(reader, name):
	    """Return the header field `name` of a pandas Stata reader.

	    Some pandas releases expose these fields publicly (`nvar`), others only
	    under an underscore-prefixed name (`_nvar`), so both spellings are tried.
	    """
	    try:
	        return getattr(reader, name)
	    except AttributeError:
	        return getattr(reader, f'_{name}')


	def _stata_type_name(code):
	    """Translate a pandas type code into the Stata storage type name."""
	    if isinstance(code, int):
	        # An integer code is the width of a fixed-length string variable.
	        return f"str{code}"
	    names = {
	        'b': 'byte',
	        'h': 'int',
	        'l': 'long',
	        'f': 'float',
	        'd': 'double',
	        'Q': 'strL',
	    }
	    # Fall back to the raw code rather than reusing a previous variable's type.
	    return names.get(code, str(code))


	def _variable_section(name, label, code, fmt):
	    """Build the Markdown section for a single variable."""
	    return (
	        f"### `{name}`\n\n"
	        f"- Label: {label}\n"
	        f"- Type: `{_stata_type_name(code)}`\n"
	        f"- Format: `{fmt}`\n\n"
	    )

	# Run the command-line entry point only when executed as a script.
	if __name__ == '__main__':
	    main()