Skip to content

Instantly share code, notes, and snippets.

@kylebarron
Created May 11, 2018 18:16
Show Gist options
  • Save kylebarron/7095c3619d4b35f5f11f1a4566b54964 to your computer and use it in GitHub Desktop.
Save kylebarron/7095c3619d4b35f5f11f1a4566b54964 to your computer and use it in GitHub Desktop.
Create Markdown file with description of Stata dta file
#! /usr/bin/env python3
"""
---------------------------------------------------------------------
Program: stata_desc.py
Author: Kyle Barron <barronk@mit.edu>
Created: 5/11/2018, 2:01:13 PM
Purpose: Generate markdown data codebooks for documentation
The command line input should be the path to the `.dta` file.
This program writes a Markdown file with the same name as the `.dta` file, but
with the extension `.md`, into the same directory. You can then use
[Pandoc](https://pandoc.org) to convert this Markdown file to PDF, docx, or
HTML.
The dependencies are Python 3.6 or later (f-strings are used) and pandas.
"""
import pandas as pd
from textwrap import dedent
from pathlib import Path
from sys import argv
# Sample input name, apparently left over from interactive use: main() binds
# its own local `infile` from argv, so no function shown here reads this value.
infile = 'auto.dta'
def main():
    """Write a Markdown codebook next to the `.dta` file named on the command line.

    The first command line argument is taken as the path to the Stata file.
    The codebook is written to the same path with the suffix replaced by
    `.md`, encoded as UTF-8 so that non-ASCII variable labels are written the
    same way regardless of the platform's default encoding.

    Raises:
        ValueError: If no path was given, or the path does not end in `.dta`.
    """
    try:
        infile = argv[1]
    except IndexError:
        msg = 'Provide file path as command line argument'
        # The IndexError carries no extra information for the user.
        raise ValueError(msg) from None
    if not infile.endswith('.dta'):
        msg = 'Path must end with ".dta"'
        raise ValueError(msg)

    path = Path(infile)
    # File name without the trailing '.dta' (guaranteed by the check above).
    stub = path.name[:-len('.dta')]
    ncol, nobs, codebook = generate_codebook(str(path))

    # Blank lines separate the blocks: Pandoc only recognises a heading that
    # is preceded by a blank line.
    header = (
        f'# `{stub}.dta`\n'
        '\n'
        f'This file has {ncol} columns and {nobs} observations.\n'
        '\n'
        '## Codebook\n'
        '\n'
    )

    mdfile = path.with_suffix('.md')
    with open(mdfile, 'w', encoding='utf-8') as f:
        f.write(header + codebook)
# Stata storage type names keyed by the single-character codes that appear in
# the reader's type list. NOTE(review): 'Q' is assumed to be the code pandas
# reports for strL columns -- confirm against the installed pandas version.
_STATA_TYPE_NAMES = {
    'b': 'byte',
    'h': 'int',
    'l': 'long',
    'f': 'float',
    'd': 'double',
    'Q': 'strL',
}


def _header_field(reader, name):
    """Return header attribute `name` of `reader`, falling back to `_name`.

    NOTE(review): the fallback assumes newer pandas releases keep the same
    header metadata under underscore-prefixed names -- confirm.
    """
    try:
        return getattr(reader, name)
    except AttributeError:
        return getattr(reader, f'_{name}')


def _stata_type_name(code):
    """Translate one entry of the reader's type list into a Stata type name."""
    if isinstance(code, int):
        # Integer entries are treated as fixed-width string lengths.
        return f'str{code}'
    # Unknown codes are shown verbatim rather than silently reusing the
    # previous variable's type.
    return _STATA_TYPE_NAMES.get(code, str(code))


def generate_codebook(dta_path):
    """Generate the Markdown codebook body for a Stata file.

    Only header metadata is taken from the reader; `read()` is never called.

    Args:
        dta_path: Path to the `.dta` file, passed to `pandas.read_stata`.

    Returns:
        A tuple `(ncol, nobs, text)`: the number of variables, the number of
        observations, and Markdown text with one `###` section per variable
        listing its label, type and display format. `text` is empty when the
        file has no variables.
    """
    # The context manager closes the underlying file handle once the header
    # metadata has been collected.
    with pd.read_stata(dta_path, iterator=True) as reader:
        labels = reader.variable_labels()
        nobs = _header_field(reader, 'nobs')
        formats = list(_header_field(reader, 'fmtlist'))
        type_codes = list(_header_field(reader, 'typlist'))

    # One label entry exists per variable, so this equals the variable count.
    ncol = len(labels)

    sections = []
    for (name, label), fmt, code in zip(labels.items(), formats, type_codes):
        # Each section ends with a blank line so that the next heading is
        # preceded by one, which Pandoc requires to recognise a heading.
        sections.append(
            f'### `{name}`\n'
            '\n'
            f'- Label: {label}\n'
            f'- Type: `{_stata_type_name(code)}`\n'
            f'- Format: `{fmt}`\n'
            '\n'
        )
    return (ncol, nobs, ''.join(sections))
# Script entry point: build the codebook only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
@kylebarron
Copy link
Author

Using the auto.dta file, doing

python stata_desc.py auto.dta
pandoc auto.md -o auto.pdf

gives the following PDF output:

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment