Skip to content

Instantly share code, notes, and snippets.

@kylebarron
Forked from mcaceresb/deleteStataComments.py
Created August 7, 2018 16:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kylebarron/e46ce79911976c3aae9d7e98309b9222 to your computer and use it in GitHub Desktop.
Save kylebarron/e46ce79911976c3aae9d7e98309b9222 to your computer and use it in GitHub Desktop.
Delete all comments from a Stata do file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# TODO: Comment code; include regexp? explanations
# TODO: parse `#delimit` in a separate file; note this is imperfect. It
# has to be, because of the way it works, which is super messy (especially
# for multi-line strings, i.e. stuff in quotes spanning many lines).
# TODO: scan code for `/*/`, `*/*`, and similar constructs. 'Please open
# and close all comment blocks explicitly'
# TODO: get rid of special case for locals?
"""
Delete all comments from Stata file
WARNINGS
--------
Does not parse `#delimit ;`
`/*/*` and similar constructs are not parsed correctly. Note that as of
Stata 14, the parsing behavior of /*/ changed. `*/*` still ends a block
and starts a new one, however.
Usage
-----
From CLI:
$ python stataparse/comments.py /path/to/file.do
From Python:
>>> from StataComments import deleteStataComments
>>> doCode = deleteStataComments(open('/path/to/file.do', 'r').read())
"""
from os import linesep
import regex
# Compiled patterns used by deleteStataComments. All of them rely on features
# of the third-party `regex` module (`(?<name>...)` groups, `(?R)` recursion,
# variable-length lookbehind) and are compiled with MULTILINE, so `^` and `$`
# anchor at line boundaries. DOTALL is not set, so `.` never crosses a newline.
StataComment = {
    # `/* ... */` blocks. `(?R)` recurses into the whole pattern so nested
    # blocks are consumed as a single match, and `\Z` lets an unterminated
    # block run to the end of the input. The leading lookbehind rejects a
    # `/*` that is preceded on its line by a `//` comment opener or by a
    # double quote.
    'multiNested': regex.compile(
        (
            r'(?<!^\s*//.*|(?!\B"[^"]*)\s+//+(?![^"{0}]*"\B).*?|".*?)'
            r'/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z)'
        ).format(linesep),
        flags=regex.VERBOSE | regex.MULTILINE
    ),
    # Second pass for `/* ... */` blocks, meant to be used with a callback:
    # the `ignore` group captures `//` comments and double-quoted text (which
    # the callback puts back), the `delete` group captures a comment block.
    'multiNestedEscape': regex.compile(
        r'(?<ignore>\s*//.*|(?!\B"[^"]*)\s+//(?![^"]*"\B).*?|".*?".*?)'
        r'|'
        r'(?<delete>/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z))',
        flags=regex.VERBOSE | regex.MULTILINE
    ),
    # `//` and `///` comments. Each pattern captures the whitespace in front
    # of the comment as `space` so the caller can restore it.
    'inline': [
        # `//` (but not `///`) opening a line. `(?:$|[^/])` rather than a bare
        # `[^/]`: a negated class also matches a newline, so an empty `//`
        # line would otherwise swallow the line break and the whole next
        # line of code. This mirrors the mid-line `//` pattern below.
        regex.compile(
            r'^(?<space>\s*)//(?:$|[^/]).*$',
            flags=regex.VERBOSE | regex.MULTILINE
        ),
        # `///` opening a line: lazily consumes up to the start of the next
        # line (or end of input), i.e. including the line break.
        regex.compile(
            r'^(?<space>\s*)///[\s\S]*?(^|\Z)',
            flags=regex.VERBOSE | regex.MULTILINE
        ),
        # `//` (but not `///`) after whitespace, through the end of the line.
        # NOTE(review): the `\B"` lookaheads are presumably meant to leave
        # `//` inside double-quoted strings alone -- confirm.
        regex.compile(
            r'(?!\B"[^"]*)(?<space>\s+)//($|[^/])(?![^"{0}]*"\B).*?$'.format(linesep),
            flags=regex.VERBOSE | regex.MULTILINE
        ),
        # `///` after whitespace, through the start of the next line (or end
        # of input), i.e. including the line break.
        regex.compile(
            r'(?!\B"[^"]*)(?<space>\s+)///(?![^"{0}]*"\B)[\s\S]*?(^|\Z)'.format(linesep),
            flags=regex.VERBOSE | regex.MULTILINE
        )
    ],
    # A line whose first non-blank character is `*`, through the start of the
    # next line (or end of input).
    'linestar': regex.compile(
        r'^\s*\*[\s\S]*?(^|\Z)',
        flags=regex.VERBOSE | regex.MULTILINE
    )
}
# Splits a do-file into alternating chunks: `stata` is the (lazily matched)
# text in front of a Mata block, `mata` is the Mata block itself or the empty
# match at end of input. DOTALL is set here, so `.` spans line breaks.
StataMata = regex.compile(
    # Everything up to the next Mata block.
    r"(?<stata>.*?)"
    r"(?<mata>"
    # Opening line: optional, repeatable prefixes (any abbreviation of
    # capture / noisily / quietly, each set off by whitespace and an
    # optional colon) followed by `mata` and an optional colon.
    r"(^\s*"
    r"(\s*(cap(t(u(re?)?)?)?|n(o(i(s(i(ly?)?)?)?)?)?|qui(e(t(ly?)?)?)?)(\s+:?|:?\s+))*"
    r"mata\s*:?\s*$"
    r")"
    # Block body through the first `end ...` line, or through end of input.
    r".*?(\s*end.*?$|\Z)|\Z"
    r")",
    flags=regex.VERBOSE | regex.DOTALL | regex.MULTILINE
)
def deleteStataComments(doStr):
    """Return the text of a Stata do-file with its comments removed.

    The comment patterns in ``StataComment`` are applied in a fixed order:

    1. ``multiNested`` matches are replaced with the empty string.
    2. ``multiNestedEscape`` is substituted through ``deleteCStyle``, which
       puts back whatever the ``ignore`` group captured and drops the rest.
    3. Each ``inline`` pattern (``//`` and ``///`` comments) is replaced by
       the whitespace it captured in its ``space`` group.
    4. ``StataMata`` is substituted through ``deleteLineStar``, which removes
       ``linestar`` matches from the ``stata`` part of each match and returns
       the ``mata`` part unchanged.

    Parameters
    ----------
    doStr : str
        Full contents of the do-file.

    Returns
    -------
    str
        ``doStr`` after all substitutions.
    """
    doStr = StataComment['multiNested'].sub('', doStr)
    doStr = StataComment['multiNestedEscape'].sub(deleteCStyle, doStr)
    for regexp in StataComment['inline']:
        # Raw string: `\g` is not a valid escape in an ordinary literal and
        # newer Python versions warn about it (and plan to reject it).
        doStr = regexp.sub(r'\g<space>', doStr)
    return StataMata.sub(deleteLineStar, doStr)
def deleteCStyle(match):
    """Substitution callback for ``StataComment['multiNestedEscape']``.

    Parameters
    ----------
    match : match object
        Must expose a named group ``ignore``.

    Returns
    -------
    str
        The text captured by ``ignore`` when that group matched something
        (so it is put back unchanged); otherwise the empty string, which
        deletes the match.
    """
    ignored = match.groupdict()['ignore']
    if ignored:
        return ignored
    # Always hand a str back to `sub`, including when neither group captured
    # any text, instead of falling off the end and returning None.
    return ''
def deleteLineStar(match):
    """Substitution callback for ``StataMata``.

    Parameters
    ----------
    match : match object
        Must expose the named groups ``stata`` and ``mata``.

    Returns
    -------
    str
        The ``stata`` text with every ``StataComment['linestar']`` match
        removed, followed by the ``mata`` text unchanged. A group that is
        empty or did not participate contributes nothing.
    """
    # Look the groups up by name; unpacking groupdict().values() would tie
    # the result to the ordering of the dictionary.
    stata = match.group('stata')
    mata = match.group('mata')
    pieces = []
    if stata:
        pieces.append(StataComment['linestar'].sub('', stata))
    if mata:
        pieces.append(mata)
    return ''.join(pieces)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment