Delete all comments from a Stata do file
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# TODO: Comment code; include regexp? explanations
# TODO: parse with delimit in a separate file; note this is imperfect. It
#       has to be because of the way it works, which is super messy
#       (especially for multi-line strings; i.e. stuff in quotes spanning
#       many lines).
# TODO: scan code for `/*/`, `*/*`, and similar constructs. 'Please open
#       and close all comment blocks explicitly'
# TODO: get rid of the special case for locals?
""" | |
Delete all comments from Stata file | |
WARNINGS | |
-------- | |
Does not parse `#delimit ;` | |
`/*/*` and similar constructs are not parsed correctly. Note that as of | |
Stata 14, the parsing behavior of /*/ changed. `*/*` still ends a block | |
and starts a new one, however. | |
Usage | |
----- | |
From CLI: | |
$ python stataparse/comments.py /path/to/file.do | |
From Python: | |
>>> from StataComments import deleteStataComments | |
>>> doCode = deleteStataComments(open('/path/to/file.do', 'r').read()) | |
""" | |
from os import linesep | |
import regex | |
# Compiled patterns, one entry per kind of Stata comment.  They rely on
# features of the third-party `regex` module that the stdlib `re` module
# lacks: `(?<name>...)` named groups, variable-length lookbehind and `(?R)`
# recursion.  None of the patterns contains literal whitespace or `#`
# outside a character class, so the VERBOSE flag does not alter them.
StataComment = {
    # A `/* ... */` block.  `(?R)` recurses into the pattern so nested
    # blocks are consumed as a unit, and `\Z` lets an unterminated block run
    # to the end of the input.  The leading lookbehind is meant to skip a
    # `/*` that follows a `//` comment or a double quote.
    'multiNested': regex.compile(
        (
            r'(?<!^\s*//.*|(?!\B"[^"]*)\s+//+(?![^"{0}]*"\B).*?|".*?)'
            r'/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z)'
        ).format(linesep),
        flags=regex.VERBOSE | regex.MULTILINE
    ),
    # Second pass over block comments.  Text matched by `ignore` (`//`
    # comments and quoted strings) is put back by the substitution callback;
    # text matched by `delete` (a block comment) is dropped.
    'multiNestedEscape': regex.compile(
        r'(?<ignore>\s*//.*|(?!\B"[^"]*)\s+//(?![^"]*"\B).*?|".*?".*?)'
        r'|'
        r'(?<delete>/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z))',
        flags=regex.VERBOSE | regex.MULTILINE
    ),
    # `//` and `///` comments.  Each pattern captures the whitespace in front
    # of the comment as `space` so the substitution can restore it.
    'inline': [
        # A line that holds only a `//` comment.  The `(?!/)` lookahead
        # excludes `///` without consuming a character: the previous
        # `[^/]` could match the newline after an empty `//` comment, which
        # made `.*$` swallow the following line of code, and it could not
        # match a bare `//` at the very end of the input.
        regex.compile(
            r'^(?<space>\s*)//(?!/).*$',
            flags=regex.VERBOSE | regex.MULTILINE
        ),
        # A line that starts with `///`; matched through the start of the
        # next line (or the end of the input).
        regex.compile(
            r'^(?<space>\s*)///[\s\S]*?(^|\Z)',
            flags=regex.VERBOSE | regex.MULTILINE
        ),
        # A `//` comment after other text on the line, up to the end of the
        # line.  The lookaheads are meant to leave `//` inside double quotes
        # alone.
        regex.compile(
            r'(?!\B"[^"]*)(?<space>\s+)//($|[^/])(?![^"{0}]*"\B).*?$'.format(linesep),
            flags=regex.VERBOSE | regex.MULTILINE
        ),
        # A `///` comment after other text on the line; matched through the
        # start of the next line (or the end of the input).
        regex.compile(
            r'(?!\B"[^"]*)(?<space>\s+)///(?![^"{0}]*"\B)[\s\S]*?(^|\Z)'.format(linesep),
            flags=regex.VERBOSE | regex.MULTILINE
        )
    ],
    # A line whose first non-blank character is `*`; matched through the
    # start of the next line (or the end of the input).
    'linestar': regex.compile(
        r'^\s*\*[\s\S]*?(^|\Z)',
        flags=regex.VERBOSE | regex.MULTILINE
    )
}
# Splits a do-file into (stata, mata) pairs.  `stata` lazily captures the
# text in front of a mata block; `mata` captures the block itself, or is
# empty when the match ends at the end of the input.
StataMata = regex.compile(
    "".join((
        # Text before the next mata block.
        r"(?<stata>.*?)",
        r"(?<mata>",
        # Opening line: optional capture/noisily/quietly prefixes (any of
        # their abbreviations), then `mata` with an optional colon.
        r"(^\s*",
        r"(\s*(cap(t(u(re?)?)?)?|n(o(i(s(i(ly?)?)?)?)?)?|qui(e(t(ly?)?)?)?)(\s+:?|:?\s+))*",
        r"mata\s*:?\s*$",
        r")",
        # Block body up to the rest of a line containing `end`, or up to the
        # end of the input; the final alternative matches at end of input.
        r".*?(\s*end.*?$|\Z)|\Z",
        r")",
    )),
    flags=regex.VERBOSE | regex.DOTALL | regex.MULTILINE
)
def deleteStataComments(doStr):
    """Return the text of a Stata do-file with its comments removed.

    The passes run in a fixed order, each on the output of the previous one:

    1. `/* ... */` blocks (``multiNested``).
    2. `/* ... */` blocks again (``multiNestedEscape``), this time putting
       back text matched as a `//` comment or a quoted string.
    3. `//` and `///` comments (``inline``), keeping the whitespace that
       preceded each comment.
    4. Lines starting with `*`, outside of mata blocks only.

    Parameters
    ----------
    doStr : str
        Contents of the do-file.

    Returns
    -------
    str
        `doStr` after all substitutions.
    """
    doStr = StataComment['multiNested'].sub('', doStr)
    doStr = StataComment['multiNestedEscape'].sub(deleteCStyle, doStr)

    for regexp in StataComment['inline']:
        # Raw string: `\g` is not a valid escape in a normal string literal
        # and triggers a DeprecationWarning/SyntaxWarning on Python 3.
        doStr = regexp.sub(r'\g<space>', doStr)

    # The split into stata/mata chunks is done by StataMata; deleteLineStar
    # only strips star comments from the stata chunks.
    doStr = StataMata.sub(deleteLineStar, doStr)
    return doStr
def deleteCStyle(match):
    """Substitution callback for patterns with `ignore`/`delete` groups.

    Returns the text captured by `ignore` when that group is non-empty, an
    empty string when only `delete` is non-empty, and None when neither
    group captured anything.
    """
    captured = match.groupdict()
    kept = captured['ignore']
    if kept:
        # Protected text (e.g. a quoted string): put it back untouched.
        return kept
    if captured['delete']:
        # A block comment: replace it with nothing.
        return ''
    return None
def deleteLineStar(match):
    """Substitution callback that strips star comments outside mata blocks.

    `match` must expose the named groups `stata` and `mata`.  Lines starting
    with `*` are removed from the `stata` text; the `mata` text is returned
    unchanged.

    Returns
    -------
    str
        The cleaned `stata` text followed by the `mata` text; a group that
        is empty or did not participate in the match contributes nothing.
    """
    # Look the groups up by name.  Unpacking `groupdict().values()` by
    # position depends on dict ordering and breaks as soon as the pattern
    # has any other named group.
    stata = match.group('stata')
    mata = match.group('mata')

    pieces = []
    if stata:
        pieces.append(StataComment['linestar'].sub('', stata))
    if mata:
        pieces.append(mata)
    return "".join(pieces)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment