# kylebarron/deleteStataComments.py

## deleteStataComments.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# TODO: Comment code; include regexp? explanations

# TODO: parse with delimit in separate file; note this is imperfect. It
# has to be because of the way it works, which is super messy (especially
# for multi-line strings; i.e. stuff in quotes spanning many lines).

# TODO: scan code for `/*/`, `*/*`, and similar constructs. 'Please open
# and close all comment blocks explicitly'

# TODO: get rid of special case for locals?

"""
Delete all comments from Stata file

WARNINGS
--------

Does not parse `#delimit ;`

`/*/*` and similar constructs are not parsed correctly. Note that as of
Stata 14, the parsing behavior of /*/ changed. `*/*` still ends a block
and starts a new one, however.

Usage
-----

From CLI:
$ python stataparse/comments.py /path/to/file.do

From Python:
>>> from StataComments import deleteStataComments
>>> doCode = deleteStataComments(open('/path/to/file.do', 'r').read())
"""

from os import linesep
import regex

# Compiled patterns for each Stata comment style, keyed by name.
#
# These need the third-party `regex` module rather than the stdlib `re`:
# the block-comment patterns recurse with `(?R)` and 'multiNested' uses a
# variable-width look-behind. MULTILINE makes `^` and `$` match at every
# line boundary; DOTALL is not set, so `.` never crosses a newline.
StataComment = {
    # Nested `/* ... */` blocks; deleteStataComments replaces them with ''.
    #
    # The first string is a negative look-behind that rejects a `/*` when
    # the text just before it matches any of:
    #   ^\s*//.*        a line that begins with a `//` comment
    #   \s+//+ ... .*?  whitespace then `//` earlier on the line (guarded
    #                   by the same quote look-aheads as 'inline' below)
    #   ".*?            a double quote earlier on the same line
    # The second string matches the block itself: `/*`, then any number of
    # characters that do not start `/*` or `*/`, or a recursive match of
    # the whole pattern (`(?R)`) for a nested block, closed by `*/` or by
    # the end of the string (`\Z`) when the block is never terminated.
    # `{0}` is filled with os.linesep, so `[^"{0}]` excludes double quotes
    # and line-separator characters.
    'multiNested': regex.compile(
        (
            r'(?<!^\s*//.*|(?!\B"[^"]*)\s+//+(?![^"{0}]*"\B).*?|".*?)'
            r'/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z)'
        ).format(linesep),
        flags = regex.VERBOSE + regex.MULTILINE
    ),
    # The same block matcher written as an alternation of two named groups:
    #   ignore  a `//` comment through the end of its line, or a
    #           double-quoted string that opens and closes on one line
    #   delete  a (possibly nested) `/* ... */` block
    # deleteStataComments passes deleteCStyle as the replacement callback
    # for this pattern.
    'multiNestedEscape': regex.compile(
        r'(?<ignore>\s*//.*|(?!\B"[^"]*)\s+//(?![^"]*"\B).*?|".*?".*?)'
        r'|'
        r'(?<delete>/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z))',
        flags = regex.VERBOSE + regex.MULTILINE
    ),
    # `//` and `///` comments, applied in list order. Every pattern
    # captures the whitespace in front of the comment as `space`, and
    # deleteStataComments substitutes `\g<space>` for the whole match, so
    # only the comment text is dropped.
    'inline': [
        # `//` at the start of a line (after optional whitespace), through
        # the end of that line. `(?!/)` leaves `///` to the next pattern.
        # A look-ahead is used instead of `[^/]` because a character class
        # would consume the newline after a bare `//` and let `.*$` delete
        # the following line as well; it also lets a bare `//` at the very
        # end of the input match.
        regex.compile(
            r'^(?<space>\s*)//(?!/).*$',
            flags = regex.VERBOSE + regex.MULTILINE
        ),
        # `///` at the start of a line: removed together with everything up
        # to the start of the next line (or the end of the string).
        regex.compile(
            r'^(?<space>\s*)///[\s\S]*?(^|\Z)',
            flags = regex.VERBOSE + regex.MULTILINE
        ),
        # Whitespace followed by `//` (and not a third `/`), through the
        # end of the line. `(?![^"{0}]*"\B)` rejects the match when the
        # next `"` on the line is not directly followed by a word
        # character -- presumably so that `//` inside a quoted string is
        # not treated as a comment.
        # NOTE(review): the leading `(?!\B"[^"]*)` is tested at the position
        # where `\s+` must match, so it looks like it can never reject;
        # confirm before relying on it.
        regex.compile(
            r'(?!\B"[^"]*)(?<space>\s+)//($|[^/])(?![^"{0}]*"\B).*?$'.format(linesep),
            flags = regex.VERBOSE + regex.MULTILINE
        ),
        # Whitespace followed by `///`, with the same quote look-aheads:
        # removed up to the start of the next line (or end of string).
        regex.compile(
            r'(?!\B"[^"]*)(?<space>\s+)///(?![^"{0}]*"\B)[\s\S]*?(^|\Z)'.format(linesep),
            flags = regex.VERBOSE + regex.MULTILINE
        )
    ],
    # A line whose first non-whitespace character is `*`, matched through
    # the start of the next line (or end of string). deleteLineStar applies
    # it to the `stata` part of each StataMata match only.
    'linestar': regex.compile(
        r'^\s*\*[\s\S]*?(^|\Z)',
        flags = regex.VERBOSE + regex.MULTILINE
    )
}

# Splits a do-file into consecutive (stata, mata) chunks; deleteStataComments
# feeds every match to deleteLineStar.
#
#   stata  text captured lazily up to the next Mata block
#   mata   a Mata block, or the empty match at `\Z` once no block is left
#
# A Mata block opens on a line made of optional prefix commands followed by
# `mata` and an optional colon. The nested optional groups accept every
# abbreviation `cap`..`capture`, `n`..`noisily` and `qui`..`quietly`; each
# prefix must be followed by whitespace, with an optional colon before or
# after it. The block then runs lazily up to `end` plus the rest of that
# line, or to the end of the string.
#
# DOTALL lets `.` cross newlines so both groups can span several lines;
# MULTILINE anchors `^`/`$` at line boundaries.
#
# NOTE(review): `end` is preceded only by `\s*` (no `^` or word boundary),
# so the lazy `.*?` appears to stop at the first occurrence of the letters
# `end` after the opening line, even inside a longer word -- confirm.
StataMata = regex.compile(
    r"(?<stata>.*?)"
    r"(?<mata>"
    r"(^\s*"
    r"(\s*(cap(t(u(re?)?)?)?|n(o(i(s(i(ly?)?)?)?)?)?|qui(e(t(ly?)?)?)?)(\s+:?|:?\s+))*"
    r"mata\s*:?\s*$"
    r")"
    r".*?(\s*end.*?$|\Z)|\Z"
    r")",
    flags = regex.VERBOSE + regex.DOTALL + regex.MULTILINE
)


def deleteStataComments(doStr):
    """Return the text of a Stata do-file with its comments removed.

    The passes run in this order:

    1. ``StataComment['multiNested']`` matches are replaced by ``''``.
    2. ``StataComment['multiNestedEscape']`` is substituted through
       ``deleteCStyle``.
    3. Each ``StataComment['inline']`` pattern replaces its match by the
       whitespace it captured in the ``space`` group.
    4. ``StataMata`` is substituted through ``deleteLineStar``.

    Parameters
    ----------
    doStr : str
        Contents of the do-file.

    Returns
    -------
    str
        ``doStr`` after all substitutions.
    """
    doStr = StataComment['multiNested'].sub('', doStr)
    doStr = StataComment['multiNestedEscape'].sub(deleteCStyle, doStr)

    for regexp in StataComment['inline']:
        # Raw string: `\g` is not a Python escape sequence, and a non-raw
        # literal containing it triggers an invalid-escape warning.
        doStr = regexp.sub(r'\g<space>', doStr)

    doStr = StataMata.sub(deleteLineStar, doStr)

    return doStr


def deleteCStyle(match):
    """Replacement callback for ``StataComment['multiNestedEscape']``.

    Text captured by the ``ignore`` group is handed back unchanged; a
    match captured by the ``delete`` group is replaced by the empty
    string. ``None`` is returned when both groups are empty or unset.
    """
    captured = match.groupdict()
    kept = captured['ignore']
    if kept:
        return kept
    return '' if captured['delete'] else None


def deleteLineStar(match):
    """Replacement callback for ``StataMata``: drop ``*`` comment lines.

    ``StataComment['linestar']`` is applied to the ``stata`` group only;
    the ``mata`` group is appended untouched.

    Parameters
    ----------
    match : match object
        Must expose the named groups ``stata`` and ``mata``.

    Returns
    -------
    str
        The cleaned ``stata`` text followed by the ``mata`` text; empty or
        unset groups contribute nothing.
    """
    # Read the groups by name. Unpacking groupdict().values() only works
    # if the dict happens to yield 'stata' before 'mata'.
    stata = match.group('stata')
    mata = match.group('mata')

    pieces = []
    if stata:
        pieces.append(StataComment['linestar'].sub('', stata))
    if mata:
        pieces.append(mata)

    return ''.join(pieces)
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	# TODO: Comment code; include regexp? explanations

	# TODO: parse with delimit in separate file; note this is imperfect. It
	# has to be because of the way it works, which is super messy (specially
	# for multi-line strings; i.e. stuff in quotes spanning many lines).

	# TODO: scan code for `/*/`, `*/*`, and similar constructs. 'Please open
	# and close all comment blocks explicitly'

	# TODO: get rid of special case for locals?

	"""
	Delete all comments from Stata file

	WARNINGS
	--------

	Does not parse `#delimit ;`

	`/*/*` and similar constructs are not parsed correctly. Note that as of
	Stata 14, the parsing behavior of /*/ changed. `*/*` still ends a block
	and starts a new one, however.

	Usage
	-----

	From CLI:
	$ python stataparse/comments.py /path/to/file.do

	From Python:
	>>> from StataComments import deleteStataComments
	>>> doCode = deleteStataComments(open('/path/to/file.do', 'r').read())
	"""

	from os import linesep
	import regex

	# Compiled patterns for each Stata comment style, keyed by name.
	#
	# These need the third-party `regex` module rather than the stdlib `re`:
	# the block-comment patterns recurse with `(?R)` and 'multiNested' uses a
	# variable-width look-behind. MULTILINE makes `^` and `$` match at every
	# line boundary; DOTALL is not set, so `.` never crosses a newline.
	StataComment = {
	    # Nested `/* ... */` blocks; deleteStataComments replaces them with ''.
	    #
	    # The first string is a negative look-behind that rejects a `/*` when
	    # the text just before it matches any of:
	    #   ^\s*//.*        a line that begins with a `//` comment
	    #   \s+//+ ... .*?  whitespace then `//` earlier on the line (guarded
	    #                   by the same quote look-aheads as 'inline' below)
	    #   ".*?            a double quote earlier on the same line
	    # The second string matches the block itself: `/*`, then any number of
	    # characters that do not start `/*` or `*/`, or a recursive match of
	    # the whole pattern (`(?R)`) for a nested block, closed by `*/` or by
	    # the end of the string (`\Z`) when the block is never terminated.
	    # `{0}` is filled with os.linesep, so `[^"{0}]` excludes double quotes
	    # and line-separator characters.
	    'multiNested': regex.compile(
	        (
	            r'(?<!^\s*//.*|(?!\B"[^"]*)\s+//+(?![^"{0}]*"\B).*?|".*?)'
	            r'/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z)'
	        ).format(linesep),
	        flags = regex.VERBOSE + regex.MULTILINE
	    ),
	    # The same block matcher written as an alternation of two named groups:
	    #   ignore  a `//` comment through the end of its line, or a
	    #           double-quoted string that opens and closes on one line
	    #   delete  a (possibly nested) `/* ... */` block
	    # deleteStataComments passes deleteCStyle as the replacement callback
	    # for this pattern.
	    'multiNestedEscape': regex.compile(
	        r'(?<ignore>\s*//.*|(?!\B"[^"]*)\s+//(?![^"]*"\B).*?|".*?".*?)'
	        r'|'
	        r'(?<delete>/\*(?:(?!/\*|\*/)[\s\S]|(?R))*(\*/|\Z))',
	        flags = regex.VERBOSE + regex.MULTILINE
	    ),
	    # `//` and `///` comments, applied in list order. Every pattern
	    # captures the whitespace in front of the comment as `space`, and
	    # deleteStataComments substitutes `\g<space>` for the whole match, so
	    # only the comment text is dropped.
	    'inline': [
	        # `//` at the start of a line (after optional whitespace), through
	        # the end of that line. `(?!/)` leaves `///` to the next pattern.
	        # A look-ahead is used instead of `[^/]` because a character class
	        # would consume the newline after a bare `//` and let `.*$` delete
	        # the following line as well.
	        regex.compile(
	            r'^(?<space>\s*)//(?!/).*$',
	            flags = regex.VERBOSE + regex.MULTILINE
	        ),
	        # `///` at the start of a line: removed together with everything up
	        # to the start of the next line (or the end of the string).
	        regex.compile(
	            r'^(?<space>\s*)///[\s\S]*?(^|\Z)',
	            flags = regex.VERBOSE + regex.MULTILINE
	        ),
	        # Whitespace followed by `//` (and not a third `/`), through the
	        # end of the line. `(?![^"{0}]*"\B)` rejects the match when the
	        # next `"` on the line is not directly followed by a word
	        # character -- presumably so that `//` inside a quoted string is
	        # not treated as a comment.
	        regex.compile(
	            r'(?!\B"[^"]*)(?<space>\s+)//($|[^/])(?![^"{0}]*"\B).*?$'.format(linesep),
	            flags = regex.VERBOSE + regex.MULTILINE
	        ),
	        # Whitespace followed by `///`, with the same quote look-aheads:
	        # removed up to the start of the next line (or end of string).
	        regex.compile(
	            r'(?!\B"[^"]*)(?<space>\s+)///(?![^"{0}]*"\B)[\s\S]*?(^|\Z)'.format(linesep),
	            flags = regex.VERBOSE + regex.MULTILINE
	        )
	    ],
	    # A line whose first non-whitespace character is `*`, matched through
	    # the start of the next line (or end of string). deleteLineStar applies
	    # it to the `stata` part of each StataMata match only.
	    'linestar': regex.compile(
	        r'^\s*\*[\s\S]*?(^|\Z)',
	        flags = regex.VERBOSE + regex.MULTILINE
	    )
	}

	# Splits a do-file into consecutive (stata, mata) chunks; deleteStataComments
	# feeds every match to deleteLineStar.
	#
	#   stata  text captured lazily up to the next Mata block
	#   mata   a Mata block, or the empty match at `\Z` once no block is left
	#
	# A Mata block opens on a line made of optional prefix commands followed by
	# `mata` and an optional colon. The nested optional groups accept every
	# abbreviation `cap`..`capture`, `n`..`noisily` and `qui`..`quietly`; each
	# prefix must be followed by whitespace, with an optional colon before or
	# after it. The block then runs lazily up to `end` plus the rest of that
	# line, or to the end of the string.
	#
	# DOTALL lets `.` cross newlines so both groups can span several lines;
	# MULTILINE anchors `^`/`$` at line boundaries.
	StataMata = regex.compile(
	    r"(?<stata>.*?)"
	    r"(?<mata>"
	    r"(^\s*"
	    r"(\s*(cap(t(u(re?)?)?)?|n(o(i(s(i(ly?)?)?)?)?)?|qui(e(t(ly?)?)?)?)(\s+:?|:?\s+))*"
	    r"mata\s*:?\s*$"
	    r")"
	    r".*?(\s*end.*?$|\Z)|\Z"
	    r")",
	    flags = regex.VERBOSE + regex.DOTALL + regex.MULTILINE
	)


	def deleteStataComments(doStr):
	    """Return the text of a Stata do-file with its comments removed.

	    The passes run in this order:

	    1. ``StataComment['multiNested']`` matches are replaced by ``''``.
	    2. ``StataComment['multiNestedEscape']`` is substituted through
	       ``deleteCStyle``.
	    3. Each ``StataComment['inline']`` pattern replaces its match by the
	       whitespace it captured in the ``space`` group.
	    4. ``StataMata`` is substituted through ``deleteLineStar``.

	    Parameters
	    ----------
	    doStr : str
	        Contents of the do-file.

	    Returns
	    -------
	    str
	        ``doStr`` after all substitutions.
	    """
	    doStr = StataComment['multiNested'].sub('', doStr)
	    doStr = StataComment['multiNestedEscape'].sub(deleteCStyle, doStr)

	    for regexp in StataComment['inline']:
	        # Raw string: `\g` is not a Python escape sequence, and a non-raw
	        # literal containing it triggers an invalid-escape warning.
	        doStr = regexp.sub(r'\g<space>', doStr)

	    doStr = StataMata.sub(deleteLineStar, doStr)

	    return doStr


	def deleteCStyle(match):
	    """Replacement callback for ``StataComment['multiNestedEscape']``.

	    Text captured by the ``ignore`` group is handed back unchanged; a
	    match captured by the ``delete`` group is replaced by the empty
	    string. ``None`` is returned when both groups are empty or unset.
	    """
	    groups = match.groupdict()
	    if groups['ignore']:
	        return groups['ignore']
	    elif groups['delete']:
	        return ''


	def deleteLineStar(match):
	    """Replacement callback for ``StataMata``: drop ``*`` comment lines.

	    ``StataComment['linestar']`` is applied to the ``stata`` group only;
	    the ``mata`` group is appended untouched.

	    Parameters
	    ----------
	    match : match object
	        Must expose the named groups ``stata`` and ``mata``.

	    Returns
	    -------
	    str
	        The cleaned ``stata`` text followed by the ``mata`` text; empty or
	        unset groups contribute nothing.
	    """
	    # Read the groups by name. Unpacking groupdict().values() only works
	    # if the dict happens to yield 'stata' before 'mata'.
	    stata = match.group('stata')
	    mata = match.group('mata')

	    pieces = []
	    if stata:
	        pieces.append(StataComment['linestar'].sub('', stata))
	    if mata:
	        pieces.append(mata)

	    return ''.join(pieces)