Skip to content

Instantly share code, notes, and snippets.

@amerberg
Last active June 6, 2021 20:04
Show Gist options
  • Star 14 You must be signed in to star a gist
  • Fork 13 You must be signed in to fork a gist
  • Save amerberg/a273ca1e579ab573b499 to your computer and use it in GitHub Desktop.
Save amerberg/a273ca1e579ab573b499 to your computer and use it in GitHub Desktop.
A script to remove comments from LaTeX source
import ply.lex, argparse, io
#Usage
# python stripcomments.py input.tex > output.tex
# python stripcomments.py input.tex -e encoding > output.tex
#This utility is released under the WTFPL license: http://www.wtfpl.net/about/
def strip_comments(source):
    """Return the LaTeX text ``source`` with its comments removed.

    The text is scanned with a ply lexer that has four states:

    * ``INITIAL``     -- ordinary text; characters are passed through.
    * ``linecomment`` -- after an unescaped ``%``; everything up to and
      including the terminating newline is dropped.
    * ``commentenv``  -- inside ``\\begin{comment}`` ... ``\\end{comment}``;
      everything is dropped, as is the rest of the line that follows
      ``\\end{comment}``.
    * ``verbatim``    -- inside ``\\begin{verbatim}`` ... ``\\end{verbatim}``;
      everything (including ``%``) is passed through unchanged.

    Escaped percent signs (``\\%``) and escaped backslashes (``\\\\``) in
    ordinary text are kept as-is.

    Parameters:
        source: the LaTeX source as a text string.

    Returns:
        The source text with comments stripped, as a single string.
    """
    # NOTE: ply.lex.lex() is called below without arguments, so the token
    # list, the state table and the t_* rules are discovered from this
    # function's scope. The docstring of each rule IS its regular
    # expression, so rules are documented with '#' comments only, and their
    # definition order is kept because it decides which rule is tried first.
    tokens = (
        'PERCENT', 'BEGINCOMMENT', 'ENDCOMMENT', 'BACKSLASH',
        'CHAR', 'BEGINVERBATIM', 'ENDVERBATIM', 'NEWLINE', 'ESCPCT',
    )
    states = (
        ('linecomment', 'exclusive'),
        ('commentenv', 'exclusive'),
        ('verbatim', 'exclusive')
    )

    # Deal with escaped backslashes, so we don't think they're escaping %.
    # The rule is active in every state so that the pair is always consumed
    # as a unit, but inside a comment the pair is part of the comment and
    # must be discarded rather than emitted into the output.
    def t_ANY_BACKSLASH(t):
        r"\\\\"
        if t.lexer.current_state() in ('linecomment', 'commentenv'):
            return None
        return t

    # One-line comments: an unescaped % starts a comment; the % itself is
    # dropped because nothing is returned.
    def t_PERCENT(t):
        r"\%"
        t.lexer.begin("linecomment")

    # Escaped percent signs are literal text and are kept.
    def t_ESCPCT(t):
        r"\\\%"
        return t

    # Comment environment, as defined by verbatim package
    def t_BEGINCOMMENT(t):
        r"\\begin\s*{\s*comment\s*}"
        t.lexer.begin("commentenv")

    # Verbatim environment (different treatment of comments within)
    def t_BEGINVERBATIM(t):
        r"\\begin\s*{\s*verbatim\s*}"
        t.lexer.begin("verbatim")
        return t

    # Any other character in initial state we leave alone
    def t_CHAR(t):
        r"."
        return t

    def t_NEWLINE(t):
        r"\n"
        return t

    # End comment environment
    def t_commentenv_ENDCOMMENT(t):
        r"\\end\s*{\s*comment\s*}"
        # Anything after \end{comment} on a line is ignored!
        t.lexer.begin('linecomment')

    # Ignore contents of comment environment
    def t_commentenv_CHAR(t):
        r"."
        pass

    def t_commentenv_NEWLINE(t):
        r"\n"
        pass

    # End of verbatim environment
    def t_verbatim_ENDVERBATIM(t):
        r"\\end\s*{\s*verbatim\s*}"
        t.lexer.begin('INITIAL')
        return t

    # Leave contents of verbatim environment alone
    def t_verbatim_CHAR(t):
        r"."
        return t

    def t_verbatim_NEWLINE(t):
        r"\n"
        return t

    # End a % comment when we get to a new line
    def t_linecomment_ENDCOMMENT(t):
        r"\n"
        t.lexer.begin("INITIAL")
        # Newline at the end of a line comment is stripped.

    # Ignore anything after a % on a line
    def t_linecomment_CHAR(t):
        r"."
        pass

    lexer = ply.lex.lex()
    lexer.input(source)
    # Only rules that return their token contribute to the output.
    return u"".join(tok.value for tok in lexer)
def main():
    """Command-line entry point.

    Reads the file named by the positional ``filename`` argument, decoding
    it with ``--encoding``/``-e`` (default ``utf-8``), strips its comments
    with ``strip_comments`` and writes the result to standard output.
    """
    # Imported locally so this function does not depend on a change to the
    # file-level import line.
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help = 'the file to strip comments from')
    parser.add_argument('--encoding', '-e', default='utf-8')
    args = parser.parse_args()
    with io.open(args.filename, encoding=args.encoding) as f:
        source = f.read()
    # Write the text exactly as produced: print() would append a newline
    # that is not part of the stripped source.
    sys.stdout.write(strip_comments(source))
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
@cshapeshifter
Copy link

cshapeshifter commented Apr 7, 2018

For anyone stumbling over this in the future: latexpand can reliably remove comments, too.

@amerberg
Copy link
Author

amerberg commented Mar 1, 2019

Wow, I completely forgot about this and didn't see all these comments. Thanks to everyone who has made improvements. I've added a comment to clarify the licensing situation.

@Franck-Dernoncourt
Copy link

To remove all the comments from a LaTeX file, another option is to use arxiv-latex-cleaner. Actively maintained, 1.2k GitHub stars, written in Python but no need to know Python.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment