Created
August 4, 2018 21:00
-
-
Save pweids/6769d9ed3a087a04dd65c8ceb6c7e2f2 to your computer and use it in GitHub Desktop.
Two different methods of stripping /* ... */ style comments: one uses a finite state machine, and the other uses a more Pythonic recursive function. The latter is much faster.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
from enum import Enum | |
def strip_C_comments(code: str) -> str:
    """
    Strip C style /* comments */ from code.

    Every comment is removed together with any whitespace that directly
    follows its closing ``*/``. Comments do not nest: the first ``*/`` found
    after an opening ``/*`` ends the comment.

    The work is done in a loop rather than by recursion, so the number of
    comments in the input is not limited by the interpreter's recursion depth.

    :param code: the code to be parsed for comments
    :return: code without comments
    :raises ValueError: if a comment is opened but never closed
    """
    search_from = 0
    while True:
        cmt_open = code.find('/*', search_from)
        if cmt_open == -1:
            return code
        # Look for the terminator only after the two-character opener, so the
        # '*' of '/*' cannot double as the '*' of '*/' (i.e. '/*/' stays open).
        cmt_close = code.find('*/', cmt_open + 2)
        if cmt_close == -1:
            raise ValueError("syntax error: unclosed comment")
        code = code[:cmt_open] + code[cmt_close + 2:].lstrip()
        # Everything before cmt_open is already known to be comment free, but
        # the splice can join a trailing '/' with a leading '*', so resume the
        # scan one character before the splice point.
        search_from = max(cmt_open - 1, 0)
class _CommentState(Enum):
    """States of the character scanner used by strip_by_char()."""
    OUTSIDE = 1  # in ordinary code
    START = 2    # just read '/', which may turn out to begin '/*'
    INSIDE = 3   # inside a comment body
    FINISH = 4   # inside a comment, just read '*', which may begin '*/'
    CLOSED = 5   # just left a comment; skipping the whitespace after it


def strip_by_char(code: str) -> str:
    """
    Strip C style /* comments */ from code, one character at a time.

    This function is similar to strip_C_comments() but does it with a finite
    state machine char-by-char. Each comment is dropped together with the
    whitespace that directly follows its closing ``*/``. Comments do not nest.

    :param code: the code to be parsed for comments
    :return: code without comments
    :raises ValueError: if a comment is opened but never closed
    """
    State = _CommentState
    state = State.OUTSIDE
    outstr = []
    for c in code:
        if state is State.OUTSIDE:
            if c == '/':
                # Hold the slash back until we know whether '*' follows.
                state = State.START
            else:
                outstr.append(c)
        elif state is State.START:
            if c == '*':
                state = State.INSIDE
            elif c == '/':
                # The held-back slash was plain code; this new slash may
                # still open a comment, so stay in START.
                outstr.append('/')
            else:
                outstr.append('/')
                outstr.append(c)
                state = State.OUTSIDE
        elif state is State.INSIDE:
            if c == '*':
                state = State.FINISH
        elif state is State.FINISH:
            if c == '/':
                state = State.CLOSED
            elif c != '*':
                # A run of '*' keeps the terminator possible ('**/');
                # anything else puts us back in the comment body.
                state = State.INSIDE
        else:  # State.CLOSED
            if c == '/':
                state = State.START
            elif not c.isspace():
                outstr.append(c)
                state = State.OUTSIDE
            # Whitespace right after a comment is dropped.
    if state is State.START:
        # Input ended on a lone '/', which is ordinary code.
        outstr.append('/')
    elif state in (State.INSIDE, State.FINISH):
        raise ValueError("Comment not closed")
    return ''.join(outstr)
# Implementation exercised by the test suite below; rebind this alias to
# strip_by_char to run the same tests against the state-machine version.
strip_comments = strip_C_comments
class TestCommentParser(unittest.TestCase):
    """
    Behavioural tests for the comment strippers.

    Every case is run against both ``strip_comments`` (the module-level alias)
    and ``strip_by_char`` so that the two implementations are held to the same
    expectations. Failures are reported per implementation through subTest.
    """

    # Input fixtures shared by the test methods below.
    single = "/* comment */"
    oneline_after = "def method(self): /* do stuff */"
    oneline_before = "/* comment */ def dostuff(self):"
    multiline = ('/* big\n'
                 'comment */\n'
                 'def dostuff(self):')
    multiline2 = ('/* big /*\n'
                  'comment */\n'
                  'def dostuff(self):')
    ignore_open = '/don\'t strip me bro */'
    no_close = '/* i never end'
    nested_comment = '/* i /* start */ and end'
    two_nested_comments = '/* i /* start */ and end */ twice'
    many_comments = '/*one*/ /*two*/ notcomment'
    two_multiline = ('/*\n'
                     'one\n'
                     '*/\n'
                     '/*\n'
                     'two\n'
                     '*/\n'
                     'code'
                     )

    def _implementations(self):
        """Return the stripper functions under test (looked up lazily)."""
        return (strip_comments, strip_by_char)

    def _assert_stripped(self, source, expected):
        """Assert that every implementation turns *source* into *expected*."""
        for strip in self._implementations():
            with self.subTest(implementation=strip.__name__):
                self.assertEqual(strip(source), expected)

    def _assert_rejected(self, source):
        """Assert that every implementation raises ValueError for *source*."""
        for strip in self._implementations():
            with self.subTest(implementation=strip.__name__):
                with self.assertRaises(ValueError):
                    strip(source)

    def test_many_comments(self):
        self._assert_stripped(self.many_comments, 'notcomment')

    def test_oneline(self):
        self._assert_stripped(self.single, "")

    def test_oneline_after(self):
        self._assert_stripped(self.oneline_after, 'def method(self): ')

    def test_oneline_before(self):
        self._assert_stripped(self.oneline_before, 'def dostuff(self):')

    def test_multiline(self):
        self._assert_stripped(self.multiline, 'def dostuff(self):')

    def test_multiline2(self):
        self._assert_stripped(self.multiline2, 'def dostuff(self):')

    def test_two_multiline(self):
        self._assert_stripped(self.two_multiline, 'code')

    def test_ignore_open(self):
        # A '/' that is not followed by '*' must be left untouched.
        self._assert_stripped(self.ignore_open, self.ignore_open)

    def test_no_close(self):
        self._assert_rejected(self.no_close)

    def test_nested_comment(self):
        # Comments do not nest: the first '*/' closes the comment.
        self._assert_stripped(self.nested_comment, 'and end')

    def test_two_nested_comments(self):
        self._assert_stripped(self.two_nested_comments, 'and end */ twice')
# Run the test suite when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment