Skip to content

Instantly share code, notes, and snippets.

@jaysoffian
Last active December 16, 2015 03:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jaysoffian/5368435 to your computer and use it in GitHub Desktop.
Save jaysoffian/5368435 to your computer and use it in GitHub Desktop.
Quick and dirty Objective-C tokenizer in Python
import re
from collections import deque, namedtuple
# Tokenize per the C preprocessor, more or less...
#
# The pattern is one big alternation with a named group per token class.
# Alternatives are tried in order at each position, so the specific classes
# (comments, directives, literals, multi-character operators) come before the
# catch-alls IDENT and OTHER; OTHER guarantees every character is consumed.
#
# The pattern is compiled with re.VERBOSE, so whitespace outside character
# classes is ignored and a literal '#' has to be escaped, and with re.DOTALL
# so that block comments may span lines.
#
# NOTE: a bare '^' is a start-of-string anchor, so the block opener '^{' and
# the compound assignment '^=' must be written with an escaped caret.
# Unescaped, they only ever matched at offset 0 (and then matched a lone '{'
# or '='), which split '^=' and '^{' into two single-character OP tokens.
TOKENIZER = re.compile(r'''
    (?P<WHITESPACE>\s+) |
    (?P<COMMENT>(?://[^\n]* | /\*.*?\*/)) |
    (?P<PREP>\#\s*[a-z][^\n]*) |
    (?P<OBJC>@[a-z]+) |
    (?P<NUMBER>(?:\.?\d(?:[eEpP][+-]|[a-zA-Z0-9_.])*)) |
    (?P<STRING>[L@]?"(?:[^"\n\\]|\\.)*") |
    (?P<CHAR>L?'(?:[^'\\]|\\.)') |
    (?P<OP>(?:
        @(?:YES|NO|[{(\[]) | \^\{ |
        \.\.\. | <<= | >>= | != | \#\# | %= | && | &= | \*= |
        \+\+ | \+= | -- | -= | -> | /= | << | <= | == |
        \|\| | >= | >> | \^= | \|= |
        [][!#%&()*+,-./:;<=>?^{|}~]
    )) |
    (?P<IDENT>[A-Za-z_][A-Za-z0-9_]*) |
    (?P<OTHER>.)
''', re.DOTALL | re.VERBOSE)
# Maps every opening bracket token to the closing token that balances it.
# Besides the plain C brackets this covers the '@'-prefixed openers and the
# '^{' opener that TOKENIZER emits as single OP tokens.
BRACKETS = {
    '[': ']',
    '{': '}',
    '(': ')',
    '@[': ']',
    '@{': '}',
    '@(': ')',
    '^{': '}',
}
# Closing tokens, one entry per opener above (so duplicates are present).
CLOSE_BRACKETS = BRACKETS.values()
# A single lexed token: `key` is the TOKENIZER group name, `val` the matched
# text, and `lnum` the source line number the token was found on.
Token = namedtuple("Token", ("key", "val", "lnum"))
def replace_trigraphs(text):
    """Return *text* with every C trigraph replaced by its single character.

    The nine trigraphs are ``??(  ??)  ??<  ??>  ??'  ??!  ??-  ??=  ??/``;
    they map to ``[  ]  {  }  ^  |  ~  #  \\`` respectively. Any other
    sequence starting with ``??`` is left untouched.
    """
    trigraphs = {
        '(': '[', ')': ']', '<': '{', '>': '}',
        "'": '^', '!': '|', '-': '~', '=': '#', '/': '\\',
    }

    def repl(match):
        # The third character of the match selects the replacement.
        return trigraphs[match.group(0)[-1]]

    # '-' must be last in the character class: written as "!-=" it forms a
    # range (0x21..0x3D) that also matches digits, quotes and other
    # punctuation, which then raised KeyError in the lookup above.
    return re.sub(r"\?\?[()<>/'!=-]", repl, text)
def splice_lines(text):
    """Join backslash-continued lines and record where each result began.

    Every physical line ending in a backslash is merged with the line that
    follows it (the backslash and the line break are removed).

    Returns a ``(spliced_text, line_nums)`` pair: ``spliced_text`` is the
    logical lines joined with ``'\\n'``, and ``line_nums[i]`` is the 1-based
    number of the physical line on which logical line ``i`` starts.
    """
    line_iter = enumerate(text.splitlines(), 1)
    lines = []
    line_nums = []
    for line_num, line in line_iter:
        line_nums.append(line_num)
        while line.endswith('\\'):
            line = line[:-1]
            # Pull the continuation from the same iterator so the outer loop
            # skips it. The next() builtin works on Python 2.6+ and 3,
            # unlike the Python-2-only iterator method .next().
            continuation = next(line_iter, None)
            if continuation is None:
                # Backslash on the very last line: nothing left to splice.
                break
            line += continuation[1]
        lines.append(line)
    return '\n'.join(lines), line_nums
def tokenize(text):
    """Generate a Token for each non-whitespace lexeme in *text*.

    Trigraphs are replaced and backslash-continued lines are spliced before
    the text is scanned with TOKENIZER. Each yielded Token carries the name
    of the pattern group that matched, the matched text, and the original
    line number of the (spliced) line on which the match starts.
    """
    spliced, line_nums = splice_lines(replace_trigraphs(text))
    newlines_seen = 0
    for match in TOKENIZER.finditer(spliced):
        # Exactly one of the named alternatives takes part in any match.
        matched = [pair for pair in match.groupdict().items()
                   if pair[1] is not None]
        assert len(matched) == 1
        (key, val), = matched
        if key != 'WHITESPACE':
            yield Token(key, val.replace('\\\n', ''), line_nums[newlines_seen])
        # Count newlines for every token, whitespace included, so the index
        # into line_nums keeps pointing at the current spliced line.
        newlines_seen += val.count('\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment