Skip to content

Instantly share code, notes, and snippets.

@aGHz
Created November 28, 2020 16:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aGHz/5e7091d07442ec6e72731813576a6f77 to your computer and use it in GitHub Desktop.
Save aGHz/5e7091d07442ec6e72731813576a6f77 to your computer and use it in GitHub Desktop.
Parse vim syntax scripts
import json
import pprint
import re
import sys
# Token separator: any run of whitespace inside a `:syn` command line.
TOK = re.compile(r'\s+')

# Bare (valueless) arguments recognised on any syntax item.
SYN_ARGS = [
    'conceal',
    'concealends',
    'display',
    'transparent',
    'oneline',
    'fold',
    'contained',
    'keepend',
    'extend',
    'skipwhite',
    'skipnl',
    'skipempty',
    # Bad spelling, listed so the syntax file named alongside still parses:
    'skipwhitecontained',  # lex.vim
]

# `name=value` options recognised on any syntax item.
SYN_OPTS = [
    'contains',
    'containedin',
    'nextgroup',
    'cchar',
    'add',
    'remove',
    # Bad spelling, listed so the syntax file named alongside still parses:
    'nextGroup',  # make.vim
]

# Bare arguments that apply to the pattern(s) following them.
SYN_PATTERN_ARGS = ['excludenl']

# `name=value` options that apply to the pattern(s) following them.
SYN_PATTERN_OPTS = [
    'matchgroup',
    # Bad spelling, listed so the syntax file named alongside still parses:
    'matchGroup',  # sh.vim
]
def syn_args_match(a, b):
    """Return True when specs *a* and *b* agree on every known argument.

    Both parameters are spec dictionaries.  Each name listed in ``SYN_ARGS``
    and ``SYN_OPTS`` is looked up in both; a name missing from a spec is
    treated as ``None``, so two specs that both omit it still agree on it.
    """
    return all(
        a.get(key) == b.get(key)
        for key in (*SYN_ARGS, *SYN_OPTS)
    )
# One pattern offset (e.g. ``ms=s+1``, ``he=e-2``, ``lc=3``) and the
# comma-separated list of them that may directly follow a closing delimiter.
_PATTERN_OFFSET = r'(?:ms|me|hs|he|rs|re|lc)=(?:[se](?:[+-]\d+)?|\d+)'
_PATTERN_OFFSETS = re.compile(rf'(?:{_PATTERN_OFFSET}(?:,{_PATTERN_OFFSET})*)?')


def parse_re(s, tokens):
    """Parse a delimited pattern such as ``/foo/ms=s+1`` into a dict.

    Args:
        s: The token that starts the pattern; its first character is taken
            as the pattern delimiter.
        tokens: The remaining tokens of the line.  While no closing
            delimiter is found in ``s``, tokens are popped from the front
            of this list (it is mutated) and appended to ``s`` separated by
            a single space.

    Returns:
        ``{'pattern': <text>}`` where escaped delimiters inside the pattern
        have been unescaped, plus an ``'options'`` dict mapping each
        ``name=value`` item found after the closing delimiter, when any
        text follows it.

    Raises:
        IndexError: If ``s`` is empty, or ``tokens`` runs out before a
            closing delimiter is found (the partial pattern is printed
            first).
    """
    re_limit = s[0]
    # A closing delimiter is one preceded by an escaped backslash or by any
    # character other than a backslash.  re.escape() keeps a letter or digit
    # delimiter from being read as a regex class or backreference.
    closing = re.compile(r'(?:\\\\|[^\\])' + re.escape(re_limit))
    while not closing.search(s):
        # There was a space in the regex, pull in the next token
        # !!! Might have been some other whitespace character but we don't care
        try:
            s += ' ' + tokens.pop(0)
        except IndexError:
            print(f'Regex: "{s}"')
            raise

    # Default to the last delimiter, but prefer the right-most unescaped one
    # whose tail is a well-formed offset list: with `+` as the delimiter an
    # offset such as `ms=s+1` contains the delimiter character itself.
    re_end = s.rindex(re_limit)
    idx = re_end
    while idx > 0:
        preceding = s[:idx]
        backslashes = len(preceding) - len(preceding.rstrip('\\'))
        if backslashes % 2 == 0 and _PATTERN_OFFSETS.fullmatch(s, idx + 1):
            re_end = idx
            break
        idx = s.rfind(re_limit, 1, idx)

    pattern = s[1:re_end].replace('\\' + re_limit, re_limit)
    result = {'pattern': pattern}

    re_options = s[re_end + 1:]
    if re_options:
        options = {}
        for opt in re_options.split(','):
            key, _, value = opt.partition('=')
            options[key] = value
        result['options'] = options
    return result
def error(s):
    """Print *s* on standard error and flush the stream immediately."""
    print(s, file=sys.stderr, flush=True)
def load_syn(f):
    """Parse the ``syn...`` commands of the Vim syntax script at path *f*.

    Lines are read one at a time.  Comment lines (starting with ``"``) and
    blank lines are ignored, lines starting with a backslash are joined to
    the pending command, and every other line first flushes the pending
    command and then becomes the new pending command if it starts with
    ``syn``.

    Returns:
        A dict with the keys ``'keyword'``, ``'match'``, ``'region'`` and
        ``'cluster'``.  The first three map a group name to a list of spec
        dicts; ``'cluster'`` maps a cluster name to a sorted list of member
        names.  A spec holds ``True`` for each bare argument seen, a list of
        values for each ``name=value`` option, and additionally
        ``'keywords'`` (keyword), ``'match'`` (match) or
        ``'start'``/``'skip'``/``'end'`` (region) entries.

    Raises:
        IndexError: If an option or pattern is missing its value (the
            offending line is printed first).
        OSError: If *f* cannot be opened.
    """
    prev_line = ''
    syntax = {
        'keyword': {},
        'match': {},
        'region': {},
        'cluster': {},
    }

    def parse_prev_line():
        """Parse the pending command, store its spec, and clear it."""
        nonlocal prev_line
        if not prev_line:
            return
        line = prev_line
        prev_line = ''

        tokens = TOK.split(line)[1:]
        if not tokens:
            # e.g. a lone `syntax`: nothing to parse, and popping would crash
            error(f'Incomplete syntax command: {line}')
            return
        kind = tokens.pop(0)
        if kind not in syntax:
            error(f'Unhandled syntax subcommand {kind}')
            return
        if not tokens:
            error(f'Incomplete syntax command: {line}')
            return

        is_keyword = kind == 'keyword'
        is_match = kind == 'match'
        is_region = kind == 'region'
        is_cluster = kind == 'cluster'

        name = tokens.pop(0)
        spec = {}
        if is_keyword:
            spec['keywords'] = []
        elif is_region:
            # NOTE(review): these start out as lists but are overwritten by
            # a single pattern dict below, so only the last `start=`/`end=`
            # survives.  Kept as-is to preserve the returned shape.
            spec['start'] = []
            spec['end'] = []

        # Parse the tokens in the line
        pattern_args = []
        pattern_opts = {}
        seen_match = False
        while tokens:
            token = tokens.pop(0)

            if token in SYN_ARGS:
                spec[token] = True
                continue

            (opt, _, val) = token.partition('=')
            if '=' in token and opt in SYN_OPTS:
                if not val:
                    # Sometimes people like writing `start= /re/`
                    try:
                        val = tokens.pop(0)
                    except IndexError:
                        print(line)
                        raise
                # A trailing comma means the list continues in the next token
                while val.endswith(',') and tokens:
                    val += tokens.pop(0)
                spec[opt] = val.split(',')  # !!! wrong for cchar=,
                continue

            if is_keyword:
                if token.startswith('"'):
                    # Comment to the end of the line
                    break
                spec['keywords'].append(token)

            elif is_match:
                if token in SYN_PATTERN_ARGS:
                    pattern_args.append(token)
                    continue
                # no pattern opts for match
                if seen_match:
                    # Anything unrecognised after the pattern ends parsing
                    break
                try:
                    regex = parse_re(token, tokens)
                except Exception:
                    print(line)
                    raise
                for arg in pattern_args:
                    regex[arg] = True
                spec['match'] = regex
                seen_match = True

            elif is_region:
                if token.startswith('"'):
                    # Comment to the end of the line
                    break
                if token in SYN_PATTERN_ARGS:
                    pattern_args.append(token)
                    continue
                if not val:
                    # Sometimes people like writing `start= /re/`
                    try:
                        val = tokens.pop(0)
                    except IndexError:
                        print(line)
                        raise
                if opt in SYN_PATTERN_OPTS:
                    # `NONE` switches the option off for later patterns
                    if val == 'NONE':
                        pattern_opts.pop(opt, None)
                    else:
                        pattern_opts[opt] = val
                    continue
                # opt is one of start, skip or end
                try:
                    regex = parse_re(val, tokens)
                except Exception:
                    print(line)
                    print(tokens)
                    raise
                for arg in pattern_args:
                    regex[arg] = True
                regex.update(pattern_opts)
                spec[opt] = regex

            # cluster: nothing to do, {contains,add,remove} handled by SYN_OPTS

        # Save the parsed syntax spec
        if is_cluster:
            # `contains=` replaces the member list; otherwise start from the
            # members recorded so far, then apply `add=` and `remove=`.
            members = set(spec.get('contains') or syntax[kind].get(name, []))
            members |= set(spec.get('add', []))
            members -= set(spec.get('remove', []))
            syntax[kind][name] = sorted(members)
        elif name not in syntax[kind]:
            syntax[kind][name] = [spec]
        elif is_keyword:
            # Merge keywords into an earlier spec with the same arguments
            for prev_spec in syntax[kind][name]:
                if syn_args_match(spec, prev_spec):
                    prev_spec['keywords'] += spec['keywords']
                    break
            else:
                syntax[kind][name].append(spec)
        else:
            syntax[kind][name].append(spec)

    def handle_line(line):
        """Feed one raw line of the file into the pending-command state."""
        nonlocal prev_line
        # Sometimes people like writing `opt = val`
        # !!! Might change regexes but we don't care
        content = line.strip().replace(' = ', '=')

        # Empty line
        if not content:
            return

        # Comment
        if content[0] == '"':
            return

        # Line continuation
        if content[0] == '\\':
            if prev_line:
                if not prev_line.endswith(','):
                    prev_line += ' '
                prev_line += content[1:].strip()
            return

        parse_prev_line()
        if content.startswith('syn'):
            prev_line = content

    # Undecodable bytes are replaced rather than aborting the whole file
    with open(f, encoding='utf-8', errors='replace') as fp:
        for line in fp:
            handle_line(line)
    parse_prev_line()
    return syntax
def dereference_clusters(syntax):
    """Replace ``@cluster`` references with the groups they stand for.

    *syntax* is modified in place and nothing is returned.  Every entry of
    ``syntax['cluster']`` and every ``'contains'`` list of the specs in
    ``syntax['match']`` and ``syntax['region']`` that holds at least one
    ``@name`` member is replaced by the sorted list of plain group names
    reachable through those references.  References to clusters that are
    not defined are dropped.  Lists without any ``@`` member are left
    untouched.  Clusters that reference themselves, directly or through
    other clusters, are handled without looping forever.
    """
    # Snapshot of the clusters as written, so each expansion is independent
    # of the order in which the entries get rewritten below.
    definitions = {
        name: list(members) for name, members in syntax['cluster'].items()
    }

    def expand(members):
        """Return the sorted plain groups behind *members*, or None if
        *members* holds no cluster reference at all."""
        if not any(member.startswith('@') for member in members):
            return None
        groups = set()
        visited = set()
        pending = list(members)
        while pending:
            member = pending.pop()
            if not member.startswith('@'):
                groups.add(member)
                continue
            ref = member[1:]
            # Skip unknown clusters and ones already followed (cycles)
            if ref in visited or ref not in definitions:
                continue
            visited.add(ref)
            pending.extend(definitions[ref])
        return sorted(groups)

    for name, members in definitions.items():
        expanded = expand(members)
        if expanded is not None:
            syntax['cluster'][name] = expanded

    for kind in ('match', 'region'):
        for specs in syntax[kind].values():
            for spec in specs:
                members = spec.get('contains')
                if not members:
                    continue
                expanded = expand(members)
                if expanded is not None:
                    spec['contains'] = expanded
def main(*files):
    """Report matches that contain a transparent region, for each file.

    Each path in *files* is parsed with ``load_syn`` and its cluster
    references are expanded with ``dereference_clusters``.  For every match
    spec, each member of its ``contains`` list that names a region with at
    least one ``transparent`` spec produces one line on standard output.
    """
    for f in files:
        syntax = load_syn(f)
        dereference_clusters(syntax)
        # print(json.dumps(syntax, indent=2, ensure_ascii=False))

        regions = syntax['region']
        for matchname, matches in syntax['match'].items():
            for match in matches:
                for member in match.get('contains') or ():
                    if member not in regions:
                        continue
                    if any(r.get('transparent') for r in regions[member]):
                        print(f'{f}: match {matchname} contains transparent region {member}')

        # Same check for regions that contain a transparent region (disabled):
        # for regionname, regiones in syntax['region'].items():
        #     for region in regiones:
        #         members = region.get('contains')
        #         if not members:
        #             continue
        #         for member in members:
        #             if member not in syntax['region']:
        #                 continue
        #             if any(r for r in syntax['region'][member] if r.get('transparent')):
        #                 print(f'{f}: region {regionname} contains transparent region {member}')
# Usage: python syntree.py *.vim 2>/dev/null
if __name__ == '__main__':
    main(*sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment