Skip to content

Instantly share code, notes, and snippets.

@xflr6
Last active June 4, 2022 13:32
Show Gist options
  • Save xflr6/da45b87485626477406078fec54ea67a to your computer and use it in GitHub Desktop.
Save xflr6/da45b87485626477406078fec54ea67a to your computer and use it in GitHub Desktop.
Add missing autotyp variable N.levels information from metadata_overview.csv to metadata/*.yaml files
"""Insert missing https://www.autotyp.uzh.ch N.levels from overview into metadata files
see https://github.com/autotyp/autotyp-data/pull/7
"""
import csv
import operator
import pathlib
import regex
import yaml
# CSV file read by iternlevels(); it must provide the columns 'Module',
# 'Variable' and 'N.levels'. The path is relative to the working directory.
OVERVIEW = pathlib.Path('metadata_overview.csv')
# All '*.yaml' files directly inside the 'metadata' directory, collected once
# when this module is executed and sorted for a deterministic processing order.
METADATA = sorted(pathlib.Path('metadata').glob('*.yaml'))
# Text encoding used for reading the overview and for reading/writing the
# metadata files as text.
ENCODING = 'utf-8'
# Matches the raw text of one variable entry in a metadata file:
#   \G                    - search anchor of the ``regex`` module: the match
#                           must start where the search started, so successive
#                           finditer() matches are contiguous
#   ^#.+\n                - one comment line starting with '#'
#   (?P<variable>...).+\n - a line starting with the variable name (word
#                           characters and dots) plus at least one more char
#   (?P<item>...)+        - one or more items: a line indented by exactly two
#                           spaces starting with (?P<key>...), optionally
#                           followed by lines indented by four spaces
#   (?:^\n)+              - one or more empty lines terminating the entry
# 'item' and 'key' sit inside a repeated group, so consumers read all of their
# occurrences via Match.captures() / Match.ends() rather than Match.group().
VARIANT = regex.compile(r'\G'
r'^#.+\n'
r'^(?P<variable>[\w.]+).+\n'
r'(?P<item>'
r'^ {2}(?P<key>[\w.]+).+\n'
r'(?:^ {4}.+\n)*'
r')+'
r'(?:^\n)+', flags=regex.MULTILINE)
def iternlevels(filepath: pathlib.Path = OVERVIEW, *,
                encoding: str = ENCODING):
    """Yield ``((module, variable), n_levels)`` pairs from the overview CSV.

    Args:
        filepath: CSV file whose header row contains the columns
            ``Module``, ``Variable`` and ``N.levels``.
        encoding: Text encoding used to read ``filepath``.

    Yields:
        A 2-tuple of the ``(Module, Variable)`` cell values (strings) and the
        ``N.levels`` cell converted with ``int()``.

    Raises:
        ValueError: If the file has no header row, a required column is
            missing from the header, or an ``N.levels`` cell is not an
            integer literal.
    """
    keyfields, valuefield = ['Module', 'Variable'], 'N.levels'
    # newline='' hands newline processing over to the csv module, as its
    # documentation requires for file objects passed to csv.reader().
    with filepath.open(encoding=encoding, newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:
            # A bare next(reader) would leak StopIteration out of the
            # generator and surface as an opaque RuntimeError.
            raise ValueError(f'{filepath}: missing CSV header row')
        get_key = operator.itemgetter(*(header.index(name)
                                        for name in keyfields))
        get_value = operator.itemgetter(header.index(valuefield))
        for row in reader:
            yield get_key(row), int(get_value(row))
def insert_nlevels(text: str, get_nlevels, *,
                   after: str = 'VariantOf',
                   pattern: regex.Pattern = VARIANT) -> str:
    """Return ``text`` with an ``N.levels`` line added to entries lacking one.

    ``text`` is scanned as a sequence of contiguous ``pattern`` matches (one
    per variable entry). Entries that already have an ``N.levels`` key are
    copied verbatim; for the others a new ``N.levels : <int>`` line is
    inserted directly behind the item whose key is ``after``.

    Args:
        text: Raw content of a metadata file.
        get_nlevels: Callable mapping a variable name to its integer
            ``N.levels`` value.
        after: Key of the item after which the new line is inserted.
        pattern: Compiled ``regex`` pattern with the groups ``variable``,
            ``item`` and ``key`` (``item``/``key`` being repeated captures).

    Returns:
        The text with the missing lines inserted.

    Raises:
        ValueError: If an entry without ``N.levels`` has no ``after`` key, or
            if ``pattern`` does not consume ``text`` completely (returning a
            partial result would silently truncate the file on write-back).
    """
    result = []
    consumed = 0  # end offset of the last matched entry (0 if none matched)
    for ma in pattern.finditer(text):
        consumed = ma.end()
        keys = ma.captures('key')
        if 'N.levels' in keys:
            result.append(ma.group())
            continue

        variable = ma.group('variable')
        try:
            idx = keys.index(after)
        except ValueError:
            raise ValueError(f'variable {variable!r} has no {after!r} item'
                             ' to insert N.levels after') from None
        # Copy the anchor item's own indentation so that the inserted key is
        # a sibling of the existing keys; a differently indented line would
        # not match ``pattern`` any more and would break the YAML mapping.
        indent = text[ma.starts('item')[idx]:ma.starts('key')[idx]]
        after_end = ma.ends('item')[idx]
        n_levels = get_nlevels(variable)
        result += [text[ma.start():after_end],
                   f'{indent}N.levels : {n_levels:d}\n',
                   text[after_end:ma.end()]]

    # Explicit check instead of ``assert``: it also works for text without any
    # match (no unbound loop variable) and is not stripped under ``-O``.
    if consumed != len(text):
        raise ValueError('text not fully matched by pattern:'
                         f' stopped at offset {consumed} of {len(text)}')
    return ''.join(result)
# (module, variable) -> N.levels, as listed in the overview CSV.
nlevels = dict(iternlevels())

for m in METADATA:
    module = m.stem  # file name without suffix is used as the module name

    # yaml.load() without an explicit Loader is deprecated since PyYAML 5.1
    # and raises TypeError in PyYAML 6; safe_load() is sufficient for reading
    # the plain mappings that are accessed below.
    with m.open('rb') as f:
        doc = yaml.safe_load(f)

    # Check the N.levels that are already present against the overview and
    # note whether any variable lacks the key.
    missing = False
    for variable, d in doc.items():
        if 'N.levels' in d:
            assert d['N.levels'] == nlevels[module, variable], (module,
                                                                variable)
        else:
            missing = True

    if missing:  # use regex to preserve yaml formatting, comments, etc.
        text = m.read_text(encoding=ENCODING)
        text = insert_nlevels(text, lambda v: nlevels[module, v])
        m.write_text(text, encoding=ENCODING)

    # Re-read the file to verify that every variable now carries the N.levels
    # value from the overview and that the file still parses as YAML.
    with m.open('rb') as f:
        doc = yaml.safe_load(f)
    assert all(d['N.levels'] == nlevels[module, v]
               for v, d in doc.items()), module
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment