Last active
June 4, 2022 13:32
-
-
Save xflr6/da45b87485626477406078fec54ea67a to your computer and use it in GitHub Desktop.
Add missing autotyp variable N.levels information from metadata_overview.csv to metadata/*.yaml files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Insert missing https://www.autotyp.uzh.ch N.levels from overview into metadata files | |
see https://github.com/autotyp/autotyp-data/pull/7 | |
""" | |
import csv | |
import operator | |
import pathlib | |
import regex | |
import yaml | |
# CSV overview from which the N.levels values are read.
OVERVIEW = pathlib.Path('metadata_overview.csv')

# All metadata YAML files, in sorted order.
METADATA = sorted(pathlib.Path('metadata').glob('*.yaml'))

# Encoding used for reading the overview and for reading/writing metadata text.
ENCODING = 'utf-8'

# One variable entry of a metadata file: a comment line, the variable line,
# one or more two-space-indented items (each optionally followed by
# four-space-indented lines), and at least one terminating empty line.
VARIANT = regex.compile(
    r'\G'                         # anchor: continue where the last match ended
    r'^#.+\n'                     # comment line heading the entry
    r'^(?P<variable>[\w.]+).+\n'  # line starting with the variable name
    r'(?P<item>'                  # repeated group: one capture per item
    r'^ {2}(?P<key>[\w.]+).+\n'   # item line starting with its key
    r'(?:^ {4}.+\n)*'             # deeper-indented lines belonging to the item
    r')+'
    r'(?:^\n)+',                  # empty line(s) closing the entry
    flags=regex.MULTILINE,
)
def iternlevels(filepath: pathlib.Path = OVERVIEW, *,
                encoding: str = ENCODING):
    """Yield ``((module, variable), n_levels)`` pairs from the overview CSV.

    The first row is taken as the header; the ``Module``, ``Variable`` and
    ``N.levels`` columns are located by name, so their position in the file
    does not matter.

    Args:
        filepath: Path of the CSV file to read.
        encoding: Text encoding used to open ``filepath``.

    Yields:
        A 2-tuple of the ``(Module, Variable)`` string pair and the
        ``N.levels`` value converted with ``int()``.

    Raises:
        ValueError: If the file is empty, a required column is missing from
            the header, or an ``N.levels`` cell is not an integer literal.
    """
    keyfields, valuefield = ['Module', 'Variable'], 'N.levels'
    # newline='' lets the csv module do its own newline handling, which is
    # required for quoted fields containing line breaks.
    with filepath.open(encoding=encoding, newline='') as f:
        reader = csv.reader(f)
        # A bare next() would leak StopIteration out of the generator and
        # surface as an opaque RuntimeError.
        header = next(reader, None)
        if header is None:
            raise ValueError(f'{filepath}: empty file, expected a header row')
        get_key = operator.itemgetter(*(header.index(name)
                                        for name in keyfields))
        get_value = operator.itemgetter(header.index(valuefield))
        for row in reader:
            yield get_key(row), int(get_value(row))
def insert_nlevels(text: str, get_nlevels, *,
                   after: str = 'VariantOf',
                   pattern: regex.Pattern = VARIANT) -> str:
    """Return ``text`` with an ``N.levels`` item added to entries lacking one.

    ``text`` is processed as a sequence of ``pattern`` matches. Entries whose
    item keys already include ``N.levels`` are copied verbatim; for every
    other entry a new ``N.levels`` line is inserted directly behind the item
    whose key equals ``after``.

    Args:
        text: Content of one metadata file.
        get_nlevels: Callable taking the variable name of an entry and
            returning its integer number of levels.
        after: Key of the item behind which the new line is inserted.
        pattern: Compiled ``regex`` pattern providing the ``variable``,
            ``item`` and ``key`` groups.

    Returns:
        The text including the inserted lines. An empty ``text`` is returned
        unchanged.

    Raises:
        ValueError: If an entry without ``N.levels`` has no ``after`` item,
            or if the matches do not cover ``text`` up to its end (returning
            the joined matches would otherwise silently drop the remainder).
    """
    chunks = []
    consumed = 0  # end of the last match; stays 0 if nothing matched at all
    for ma in pattern.finditer(text):
        consumed = ma.end()
        keys = ma.captures('key')
        if 'N.levels' in keys:
            chunks.append(ma.group())
            continue

        variable = ma.group('variable')
        if after not in keys:
            raise ValueError(f'{variable}: no {after!r} item'
                             ' to insert N.levels after')
        # 'key' is nested in the repeated 'item' group, so captures('key')
        # and ends('item') are parallel lists with one element per item.
        after_end = ma.ends('item')[keys.index(after)]
        n_levels = get_nlevels(variable)
        # Two-space indent: the same indentation the pattern requires of
        # every item line, so the new key is a sibling of the existing ones.
        chunks += [text[ma.start():after_end],
                   f'  N.levels : {n_levels:d}\n',
                   text[after_end:ma.end()]]

    # Explicit check instead of an assert: it must not vanish under -O, and
    # it must also work when the loop body never ran.
    if consumed != len(text):
        raise ValueError(f'unparsed content at offset {consumed:d}'
                         f' (text length {len(text):d})')
    return ''.join(chunks)
# Expected number of levels from the overview, keyed by (module, variable).
nlevels = dict(iternlevels())

for m in METADATA:
    # The file name without suffix is the module part of the overview key.
    module = m.stem

    with m.open('rb') as f:
        # safe_load: yaml.load() without an explicit Loader is rejected by
        # PyYAML >= 6 and constructs arbitrary objects on older versions.
        doc = yaml.safe_load(f)

    # Check the values already present and note whether any are absent.
    missing = False
    for variable, d in doc.items():
        if 'N.levels' not in d:
            missing = True
        elif d['N.levels'] != nlevels[module, variable]:
            raise ValueError(f'{m}: {variable}: N.levels {d["N.levels"]!r}'
                             f' != {nlevels[module, variable]!r} in overview')

    if missing:  # use regex to preserve yaml formatting, comments, etc.
        text = m.read_text(encoding=ENCODING)
        text = insert_nlevels(text, lambda v: nlevels[module, v])
        m.write_text(text, encoding=ENCODING)

        # Re-parse the rewritten file to confirm that it is still loadable
        # and that every variable now carries the expected value.
        with m.open('rb') as f:
            doc = yaml.safe_load(f)
        for variable, d in doc.items():
            if d['N.levels'] != nlevels[module, variable]:
                raise ValueError(f'{m}: {variable}: N.levels'
                                 f' {d["N.levels"]!r} after rewrite !='
                                 f' {nlevels[module, variable]!r} in overview')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment