Last active
January 13, 2024 09:32
-
-
Save dsoprea/fd553515aa0abc69c1d72a0ce3c50d71 to your computer and use it in GitHub Desktop.
Assert Uniqueness In YAML dictionaries using PyYAML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import yaml | |
def load_and_assert_uniqueness(x):
    """Load YAML from *x* with ``yaml.SafeLoader`` semantics, raising if any
    mapping-valued entry was silently dropped because of a duplicate key.

    PyYAML quietly keeps only the last value when a mapping repeats a key.
    We detect the dangerous common case -- duplicate keys whose values are
    themselves mappings -- by indexing every mapping (by object identity) as
    it is constructed, then checking which ones are unreachable from the
    final document: those were clobbered by a later duplicate.

    :param x: YAML text or stream, anything accepted by ``yaml.load()``.
    :returns: The loaded document.
    :raises ValueError: If one or more mapping nodes were orphaned by
        duplicate keys. The orphaned nodes are included in the message so
        the caller can search them out in the input.
    """
    # PyYAML constructs depth-first and never exposes the parent while a
    # child is being built, so clobbering can't be detected at construction
    # time. Instead, record every mapping here by object identity and later
    # prune the ones still reachable from the final hierarchy. The loader
    # class is inlined so its constructor can close over this index.
    nodes_by_id = {}

    class _UniqueCheckedLoader(yaml.SafeLoader):
        def construct_yaml_map(self, node):
            data = {}
            nodes_by_id[id(data)] = data
            # Mirror SafeLoader's two-step (generator) construction so that
            # anchored/recursive structures still resolve correctly.
            yield data
            data.update(self.construct_mapping(node))

    _UniqueCheckedLoader.add_constructor(
        'tag:yaml.org,2002:map',
        _UniqueCheckedLoader.construct_yaml_map)

    # Load
    blob = yaml.load(x, Loader=_UniqueCheckedLoader)

    # Walk the final document and drop every reachable mapping from the
    # index. Mappings can be nested under other mappings *or* sequences, so
    # both container types are traversed. The seen-set guards against
    # aliases (the same object reachable twice) and self-referential
    # structures; pop(..., None) tolerates an already-pruned node.
    stack = [blob]
    seen = set()
    while stack:
        current = stack.pop()
        if id(current) in seen:
            continue
        if type(current) is dict:
            seen.add(id(current))
            nodes_by_id.pop(id(current), None)
            stack.extend(current.values())
        elif type(current) is list:
            seen.add(id(current))
            stack.extend(current)

    # We've visited all referenced nodes. Everything still indexed must have
    # been pruned due to non-uniqueness. As mentioned above, we have no
    # hierarchical context, but the caller can search for the attributes of
    # the reported node(s) in the input to find the duplicates.
    if nodes_by_id:
        orphans = list(nodes_by_id.values())
        raise ValueError(
            "({}) nodes were duplicates:\n{}".format(len(orphans), orphans))

    return blob
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment