Skip to content

Instantly share code, notes, and snippets.

@dsoprea
Last active January 13, 2024 09:32
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dsoprea/fd553515aa0abc69c1d72a0ce3c50d71 to your computer and use it in GitHub Desktop.
Save dsoprea/fd553515aa0abc69c1d72a0ce3c50d71 to your computer and use it in GitHub Desktop.
Assert Uniqueness In YAML dictionaries using PyYAML
import yaml
def _child_values(obj):
    """Return the values nested directly inside `obj`, if it is a container.

    Mappings contribute their values and lists/tuples their items; anything
    else (scalars, None, sets of scalars) has no children to descend into.
    """
    if isinstance(obj, dict):
        return obj.values()
    if isinstance(obj, (list, tuple)):
        return obj
    return ()


def _mark_reachable(roots, seen):
    """Add the id() of every dict/list/tuple reachable from `roots` to `seen`.

    `seen` doubles as the visited set, so shared (aliased) and
    self-referencing structures are walked once and cannot loop forever.
    """
    stack = list(roots)
    while stack:
        current = stack.pop()
        if not isinstance(current, (dict, list, tuple)):
            continue
        if id(current) in seen:
            continue
        seen.add(id(current))
        stack.extend(_child_values(current))


def load_and_assert_uniqueness(x):
    """Load the YAML in `x` and fail if a constructed mapping was discarded.

    PyYAML silently lets a later duplicate key replace an earlier one. It
    builds documents depth-first and never hands a child node its parent, so
    the duplicate cannot be spotted while the parent mapping is being built.
    Instead, every mapping is indexed as it is constructed, and afterwards
    the index is compared with what is still reachable from the loaded
    document: a mapping that was built but is no longer reachable must have
    been overwritten by a duplicate key.

    Only mappings are tracked, so a duplicate key is detected only when the
    value that got replaced is (or contains) a mapping. Duplicate keys whose
    replaced value is a plain scalar still load without error.

    Args:
        x: The YAML document, passed unchanged to `yaml.load`.

    Returns:
        The loaded document, exactly as the safe loader built it. No
        bookkeeping keys are added to the returned mappings.

    Raises:
        Exception: If at least one mapping was discarded. The message lists
            the discarded mappings.
    """
    # Every mapping the loader constructs, keyed by id(). Holding the
    # reference here keeps each object alive, so its id() stays unique for
    # the whole check. The loader class has to be defined inline because
    # PyYAML only accepts a class, and it must write into this local index.
    nodes_by_id = {}

    class _UniqueCheckedLoader(yaml.SafeLoader):
        def construct_yaml_map(self, node):
            # Two-step construction, like PyYAML's own map constructor: hand
            # out the empty dict first so aliases can refer to it, then fill
            # it in.
            data = {}
            nodes_by_id[id(data)] = data
            yield data
            data.update(self.construct_mapping(node))

    _UniqueCheckedLoader.add_constructor(
        'tag:yaml.org,2002:map',
        _UniqueCheckedLoader.construct_yaml_map,
    )

    # The loader derives from SafeLoader, so this is a safe load.
    blob = yaml.load(x, Loader=_UniqueCheckedLoader)

    # Find everything still referenced by the final document, descending
    # through sequences as well as mappings.
    reachable = set()
    _mark_reachable([blob], reachable)

    orphans = [
        node for id_, node in nodes_by_id.items() if id_ not in reachable
    ]
    if not orphans:
        return blob

    # Children of a discarded mapping are unreachable too, but they are not
    # duplicates themselves. Report only the top-most discarded mappings.
    # If a self-referencing structure leaves no top-most node, fall back to
    # listing every orphan rather than hiding the error.
    nested = set()
    for orphan in orphans:
        _mark_reachable(_child_values(orphan), nested)

    nodes = [node for node in orphans if id(node) not in nested] or orphans

    raise Exception(
        "({}) nodes were duplicates:\n{}".format(len(nodes), nodes))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment