Skip to content

Instantly share code, notes, and snippets.

@tmadlener
Created July 9, 2020 16:33
Show Gist options
  • Save tmadlener/6f97cc77aabbe7a6b821b6bd74344c27 to your computer and use it in GitHub Desktop.
Save tmadlener/6f97cc77aabbe7a6b821b6bd74344c27 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Schema definitions for the different parts that can occur in a yaml
datalayout file.
"""
from __future__ import print_function, unicode_literals, absolute_import
# TODO:
# - [x] syntactic checks of format read from yaml file
# - [x] basic checks of types
# - [ ] checks of relations between definitions
import re
from schema import (
Schema, Optional, Use, And, Hook, SchemaError
)
# C++ builtin type spellings (plus std::string) that are accepted verbatim,
# including the ones that contain whitespace. The order is kept as is because
# the list is joined into a regex alternation further down.
BUILTIN_TYPES = [
    "int",
    "long",
    "float",
    "double",
    "unsigned int",
    "unsigned",
    "unsigned long",
    "short",
    "bool",
    "long long",
    "unsigned long long",
    "std::string",
]
class MemberVariable(object):
    """Simple class to hold information about a member variable.

    Attributes:
        name: name of the member ('' if not given).
        full_type: type as it is written in the generated definition. For
            arrays this is always normalized to ``std::array<T, N>``.
        description: free text that is appended as a ``///<`` comment.
        is_array: True if the member is a std::array, False otherwise.
        array_type: element type of the array (None for non-arrays).
        array_size: number of elements of the array (None for non-arrays).
    """

    def __init__(self, **kwargs):
        """Construct the member from keyword arguments.

        Accepted keywords are 'name', 'type', 'description', 'is_array',
        'array_type' and 'array_size'. If 'is_array' is set without both
        'array_type' and 'array_size', the two are parsed from 'type'.

        Raises:
            ValueError: if 'is_array' is set but 'type' is not a std::array
                definition, or if unknown keyword arguments are passed.
        """
        self.name = kwargs.pop('name', '')
        self.full_type = kwargs.pop('type', '')
        self.description = kwargs.pop('description', '')
        is_array = kwargs.pop('is_array', False)
        self.array_type = kwargs.pop('array_type', None)
        self.array_size = kwargs.pop('array_size', None)

        if is_array and not (self.array_type and self.array_size):
            # Array-ness was requested but the details are missing, so they
            # have to come from the type string itself
            array_match = ValidType.array_re.match(self.full_type)
            if array_match:
                self.array_type, self.array_size = array_match.groups()
            else:
                raise ValueError("Trying to construct MemberVariable with 'is_array' but 'type' "
                                 "is not a valid array definition")

        # Force a real bool. The bare and/or expression would otherwise leak
        # None or the array size string into the attribute
        self.is_array = bool(is_array or (self.array_type and self.array_size))
        if self.is_array:
            self.full_type = r'std::array<{}, {}>'.format(self.array_type, self.array_size)

        if kwargs:
            # list(...) so that the message shows the plain key names instead
            # of a dict view representation
            raise ValueError("Unused kwargs in MemberVariable: {}".format(list(kwargs.keys())))

    def __str__(self):
        """Return the definition as ``<type> <name>;`` followed by the
        description as a ``///<`` comment if there is one."""
        definition = r'{} {};'.format(self.full_type, self.name)
        if self.description:
            definition += r' ///< {}'.format(self.description)
        return definition
class ValidName(object):
    """Class that can be used to validate that a given string could potentially be
    used as a type name in c++"""
    # Names can be almost anything as long as it doesn't start with a digit and
    # doesn't contain anything fancy or space.
    # NOTE: the pattern is deliberately left unanchored because it is also
    # embedded into larger regular expressions via name_re.pattern
    name_re = re.compile(r'([a-zA-Z_]+\w*)')

    def validate(self, data):
        """Check if data is a valid name and return it with surrounding
        whitespace removed.

        The whole (stripped) string has to be matched by name_re. A match of
        only a leading part, as for 'foo-bar', is not enough.

        Raises:
            SchemaError: if data is not a string or not a valid name.
        """
        # TODO: More stringent tests
        if isinstance(data, str):
            name = data.strip()
            name_match = self.name_re.match(name)
            # re.match only anchors at the start, so make sure that nothing is
            # left over after the matched part
            if name_match and name_match.end() == len(name):
                return name

        print('{} is not a valid key'.format(data))
        raise SchemaError('{n} is not a valid name for a type'.format(n=data))
class ValidType(object):
    """Class that can be used to validate that a given string is a valid c++ type.

    Given the complexity of that decision we don't even try to get all
    possibilities, but simply try to get a subset of everything that is possible
    in c++ and simply declare the rest invalid.
    """
    # Doing this with regex is non-ideal, but we should be able to at least
    # enforce something that will yield valid c++ identifiers even if we might not
    # cover all possibilities that are admitted by the c++ standard

    # A type can either start with a double colon, or a character (types starting
    # with _ are technically allowed, but partially reserved for compilers)
    # Additionally we have to take into account the possible whitespaces in the
    # builtin types above. Currently this is done by simple brute-forcing
    type_str = r'((?:\:{{2}})?[a-zA-Z]+[a-zA-Z0-9:_]*|{builtin_re})'.format(
        builtin_re=r'|'.join((r'(?:{})'.format(t)) for t in BUILTIN_TYPES))
    type_re = re.compile(type_str)

    # std::array declaration with some whitespace distribution freedom
    array_re = re.compile(r' *std::array *< *{typ} *, *([0-9]+) *>'.format(typ=type_str))

    # Anchored variants of the two patterns above, used by validate. The
    # unanchored ones are kept unchanged because they are reused as building
    # blocks for other expressions. Anchoring inside the regex (instead of
    # comparing match lengths afterwards) lets the engine fall back to the
    # multi-word builtin alternatives, e.g. 'unsigned long', when the first
    # alternative only covers 'unsigned'.
    _full_type_re = re.compile(type_str + r'\Z')
    _full_array_re = re.compile(array_re.pattern + r'\Z')

    def __init__(self, allow_arr=True):
        """Set up the validator.

        Args:
            allow_arr: whether std::array definitions are accepted by validate.
        """
        self.allow_arr = allow_arr

    def validate(self, data):
        """Check if the passed data (str) could be a valid type.

        The complete string (ignoring surrounding whitespace) has to be either
        a plain type or, if allow_arr is set, a std::array definition.

        Returns:
            A MemberVariable that carries only the type information.

        Raises:
            SchemaError: if data is not a string or not a valid type.
        """
        if not isinstance(data, str):
            raise SchemaError('{} has to be a string in order to be a valid type'.format(data))

        candidate = data.strip()

        type_match = self._full_type_re.match(candidate)
        if type_match:
            return MemberVariable(type=type_match.group(1))

        if self.allow_arr:
            array_match = self._full_array_re.match(candidate)
            if array_match:
                return MemberVariable(array_type=array_match.group(1), array_size=array_match.group(2))

        raise SchemaError('{} is not a valid type name in this context'.format(data))
class ValidMember(object):
    """Class that can be used to validate a complete member definition.

    A definition is a string of the form ``<type> <name> // <description>``,
    where ``<type>`` is either a plain type or a std::array. The ``//`` part is
    required by the expression below; its text becomes the description of the
    returned MemberVariable.
    """
    # Comments can be anything after //
    # stripping of trailing whitespaces is done later as it is hard to do with regex
    comment_str = r'\/\/ *(.*)'

    type_or_array_str = r'(?:{t}|{a})'.format(t=ValidType.type_str, a=ValidType.array_re.pattern)
    # Capturing groups of the combined expression:
    #   1: plain type, 2: array element type, 3: array size, 4: name, 5: comment
    member_re = re.compile(' *{type} +{name} *{comment}'.format(
        type=type_or_array_str, name=ValidName.name_re.pattern, comment=comment_str
    ))

    def __init__(self, allow_arr=True):
        """Set up the validator.

        Args:
            allow_arr: whether members with a std::array type are accepted.
        """
        self.allow_arr = allow_arr

    def validate(self, data):
        """Validate the definition and return a MemberVariable

        Raises:
            SchemaError: if data is not a string, if it does not match the
                member definition format, or if it defines an array although
                allow_arr is not set.
        """
        try:
            member_match = self.member_re.match(data)
        except TypeError:
            # Anything that is not a string cannot be matched. Report it as a
            # validation failure instead of letting the TypeError escape
            raise SchemaError('{} does not define a valid member of a datatype'.format(data))

        if not member_match:
            raise SchemaError('{} does not define a valid member of a datatype'.format(data))

        plain_type, array_type, array_size, name, description = member_match.groups()
        description = description.strip()

        # Depending on whether group 1 or groups 2 and 3 are present, it is a
        # simple type or an array
        if plain_type:
            return MemberVariable(type=plain_type, name=name, description=description)

        if not self.allow_arr:
            raise SchemaError('{} defines an array type which is not allowd in this context'.format(data))

        return MemberVariable(array_type=array_type, array_size=array_size,
                              name=name, description=description)
# Layout of a component: any number of "name: type" entries plus an optional
# ExtraCode section that has to consist of exactly one declaration string.
COMPONENT_SCHEMA = Schema(
    {
        ValidName(): ValidType(),
        Optional('ExtraCode'): {
            'declaration': str,
        },
    }
)
# Layout of a datatype: Description, Author and Members are mandatory, all the
# other sections may be left out.
DATATYPE_SCHEMA = Schema(
    {
        'Description': str,
        'Author': str,
        'Members': [ValidMember()],
        # Every entry of the ExtraCode section is optional and has to be a string
        Optional('ExtraCode'): {
            Optional(section): str
            for section in (
                'declaration',
                'implementation',
                'const_declaration',
                'const_implementation',
                'includes',
            )
        },
        # Arrays are not accepted in relations and vector members
        Optional('OneToOneRelations'): [ValidMember(allow_arr=False)],
        Optional('OneToManyRelations'): [ValidMember(allow_arr=False)],
        Optional('VectorMembers'): [ValidMember(allow_arr=False)],
        # TODO (are they in use anywhere?):
        # ConstExtraCode
        # TransientMembers
        # Typedefs
    }
)
if __name__ == '__main__':
    # Small self-check: validate one example component and one example
    # datatype, both in the form in which they are read in from a yaml file.
    component_example = {
        'x': 'int',
        'y': 'int',
        'z': 'int',
        'p': 'std::array<int, 4>',
        'ExtraCode': {
            'declaration': ' SimpleStruct() : x(0),y(0),z(0) {} SimpleStruct( const int* v) : x(v[0]),y(v[1]),z(v[2]) {} ',
        },
    }

    member_lines = [
        'int type //type of reconstructed particle. Check/set collection parameters ReconstructedParticleTypeNames and ReconstructedParticleTypeValues.',
        'float energy // [GeV] energy of the reconstructed particle.',
        'edm4hep::Vector3f momentum // [GeV] particle momentum',
        'edm4hep::Vector3f referencePoint // [mm] reference, i.e. where the particle has been measured',
        'float charge //charge of the reconstructed particle.',
        'float mass // [GeV] mass of the reconstructed particle, set independently from four vector',
        'float goodnessOfPID //overall goodness of the PID on a scale of [0;1]',
        'std::array<float,10> covMatrix //cvariance matrix of the reconstructed particle 4vector (10 parameters). Stored as lower triangle matrix of the four momentum (px,py,pz,E), i.e. cov(px,px), cov(py,##',
    ]
    one_to_one_lines = [
        'edm4hep::Vertex startVertex //start vertex associated to this particle',
        'edm4hep::ParticleID particleIDUsed //particle Id used for the kinematics of this particle',
    ]
    one_to_many_lines = [
        'edm4hep::Cluster clusters //clusters that have been used for this particle.',
        'edm4hep::Track tracks //tracks that have been used for this particle.',
        'edm4hep::ReconstructedParticle particles //reconstructed particles that have been combined to this particle.',
        'edm4hep::ParticleID particleIDs //particle Ids (not sorted by their likelihood)',
    ]
    extra_declaration = ' bool isCompound() { return particles_size() > 0 ;}\n //vertex where the particle decays This method actually returns the start vertex from the first daughter particle found.\n //TODO: edm4hep::Vertex getEndVertex() { return edm4hep::Vertex( (getParticles(0).isAvailable() ? getParticles(0).getStartVertex() : edm4hep::Vertex(0,0) ) ) ; }\n '

    datatype_example = {
        'Description': 'Reconstructed Particle',
        'Author': 'F.Gaede, DESY',
        'Members': member_lines,
        'OneToOneRelations': one_to_one_lines,
        'OneToManyRelations': one_to_many_lines,
        'ExtraCode': {'declaration': extra_declaration},
    }

    checked_component = COMPONENT_SCHEMA.validate(component_example)
    print(checked_component)
    checked_datatype = DATATYPE_SCHEMA.validate(datatype_example)
    print(checked_datatype)

    # Deliberately add a key that is not a valid name: the validation below is
    # expected to fail with an almost readable error message
    component_example.update({'invalid name': 'this does not matter any longer'})
    COMPONENT_SCHEMA.validate(component_example)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment