# tmadlener/podio_schema_proto.py

## podio_schema_proto.py
#!/usr/bin/env python3
"""
Schema definitions for the different parts that can occur in a yaml
datalayout file.
"""

from __future__ import print_function, unicode_literals, absolute_import

# TODO:
# - [x] syntactic checks of format read from yaml file
# - [x] basic checks of types
# - [ ] checks of relations between definitions

import re

from schema import (
  Schema, Optional, Use, And, Hook, SchemaError
)

# Spellings of the builtin c++ types (plus std::string) that are accepted
# verbatim as member types. The multi-word spellings are listed explicitly
# because they contain whitespace. The order is kept as is, since the entries
# are joined into a regex alternation further down in this file.
BUILTIN_TYPES = [
  "int",
  "long",
  "float",
  "double",
  "unsigned int",
  "unsigned",
  "unsigned long",
  "short",
  "bool",
  "long long",
  "unsigned long long",
  "std::string",
]

class MemberVariable(object):
  """Simple class to hold information about a member variable.

  All arguments are passed as keyword arguments:

  - name: name of the member (default '')
  - type: full type of the member (default ''). It is replaced by the
    normalized 'std::array<T, N>' spelling if the member is an array
  - description: documentation string of the member (default '')
  - is_array: if True and array_type / array_size are not both given, they
    are parsed from 'type'
  - array_type: element type of the array (default None)
  - array_size: number of elements of the array (default None)

  Raises a ValueError if 'is_array' is set but 'type' is not a valid
  std::array declaration, or if any unknown keyword argument is passed.
  """
  def __init__(self, **kwargs):
    self.name = kwargs.pop('name', '')
    self.full_type = kwargs.pop('type', '')
    self.description = kwargs.pop('description', '')

    is_array = kwargs.pop('is_array', False)
    self.array_type = kwargs.pop('array_type', None)
    self.array_size = kwargs.pop('array_size', None)
    if is_array and not (self.array_type and self.array_size):
      # Only the flag was given, so the element type and the size have to be
      # recovered from the full type
      array_match = ValidType.array_re.match(self.full_type)
      if not array_match:
        raise ValueError("Trying to construct MemberVariable with 'is_array' but 'type' "
                         "is not a valid array definition")
      self.array_type, self.array_size = array_match.groups()

    # Store a real bool. Without the conversion the attribute would hold
    # whatever the and/or chain evaluates to (e.g. the size string or None)
    self.is_array = bool(is_array or (self.array_type and self.array_size))
    if self.is_array:
      self.full_type = r'std::array<{}, {}>'.format(self.array_type, self.array_size)

    if kwargs:
      # Format a plain sorted list so that the message is the same for python2
      # and python3 (where keys() would be printed as 'dict_keys([...])')
      raise ValueError("Unused kwargs in MemberVariable: {}".format(sorted(kwargs)))


  def __str__(self):
    """Return the c++ declaration of the member, followed by the description
    as a trailing '///<' comment if there is one"""
    definition = r'{} {};'.format(self.full_type, self.name)
    if self.description:
      definition += r' ///< {}'.format(self.description)
    return definition


class ValidName(object):
  """Class that can be used to validate that a given string could potentially be
  used as a type name in c++. Offers a validate method, so that instances can
  be used as validators inside a Schema."""
  # Names can be almost anything as long as it doesn't start with a digit and
  # doesn't contain anything fancy or space
  name_re = re.compile(r'([a-zA-Z_]+\w*)')

  def validate(self, data):
    """Check if string is a valid name

    Returns the name with leading and trailing whitespace removed. Prints a
    message and raises a SchemaError if data is not a string or if it is not
    a valid name.
    """
    # TODO: More stringent tests
    if isinstance(data, str):
      name = data.strip()
      name_match = self.name_re.match(name)
      # re.match only anchors at the start of the string. Also require that
      # the whole name has been consumed, otherwise something like 'foo-bar'
      # passes because of its valid prefix. Empty names and names containing
      # whitespace are rejected by the same check.
      if name_match and name_match.end() == len(name):
        return name

    print('{} is not a valid key'.format(data))
    raise SchemaError('{n} is not a valid name for a type'.format(n=data))


class ValidType(object):
  """Class that can be used to validate that a given string is a valid c++ type.
  Given the complexity of that decision we don't even try to get all
  possibilities, but simply try to get a subset of everything that is possible
  in c++ and simply declare the rest invalid.
  """
  # Doing this with regex is non-ideal, but we should be able to at least
  # enforce something that will yield valid c++ identifiers even if we might not
  # cover all possibilities that are admitted by the c++ standard

  # A type can either start with a double colon, or a character (types starting
  # with _ are technically allowed, but partially reserved for compilers)
  # Additionally we have to take into account the possible whitespaces in the
  # builtin types above. Currently this is done by simple brute-forcing
  type_str = r'((?:\:{{2}})?[a-zA-Z]+[a-zA-Z0-9:_]*|{builtin_re})'.format(
      builtin_re=r'|'.join((r'(?:{})'.format(t)) for t in BUILTIN_TYPES))

  type_re = re.compile(type_str)

  # std::array declaration with some whitespace distribution freedom
  array_re = re.compile(r' *std::array *< *{typ} *, *([0-9]+) *>'.format(typ=type_str))

  # Variants of the two expressions above that have to match up to the end of
  # the string. The unanchored ones accept any string with a valid prefix,
  # e.g. 'std::array<int, 4>' matches type_re as 'std::array' and
  # 'unsigned int' matches it as 'unsigned'. With the anchor the regex engine
  # backtracks into the builtin alternatives until the whole string is covered.
  _full_type_re = re.compile(type_str + r'\Z')
  _full_array_re = re.compile(array_re.pattern + r'\Z')

  def __init__(self, allow_arr=True):
    """Create a validator. If allow_arr is False, std::array declarations are
    rejected by validate"""
    self.allow_arr = allow_arr


  def validate(self, data):
    """Check if the passed data (str) could be a valid type

    Returns a MemberVariable (without name and description) that has either
    the type or the array_type and array_size set. Raises a SchemaError if
    data is not a string, or if it is neither a valid type nor (if arrays are
    allowed) a valid std::array declaration.
    """
    if not isinstance(data, str):
      raise SchemaError('{} has to be a string in order to be a valid type'.format(data))

    stripped = data.strip()

    type_match = self._full_type_re.match(stripped)
    if type_match:
      return MemberVariable(type=type_match.group(1))

    if self.allow_arr:
      array_match = self._full_array_re.match(stripped)
      if array_match:
        return MemberVariable(array_type=array_match.group(1), array_size=array_match.group(2))

    raise SchemaError('{} is not a valid type name in this context'.format(data))


class ValidMember(object):
  """Class that can be used to validate a complete member definition of the form

    <type> <name> // <description>

  where <type> is either a simple type or a std::array declaration. Offers a
  validate method, so that instances can be used as validators inside a Schema.
  """
  # Comments can be anything after //
  # stripping of trailing whitespaces is done later as it is hard to do with regex
  comment_str = r'\/\/ *(.*)'
  type_or_array_str = r'(?:{t}|{a})'.format(t=ValidType.type_str, a=ValidType.array_re.pattern)

  # Capturing groups of the resulting expression:
  #   1: simple type, 2: array element type, 3: array size, 4: name, 5: comment
  # Either group 1 or groups 2 and 3 are filled, depending on which of the two
  # type alternatives matched
  member_re = re.compile(' *{type} +{name} *{comment}'.format(
      type=type_or_array_str, name=ValidName.name_re.pattern, comment=comment_str
  ))

  def __init__(self, allow_arr=True):
    """Create a validator. If allow_arr is False, members with a std::array
    type are rejected by validate"""
    self.allow_arr = allow_arr

  def validate(self, data):
    """Validate the definition and return a MemberVariable

    Raises a SchemaError if data cannot be matched at all (including the case
    where it is not a string), if it does not define a valid member, or if it
    defines an array member although arrays are not allowed.
    """
    try:
      member_match = self.member_re.match(data)
    except TypeError:
      # Something that is not a string (e.g. a number or a mapping in the yaml
      # list). Report it like every other validation failure instead of
      # leaking the TypeError of the re module
      raise SchemaError('{} does not define a valid member of a datatype'.format(data))

    if not member_match:
      raise SchemaError('{} does not define a valid member of a datatype'.format(data))

    simple_type, array_type, array_size, name, description = member_match.groups()
    description = description.strip()

    # Depending on whether group 1 or groups 2 and 3 are present, it is a
    # simple type or an array
    if simple_type:
      return MemberVariable(type=simple_type, name=name, description=description)

    if not self.allow_arr:
      raise SchemaError('{} defines an array type which is not allowd in this context'.format(data))

    return MemberVariable(array_type=array_type, array_size=array_size,
                          name=name, description=description)


# Layout of a component definition: every key has to pass ValidName and its
# value has to pass ValidType. The only other entry is the optional
# 'ExtraCode', which holds a 'declaration' string.
COMPONENT_SCHEMA = Schema(
  {
    ValidName(): ValidType(),
    Optional('ExtraCode'): {
      'declaration': str,
    },
  }
)

# Layout of a datatype definition. Keys that are not wrapped in Optional have
# to be present.
DATATYPE_SCHEMA = Schema(
  {
    'Description': str,
    'Author': str,
    # The members are the only place where std::array types are accepted
    'Members': [ValidMember()],
    # Code snippets, every single one of them can be left out
    Optional('ExtraCode'): {
      Optional('declaration'): str,
      Optional('implementation'): str,
      Optional('const_declaration'): str,
      Optional('const_implementation'): str,
      Optional('includes'): str,
    },
    # Relations and vector members use the member syntax without arrays
    Optional('OneToOneRelations'): [ValidMember(allow_arr=False)],
    Optional('OneToManyRelations'): [ValidMember(allow_arr=False)],
    Optional('VectorMembers'): [ValidMember(allow_arr=False)],
    # TODO (are they in use anywhere?):
    # ConstExtraCode
    # TransientMembers
    # Typedefs
  }
)


if __name__ == '__main__':
  # Small demonstration that is only run when the file is executed as a
  # script: validate one example component and one example datatype and print
  # the validated results, then show what a validation failure looks like.

  # As read in from yaml file
  valid_component = {
    'x': 'int',
    'y': 'int',
    'z': 'int',
    'p': 'std::array<int, 4>',
     'ExtraCode': {'declaration': ' SimpleStruct() : x(0),y(0),z(0) {} SimpleStruct( const int* v) : x(v[0]),y(v[1]),z(v[2]) {} '}
  }

  # As read in from yaml file
  valid_datatype = {
    'Description': 'Reconstructed Particle',
    'Author': 'F.Gaede, DESY',
    'Members': ['int                    type           //type of reconstructed particle. Check/set collection parameters ReconstructedParticleTypeNames and ReconstructedParticleTypeValues.',
                'float                  energy         // [GeV] energy of the reconstructed particle.',
                'edm4hep::Vector3f      momentum       // [GeV] particle momentum',
                'edm4hep::Vector3f      referencePoint // [mm] reference, i.e. where the particle has been measured',
                'float                  charge         //charge of the reconstructed particle.',
                'float                  mass           // [GeV] mass of the reconstructed particle, set independently from four vector',
                'float                  goodnessOfPID  //overall goodness of the PID on a scale of [0;1]',
                'std::array<float,10>   covMatrix      //cvariance matrix of the reconstructed particle 4vector (10 parameters). Stored as lower triangle matrix of the four momentum (px,py,pz,E), i.e. cov(px,px), cov(py,##'],
    'OneToOneRelations': ['edm4hep::Vertex          startVertex    //start vertex associated to this particle',
                          'edm4hep::ParticleID      particleIDUsed //particle Id used for the kinematics of this particle'],
    'OneToManyRelations': ['edm4hep::Cluster               clusters     //clusters that have been used for this particle.',
                           'edm4hep::Track                 tracks       //tracks that have been used for this particle.',
                           'edm4hep::ReconstructedParticle particles    //reconstructed particles that have been combined to this particle.',
                           'edm4hep::ParticleID            particleIDs  //particle Ids (not sorted by their likelihood)'],
    'ExtraCode': {'declaration': ' bool isCompound() { return particles_size() > 0 ;}\n //vertex where the particle decays This method actually returns the start vertex from the first daughter particle found.\n //TODO: edm4hep::Vertex  getEndVertex() { return  edm4hep::Vertex(  (getParticles(0).isAvailable() ? getParticles(0).getStartVertex() :  edm4hep::Vertex(0,0) ) ) ; }\n '}}


  # validate returns the validated data, in which the values have been
  # replaced by whatever the validators return
  print(COMPONENT_SCHEMA.validate(valid_component))

  print(DATATYPE_SCHEMA.validate(valid_datatype))

  # inserting an invalid key will lead to a failure with an almost readable error message
  valid_component['invalid name'] = 'this does not matter any longer'
  # NOTE: this call is expected to raise; the exception is deliberately not
  # caught, so the script ends with the traceback
  COMPONENT_SCHEMA.validate(valid_component)
	#!/usr/bin/env python3
	"""
	Schema definitions for the different parts that can occur in a yaml
	datalayout file.
	"""

	from __future__ import print_function, unicode_literals, absolute_import

	# TODO:
	# - [x] syntactic checks of format read from yaml file
	# - [x] basic checks of types
	# - [ ] checks of relations between definitions

	import re

	from schema import (
	Schema, Optional, Use, And, Hook, SchemaError
	)

	# Spellings of the builtin c++ types (plus std::string) that are accepted
	# verbatim as member types. The multi-word spellings are listed explicitly
	# because they contain whitespace. The order is kept as is, since the entries
	# are joined into a regex alternation further down in this file.
	BUILTIN_TYPES = [
	  "int",
	  "long",
	  "float",
	  "double",
	  "unsigned int",
	  "unsigned",
	  "unsigned long",
	  "short",
	  "bool",
	  "long long",
	  "unsigned long long",
	  "std::string",
	]

	class MemberVariable(object):
	  """Simple class to hold information about a member variable.

	  All arguments are passed as keyword arguments:

	  - name: name of the member (default '')
	  - type: full type of the member (default ''). It is replaced by the
	    normalized 'std::array<T, N>' spelling if the member is an array
	  - description: documentation string of the member (default '')
	  - is_array: if True and array_type / array_size are not both given, they
	    are parsed from 'type'
	  - array_type: element type of the array (default None)
	  - array_size: number of elements of the array (default None)

	  Raises a ValueError if 'is_array' is set but 'type' is not a valid
	  std::array declaration, or if any unknown keyword argument is passed.
	  """
	  def __init__(self, **kwargs):
	    self.name = kwargs.pop('name', '')
	    self.full_type = kwargs.pop('type', '')
	    self.description = kwargs.pop('description', '')

	    is_array = kwargs.pop('is_array', False)
	    self.array_type = kwargs.pop('array_type', None)
	    self.array_size = kwargs.pop('array_size', None)
	    if is_array and not (self.array_type and self.array_size):
	      # Only the flag was given, so the element type and the size have to be
	      # recovered from the full type
	      array_match = ValidType.array_re.match(self.full_type)
	      if not array_match:
	        raise ValueError("Trying to construct MemberVariable with 'is_array' but 'type' "
	                         "is not a valid array definition")
	      self.array_type, self.array_size = array_match.groups()

	    # Store a real bool. Without the conversion the attribute would hold
	    # whatever the and/or chain evaluates to (e.g. the size string or None)
	    self.is_array = bool(is_array or (self.array_type and self.array_size))
	    if self.is_array:
	      self.full_type = r'std::array<{}, {}>'.format(self.array_type, self.array_size)

	    if kwargs:
	      # Format a plain sorted list so that the message is the same for python2
	      # and python3 (where keys() would be printed as 'dict_keys([...])')
	      raise ValueError("Unused kwargs in MemberVariable: {}".format(sorted(kwargs)))


	  def __str__(self):
	    """Return the c++ declaration of the member, followed by the description
	    as a trailing '///<' comment if there is one"""
	    definition = r'{} {};'.format(self.full_type, self.name)
	    if self.description:
	      definition += r' ///< {}'.format(self.description)
	    return definition


	class ValidName(object):
	  """Class that can be used to validate that a given string could potentially be
	  used as a type name in c++. Offers a validate method, so that instances can
	  be used as validators inside a Schema."""
	  # Names can be almost anything as long as it doesn't start with a digit and
	  # doesn't contain anything fancy or space
	  name_re = re.compile(r'([a-zA-Z_]+\w*)')

	  def validate(self, data):
	    """Check if string is a valid name

	    Returns the name with leading and trailing whitespace removed. Prints a
	    message and raises a SchemaError if data is not a string or if it is not
	    a valid name.
	    """
	    # TODO: More stringent tests
	    if isinstance(data, str):
	      name = data.strip()
	      name_match = self.name_re.match(name)
	      # re.match only anchors at the start of the string. Also require that
	      # the whole name has been consumed, otherwise something like 'foo-bar'
	      # passes because of its valid prefix. Empty names and names containing
	      # whitespace are rejected by the same check.
	      if name_match and name_match.end() == len(name):
	        return name

	    print('{} is not a valid key'.format(data))
	    raise SchemaError('{n} is not a valid name for a type'.format(n=data))


	class ValidType(object):
	  """Class that can be used to validate that a given string is a valid c++ type.
	  Given the complexity of that decision we don't even try to get all
	  possibilities, but simply try to get a subset of everything that is possible
	  in c++ and simply declare the rest invalid.
	  """
	  # Doing this with regex is non-ideal, but we should be able to at least
	  # enforce something that will yield valid c++ identifiers even if we might not
	  # cover all possibilities that are admitted by the c++ standard

	  # A type can either start with a double colon, or a character (types starting
	  # with _ are technically allowed, but partially reserved for compilers)
	  # Additionally we have to take into account the possible whitespaces in the
	  # builtin types above. Currently this is done by simple brute-forcing
	  # NOTE: the alternatives have to be separated by an unescaped '|', an
	  # escaped one only matches a literal pipe character
	  type_str = r'((?:\:{{2}})?[a-zA-Z]+[a-zA-Z0-9:_]*|{builtin_re})'.format(
	      builtin_re=r'|'.join((r'(?:{})'.format(t)) for t in BUILTIN_TYPES))

	  type_re = re.compile(type_str)

	  # std::array declaration with some whitespace distribution freedom (every
	  # blank in the expression is optional and may be repeated)
	  array_re = re.compile(r' *std::array *< *{typ} *, *([0-9]+) *>'.format(typ=type_str))

	  # Variants of the two expressions above that have to match up to the end of
	  # the string. The unanchored ones accept any string with a valid prefix,
	  # e.g. 'std::array<int, 4>' matches type_re as 'std::array' and
	  # 'unsigned int' matches it as 'unsigned'. With the anchor the regex engine
	  # backtracks into the builtin alternatives until the whole string is covered.
	  _full_type_re = re.compile(type_str + r'\Z')
	  _full_array_re = re.compile(array_re.pattern + r'\Z')

	  def __init__(self, allow_arr=True):
	    """Create a validator. If allow_arr is False, std::array declarations are
	    rejected by validate"""
	    self.allow_arr = allow_arr


	  def validate(self, data):
	    """Check if the passed data (str) could be a valid type

	    Returns a MemberVariable (without name and description) that has either
	    the type or the array_type and array_size set. Raises a SchemaError if
	    data is not a string, or if it is neither a valid type nor (if arrays are
	    allowed) a valid std::array declaration.
	    """
	    if not isinstance(data, str):
	      raise SchemaError('{} has to be a string in order to be a valid type'.format(data))

	    stripped = data.strip()

	    type_match = self._full_type_re.match(stripped)
	    if type_match:
	      return MemberVariable(type=type_match.group(1))

	    if self.allow_arr:
	      array_match = self._full_array_re.match(stripped)
	      if array_match:
	        return MemberVariable(array_type=array_match.group(1), array_size=array_match.group(2))

	    raise SchemaError('{} is not a valid type name in this context'.format(data))


	class ValidMember(object):
	  """Class that can be used to validate a complete member definition of the form

	    <type> <name> // <description>

	  where <type> is either a simple type or a std::array declaration. Offers a
	  validate method, so that instances can be used as validators inside a Schema.
	  """
	  # Comments can be anything after //
	  # stripping of trailing whitespaces is done later as it is hard to do with regex
	  comment_str = r'\/\/ *(.*)'
	  type_or_array_str = r'(?:{t}|{a})'.format(t=ValidType.type_str, a=ValidType.array_re.pattern)

	  # Capturing groups of the resulting expression:
	  #   1: simple type, 2: array element type, 3: array size, 4: name, 5: comment
	  # Either group 1 or groups 2 and 3 are filled, depending on which of the two
	  # type alternatives matched
	  member_re = re.compile(' *{type} +{name} *{comment}'.format(
	      type=type_or_array_str, name=ValidName.name_re.pattern, comment=comment_str
	  ))

	  def __init__(self, allow_arr=True):
	    """Create a validator. If allow_arr is False, members with a std::array
	    type are rejected by validate"""
	    self.allow_arr = allow_arr

	  def validate(self, data):
	    """Validate the definition and return a MemberVariable

	    Raises a SchemaError if data cannot be matched at all (including the case
	    where it is not a string), if it does not define a valid member, or if it
	    defines an array member although arrays are not allowed.
	    """
	    try:
	      member_match = self.member_re.match(data)
	    except TypeError:
	      # Something that is not a string (e.g. a number or a mapping in the yaml
	      # list). Report it like every other validation failure instead of
	      # leaking the TypeError of the re module
	      raise SchemaError('{} does not define a valid member of a datatype'.format(data))

	    if not member_match:
	      raise SchemaError('{} does not define a valid member of a datatype'.format(data))

	    simple_type, array_type, array_size, name, description = member_match.groups()
	    description = description.strip()

	    # Depending on whether group 1 or groups 2 and 3 are present, it is a
	    # simple type or an array
	    if simple_type:
	      return MemberVariable(type=simple_type, name=name, description=description)

	    if not self.allow_arr:
	      raise SchemaError('{} defines an array type which is not allowd in this context'.format(data))

	    return MemberVariable(array_type=array_type, array_size=array_size,
	                          name=name, description=description)


	# Layout of a component definition: every key has to pass ValidName and its
	# value has to pass ValidType. The only other entry is the optional
	# 'ExtraCode', which holds a 'declaration' string.
	COMPONENT_SCHEMA = Schema(
	  {
	    ValidName(): ValidType(),
	    Optional('ExtraCode'): {
	      'declaration': str,
	    },
	  }
	)

	# Layout of a datatype definition. Keys that are not wrapped in Optional have
	# to be present.
	DATATYPE_SCHEMA = Schema(
	  {
	    'Description': str,
	    'Author': str,
	    # The members are the only place where std::array types are accepted
	    'Members': [ValidMember()],
	    # Code snippets, every single one of them can be left out
	    Optional('ExtraCode'): {
	      Optional('declaration'): str,
	      Optional('implementation'): str,
	      Optional('const_declaration'): str,
	      Optional('const_implementation'): str,
	      Optional('includes'): str,
	    },
	    # Relations and vector members use the member syntax without arrays
	    Optional('OneToOneRelations'): [ValidMember(allow_arr=False)],
	    Optional('OneToManyRelations'): [ValidMember(allow_arr=False)],
	    Optional('VectorMembers'): [ValidMember(allow_arr=False)],
	    # TODO (are they in use anywhere?):
	    # ConstExtraCode
	    # TransientMembers
	    # Typedefs
	  }
	)




	if __name__ == '__main__':
	  # Small demonstration that is only run when the file is executed as a
	  # script: validate one example component and one example datatype and print
	  # the validated results, then show what a validation failure looks like.

	  # As read in from yaml file
	  valid_component = {
	    'x': 'int',
	    'y': 'int',
	    'z': 'int',
	    'p': 'std::array<int, 4>',
	    'ExtraCode': {'declaration': ' SimpleStruct() : x(0),y(0),z(0) {} SimpleStruct( const int* v) : x(v[0]),y(v[1]),z(v[2]) {} '}
	  }

	  # As read in from yaml file
	  valid_datatype = {
	    'Description': 'Reconstructed Particle',
	    'Author': 'F.Gaede, DESY',
	    'Members': ['int type //type of reconstructed particle. Check/set collection parameters ReconstructedParticleTypeNames and ReconstructedParticleTypeValues.',
	                'float energy // [GeV] energy of the reconstructed particle.',
	                'edm4hep::Vector3f momentum // [GeV] particle momentum',
	                'edm4hep::Vector3f referencePoint // [mm] reference, i.e. where the particle has been measured',
	                'float charge //charge of the reconstructed particle.',
	                'float mass // [GeV] mass of the reconstructed particle, set independently from four vector',
	                'float goodnessOfPID //overall goodness of the PID on a scale of [0;1]',
	                'std::array<float,10> covMatrix //cvariance matrix of the reconstructed particle 4vector (10 parameters). Stored as lower triangle matrix of the four momentum (px,py,pz,E), i.e. cov(px,px), cov(py,##'],
	    'OneToOneRelations': ['edm4hep::Vertex startVertex //start vertex associated to this particle',
	                          'edm4hep::ParticleID particleIDUsed //particle Id used for the kinematics of this particle'],
	    'OneToManyRelations': ['edm4hep::Cluster clusters //clusters that have been used for this particle.',
	                           'edm4hep::Track tracks //tracks that have been used for this particle.',
	                           'edm4hep::ReconstructedParticle particles //reconstructed particles that have been combined to this particle.',
	                           'edm4hep::ParticleID particleIDs //particle Ids (not sorted by their likelihood)'],
	    'ExtraCode': {'declaration': ' bool isCompound() { return particles_size() > 0 ;}\n //vertex where the particle decays This method actually returns the start vertex from the first daughter particle found.\n //TODO: edm4hep::Vertex getEndVertex() { return edm4hep::Vertex( (getParticles(0).isAvailable() ? getParticles(0).getStartVertex() : edm4hep::Vertex(0,0) ) ) ; }\n '}}


	  # validate returns the validated data, in which the values have been
	  # replaced by whatever the validators return
	  print(COMPONENT_SCHEMA.validate(valid_component))

	  print(DATATYPE_SCHEMA.validate(valid_datatype))

	  # inserting an invalid key will lead to a failure with an almost readable error message
	  valid_component['invalid name'] = 'this does not matter any longer'
	  # NOTE: this call is expected to raise; the exception is deliberately not
	  # caught, so the script ends with the traceback
	  COMPONENT_SCHEMA.validate(valid_component)