Skip to content

Instantly share code, notes, and snippets.

@Zsailer
Last active May 11, 2024 15:17
Show Gist options
  • Save Zsailer/6da0dc3c97ec873685b7fe58e52d36d7 to your computer and use it in GitHub Desktop.
Save Zsailer/6da0dc3c97ec873685b7fe58e52d36d7 to your computer and use it in GitHub Desktop.
Define and validate schema.org structured data in Python with Pydantic
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Define and validate schema.org structured data in Python with Pydantic "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pydantic\n",
"from pydantic import BaseModel, Schema\n",
"from pydantic.main import MetaModel\n",
"from schemaorg import main as schemaorg"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"class Thing(BaseModel):\n",
" \"\"\"The most generic type of item.\"\"\"\n",
" \n",
" # Need to define extra items at the top level \n",
" class Config:\n",
" title = 'Thing'\n",
" schema_extra = {\n",
" '$schema': 'https://schema.org',\n",
" '$id': 'https://schema.org/Thing',\n",
" }\n",
" \n",
" additionalType: str = Schema(\n",
" ...,\n",
" title='additionalType',\n",
" description=(\n",
" \"An additional type for the item, typically \"\n",
" \"used for adding more specific types from \"\n",
" \"external vocabularies in microdata syntax. \"\n",
" \"This is a relationship between something and \"\n",
" \"a class that the thing is in. In RDFa syntax, \"\n",
" \"it is better to use the native RDFa syntax - \"\n",
" \"the 'typeof' attribute - for multiple types. \"\n",
" \"Schema.org tools may have only weaker \"\n",
" \"understanding of extra types, in particular \"\n",
" \"those defined externally.\"\n",
" )\n",
" )\n",
" \n",
" alternateName: str = Schema(\n",
" ...,\n",
" title='alternateName',\n",
" description=\"An alias for the item.\"\n",
" )\n",
" \n",
" description: str = Schema(\n",
" ...,\n",
" description=\"A description of the item.\"\n",
" )\n",
" \n",
" disambiguatingDescription: str = Schema(\n",
" ...,\n",
" description=\"A sub property of description. A short description of the item used to disambiguate from other, similar items. Information from other properties (in particular, name) may be necessary for the description to be useful for disambiguation.\"\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I want to enforce the attributes in the `Config` accessor, *and* I think the accessor syntax is a bit ugly. Let's see if we can make it go away."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"\n",
"class MetaSchema(MetaModel):\n",
" \"\"\"Metaclass that checks for three required class attributes:\n",
" 1. _id: the ID of the event\n",
" 2. _version: the version of the current schema.\n",
" 3. _title: the name of the schema.\n",
"\n",
" These attribute are mapped to pydantic.BaseModel's `Config` inner class\n",
" for proper schema generation+validation.\n",
" \"\"\"\n",
" def __new__(cls, name, base, dct):\n",
" # Check that required keys are found.\n",
" if not all((key in dct for key in ['_id', '_title', '_version', '_schema'])):\n",
" raise AttributeError('Required class attributes are missing from the {} class.'.format(name))\n",
"\n",
" # Check that keys are the proper types.\n",
" if not all((\n",
" type(dct['_id']) in (str, type(None)),\n",
" type(dct['_version']) in (float, type(None)),\n",
" type(dct['_title']) in (str, type(None)),\n",
" type(dct['_schema']) in (str, type(None)),\n",
" )):\n",
" raise TypeError('Check the class attributes types: \"_id\" must be a string, '\n",
" '\"_version\" must be an integer, and \"_title\" must be a string.')\n",
"\n",
" # Add a Config inner class to this Pydantic model.\n",
" class Config:\n",
" title = dct['_title']\n",
" schema_extra = {\n",
" '$id': dct['_id'],\n",
" '$schema': dct['_schema'],\n",
" 'version': dct['_version']\n",
" }\n",
"\n",
" dct['Config'] = Config\n",
" return super(MetaSchema, cls).__new__(cls, name, base, dct)\n",
"\n",
"\n",
"class JsonSchema(pydantic.BaseModel, metaclass=MetaSchema):\n",
" \"\"\"A pydantic base Model for JSON schemas.\"\"\"\n",
" _id: str = None\n",
" _version: float = None\n",
" _title: str = None\n",
" _schema: str = None"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"class Thing(JsonSchema):\n",
" \"\"\"The most generic type of item.\"\"\"\n",
" # Define top level attributes\n",
" _id = 'https://schema.org/Thing'\n",
" _version = 3.9\n",
" _title = 'Thing'\n",
" _schema = 'https://schema.org'\n",
"\n",
" additionalType: str = Schema(\n",
" ...,\n",
" title='additionalType',\n",
" description=(\n",
" \"An additional type for the item, typically \"\n",
" \"used for adding more specific types from \"\n",
" \"external vocabularies in microdata syntax. \"\n",
" \"This is a relationship between something and \"\n",
" \"a class that the thing is in. In RDFa syntax, \"\n",
" \"it is better to use the native RDFa syntax - \"\n",
" \"the 'typeof' attribute - for multiple types. \"\n",
" \"Schema.org tools may have only weaker \"\n",
" \"understanding of extra types, in particular \"\n",
" \"those defined externally.\"\n",
" )\n",
" )\n",
" \n",
" alternateName: str = Schema(\n",
" ...,\n",
" title='alternateName',\n",
" description=\"An alias for the item.\"\n",
" )\n",
" \n",
" description: str = Schema(\n",
" ...,\n",
" description=\"A description of the item.\"\n",
" )\n",
" \n",
" disambiguatingDescription: str = Schema(\n",
" ...,\n",
" description=\"A sub property of description. A short description of the item used to disambiguate from other, similar items. Information from other properties (in particular, name) may be necessary for the description to be useful for disambiguation.\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'title': 'Thing',\n",
" 'description': 'The most generic type of item.',\n",
" 'type': 'object',\n",
" 'properties': {'additionalType': {'title': 'additionalType',\n",
" 'description': \"An additional type for the item, typically used for adding more specific types from external vocabularies in microdata syntax. This is a relationship between something and a class that the thing is in. In RDFa syntax, it is better to use the native RDFa syntax - the 'typeof' attribute - for multiple types. Schema.org tools may have only weaker understanding of extra types, in particular those defined externally.\",\n",
" 'type': 'string'},\n",
" 'alternateName': {'title': 'alternateName',\n",
" 'description': 'An alias for the item.',\n",
" 'type': 'string'},\n",
" 'description': {'title': 'Description',\n",
" 'description': 'A description of the item.',\n",
" 'type': 'string'},\n",
" 'disambiguatingDescription': {'title': 'Disambiguatingdescription',\n",
" 'description': 'A sub property of description. A short description of the item used to disambiguate from other, similar items. Information from other properties (in particular, name) may be necessary for the description to be useful for disambiguation.',\n",
" 'type': 'string'}},\n",
" 'required': ['additionalType',\n",
" 'alternateName',\n",
" 'description',\n",
" 'disambiguatingDescription'],\n",
" '$id': 'https://schema.org/Thing',\n",
" '$schema': 'https://schema.org',\n",
" 'version': 3.9}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Thing.schema()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Validate a new object"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"What happens when we create an invalid object?"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Object did not validate.\n"
]
}
],
"source": [
"try: \n",
" thing = Thing()\n",
"except pydantic.ValidationError:\n",
" print(\"Object did not validate.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let's try a valid object..."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"thing = Thing(\n",
" alternateName='New Thing',\n",
" description='This is a new thing',\n",
" disambiguatingDescription='This thing is unique.',\n",
" additionalType='No additional type'\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"No error was raised."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Autogenerate pydantic objects from schema.org"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"class SchemaOrg(MetaSchema):\n",
" \n",
" def __new__(cls, name, base, dct):\n",
" annotations = {}\n",
" \n",
" data = schemaorg.Schema(name)\n",
" \n",
" dct = dict(\n",
" _title=name,\n",
" _id=data.id,\n",
" _version=float(data.version),\n",
" _schema=data.base,\n",
" __doc__=data.comment,\n",
" __annotations__={}\n",
" )\n",
"\n",
" # Currently, sets all class variables to type==str for\n",
" # demostration purposes.\n",
" # Need to develop datatypes for Schema.org objects.\n",
" for key, val in data._properties.items():\n",
" dct[key] = Schema(\n",
" ...,\n",
" description=val['comment'],\n",
" title=val['label']\n",
" )\n",
" dct['__annotations__'][key] = str\n",
"\n",
" base = (BaseModel,) + base\n",
" \n",
" return super(SchemaOrg, cls).__new__(cls, name, base, dct)\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Specification base set to http://www.schema.org\n",
"Using Version 3.5\n",
"Found http://www.schema.org/Thing\n",
"Thing: found 12 properties\n",
"Specification base set to http://www.schema.org\n",
"Using Version 3.5\n",
"Found http://www.schema.org/Event\n",
"Event: found 47 properties\n"
]
}
],
"source": [
"class Thing(metaclass=SchemaOrg): pass\n",
"class Event(metaclass=SchemaOrg): pass"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Specification base set to http://www.schema.org\n",
"Using Version 3.5\n",
"Found http://www.schema.org/Person\n",
"Person: found 69 properties\n"
]
}
],
"source": [
"class Person(metaclass=SchemaOrg): pass"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "omnipotent (Python 3.7)",
"language": "python",
"name": "omnipotent"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment