Skip to content

Instantly share code, notes, and snippets.

@tonyfast
Forked from Zsailer/schemaorg-pydantic.ipynb
Last active August 27, 2019 16:26
Show Gist options
  • Save tonyfast/61a17214486f1e1947707e1c65ace378 to your computer and use it in GitHub Desktop.
Save tonyfast/61a17214486f1e1947707e1c65ace378 to your computer and use it in GitHub Desktop.
Define and validate schema.org structured data in Python with Pydantic
pydantic
rdflib
pandas
matplotlib
schemaorg
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Define and validate schema.org structured data in Python with Pydantic "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pydantic import BaseModel, Schema\n",
"from pydantic.main import MetaModel\n",
"from schemaorg import main as schemaorg"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class Thing(BaseModel):\n",
" \"\"\"The most generic type of item.\"\"\"\n",
" \n",
" # Need to define extra items at the top level \n",
" class Config:\n",
" title = 'Thing'\n",
" schema_extra = {\n",
" '$schema': 'https://schema.org',\n",
" '$id': 'https://schema.org/Thing',\n",
" }\n",
" \n",
" additionalType: str = Schema(\n",
" ...,\n",
" title='additionalType',\n",
" description=(\n",
" \"An additional type for the item, typically \"\n",
" \"used for adding more specific types from \"\n",
" \"external vocabularies in microdata syntax. \"\n",
" \"This is a relationship between something and \"\n",
" \"a class that the thing is in. In RDFa syntax, \"\n",
" \"it is better to use the native RDFa syntax - \"\n",
" \"the 'typeof' attribute - for multiple types. \"\n",
" \"Schema.org tools may have only weaker \"\n",
" \"understanding of extra types, in particular \"\n",
" \"those defined externally.\"\n",
" )\n",
" )\n",
" \n",
" alternateName: str = Schema(\n",
" ...,\n",
" title='alternateName',\n",
" description=\"An alias for the item.\"\n",
" )\n",
" \n",
" description: str = Schema(\n",
" ...,\n",
" description=\"A description of the item.\"\n",
" )\n",
" \n",
" disambiguatingDescription: str = Schema(\n",
" ...,\n",
" description=\"A sub property of description. A short description of the item used to disambiguate from other, similar items. Information from other properties (in particular, name) may be necessary for the description to be useful for disambiguation.\"\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I want to enforce the attributes in the `Config` accessor, *and* I think the accessor syntax is a bit ugly. Let's see if we can make it go away."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"class MetaSchema(MetaModel):\n",
" \"\"\"Metaclass that checks for three required class attributes:\n",
" 1. _id: the ID of the event\n",
" 2. _version: the version of the current schema.\n",
" 3. _title: the name of the schema.\n",
"\n",
" These attribute are mapped to pydantic.BaseModel's `Config` inner class\n",
" for proper schema generation+validation.\n",
" \"\"\"\n",
" def __new__(cls, name, base, dct):\n",
" # Check that required keys are found.\n",
" if not all((key in dct for key in ['_id', '_title', '_version', '_schema'])):\n",
" raise AttributeError('Required class attributes are missing from the {} class.'.format(name))\n",
"\n",
" # Check that keys are the proper types.\n",
" if not all((\n",
" type(dct['_id']) in (str, type(None)),\n",
" type(dct['_version']) in (float, type(None)),\n",
" type(dct['_title']) in (str, type(None)),\n",
" type(dct['_schema']) in (str, type(None)),\n",
" )):\n",
" raise TypeError('Check the class attributes types: \"_id\" must be a string, '\n",
" '\"_version\" must be an integer, and \"_title\" must be a string.')\n",
"\n",
" # Add a Config inner class to this Pydantic model.\n",
" class Config:\n",
" title = dct['_title']\n",
" schema_extra = {\n",
" '$id': dct['_id'],\n",
" '$schema': dct['_schema'],\n",
" 'version': dct['_version']\n",
" }\n",
"\n",
" dct['Config'] = Config\n",
" return super(MetaSchema, cls).__new__(cls, name, base, dct)\n",
"\n",
"\n",
"class JsonSchema(pydantic.BaseModel, metaclass=MetaSchema):\n",
" \"\"\"A pydantic base Model for JSON schemas.\"\"\"\n",
" _id: str = None\n",
" _version: float = None\n",
" _title: str = None\n",
" _schema: str = None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class Thing(JsonSchema):\n",
" \"\"\"The most generic type of item.\"\"\"\n",
" # Define top level attributes\n",
" _id = 'https://schema.org/Thing'\n",
" _version = 3.9\n",
" _title = 'Thing'\n",
" _schema = 'https://schema.org'\n",
"\n",
" additionalType: str = Schema(\n",
" ...,\n",
" title='additionalType',\n",
" description=(\n",
" \"An additional type for the item, typically \"\n",
" \"used for adding more specific types from \"\n",
" \"external vocabularies in microdata syntax. \"\n",
" \"This is a relationship between something and \"\n",
" \"a class that the thing is in. In RDFa syntax, \"\n",
" \"it is better to use the native RDFa syntax - \"\n",
" \"the 'typeof' attribute - for multiple types. \"\n",
" \"Schema.org tools may have only weaker \"\n",
" \"understanding of extra types, in particular \"\n",
" \"those defined externally.\"\n",
" )\n",
" )\n",
" \n",
" alternateName: str = Schema(\n",
" ...,\n",
" title='alternateName',\n",
" description=\"An alias for the item.\"\n",
" )\n",
" \n",
" description: str = Schema(\n",
" ...,\n",
" description=\"A description of the item.\"\n",
" )\n",
" \n",
" disambiguatingDescription: str = Schema(\n",
" ...,\n",
" description=\"A sub property of description. A short description of the item used to disambiguate from other, similar items. Information from other properties (in particular, name) may be necessary for the description to be useful for disambiguation.\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Thing.schema()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Validate a new object"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"What happens when we create an invalid object?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"thing = Thing()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let's try a valid object..."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"thing = Thing(\n",
" alternateName='New Thing',\n",
" description='This is a new thing',\n",
" disambiguatingDescription='This thing is unique.',\n",
" additionalType='No additional type'\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"No error was raised."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Autogenerate pydantic objects from schema.org"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class SchemaOrg(MetaSchema):\n",
" \n",
" def __new__(cls, name, base, dct):\n",
" annotations = {}\n",
" \n",
" data = schemaorg.Schema(name)\n",
" \n",
" dct = dict(\n",
" _title=name,\n",
" _id=data.id,\n",
" _version=float(data.version),\n",
" _schema=data.base,\n",
" __doc__=data.comment,\n",
" __annotations__={}\n",
" )\n",
"\n",
" # Currently, sets all class variables to type==str for\n",
" # demostration purposes.\n",
" # Need to develop datatypes for Schema.org objects.\n",
" for key, val in data._properties.items():\n",
" dct[key] = Schema(\n",
" ...,\n",
" description=val['comment'],\n",
" title=val['label']\n",
" )\n",
" dct['__annotations__'][key] = str\n",
"\n",
" base = (BaseModel,) + base\n",
" \n",
" return super(SchemaOrg, cls).__new__(cls, name, base, dct)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class Thing(metaclass=SchemaOrg): pass\n",
"class Event(metaclass=SchemaOrg): pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Thing.schema()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Event.schema()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment