Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save danyx23/30e290e0e3fe1cf93e6583e06ef54634 to your computer and use it in GitHub Desktop.
Save danyx23/30e290e0e3fe1cf93e6583e06ef54634 to your computer and use it in GitHub Desktop.
Jupyter notebook to validate all OWID grapher configs against a given JSON schema
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Grapher json-schema test\n",
"\n",
"This script checks all grapher configs in the db against the provided schema.json\n",
"\n",
"This notebook uses Lars' [mysql jupyter notebook magic](https://gist.github.com/larsyencken/935452e59246a58c647dd3953de2ab07)"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 54,
"source": [
"import pandas as pd\n",
"import json\n",
"from jsonschema import validate\n",
"from dataclasses import dataclass\n"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 55,
"source": [
"schemapath = \"./schema experiments/schema.json\""
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 56,
"source": [
"schema = None\n",
"with open(schemapath, \"r\") as f:\n",
" schema = json.load(f)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 57,
"source": [
"# Declare the default dataframe so autocomplete works\n",
"df : pd.DataFrame = pd.DataFrame()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 58,
"source": [
"%%mysql\n",
"select config from charts"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 59,
"source": [
"df"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" config\n",
"0 {\"id\": 16, \"map\": {\"colorScale\": {\"baseColorSc...\n",
"1 {\"id\": 20, \"map\": {\"colorScale\": {\"baseColorSc...\n",
"2 {\"id\": 26, \"map\": {\"colorScale\": {\"baseColorSc...\n",
"3 {\"id\": 27, \"map\": {\"colorScale\": {\"baseColorSc...\n",
"4 {\"id\": 29, \"map\": {\"mode\": \"specific\", \"maxYea...\n",
"... ...\n",
"4184 {\"map\": {\"colorScale\": {\"baseColorScheme\": \"Yl...\n",
"4185 {\"map\": {\"colorScale\": {\"baseColorScheme\": \"Yl...\n",
"4186 {\"data\": {\"availableEntities\": [\"United Kingdo...\n",
"4187 {\"id\": 5071, \"data\": {\"availableEntities\": [\"U...\n",
"4188 {\"id\": 5072, \"data\": {\"availableEntities\": [\"S...\n",
"\n",
"[4189 rows x 1 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>config</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>{\"id\": 16, \"map\": {\"colorScale\": {\"baseColorSc...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>{\"id\": 20, \"map\": {\"colorScale\": {\"baseColorSc...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>{\"id\": 26, \"map\": {\"colorScale\": {\"baseColorSc...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>{\"id\": 27, \"map\": {\"colorScale\": {\"baseColorSc...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>{\"id\": 29, \"map\": {\"mode\": \"specific\", \"maxYea...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4184</th>\n",
" <td>{\"map\": {\"colorScale\": {\"baseColorScheme\": \"Yl...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4185</th>\n",
" <td>{\"map\": {\"colorScale\": {\"baseColorScheme\": \"Yl...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4186</th>\n",
" <td>{\"data\": {\"availableEntities\": [\"United Kingdo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4187</th>\n",
" <td>{\"id\": 5071, \"data\": {\"availableEntities\": [\"U...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4188</th>\n",
" <td>{\"id\": 5072, \"data\": {\"availableEntities\": [\"S...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4189 rows × 1 columns</p>\n",
"</div>"
]
},
"metadata": {},
"execution_count": 59
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 60,
"source": [
"df[\"configjson\"] = df.config.map(lambda val: json.loads(val))"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 61,
"source": [
"validate(df.configjson[0], schema)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 62,
"source": [
"@dataclass\n",
"class ConfigValidationError:\n",
" id: int\n",
" error: Exception"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 63,
"source": [
"successes = []\n",
"failures = []\n",
"for config in df.configjson:\n",
" id = config.get(\"id\", None)\n",
" try: \n",
" validate(config, schema)\n",
" successes.append(id)\n",
" except Exception as e:\n",
" failures.append(ConfigValidationError(id=id, error=e))"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 64,
"source": [
"len(successes)"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"4181"
]
},
"metadata": {},
"execution_count": 64
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 65,
"source": [
"len(failures)"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"8"
]
},
"metadata": {},
"execution_count": 65
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 66,
"source": [
"failures"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[ConfigValidationError(id=195, error=<ValidationError: \"'-3000' is not of type 'integer'\">),\n",
" ConfigValidationError(id=264, error=<ValidationError: \"'1950' is not one of ['latest']\">),\n",
" ConfigValidationError(id=271, error=<ValidationError: \"'2000' is not one of ['latest']\">),\n",
" ConfigValidationError(id=330, error=<ValidationError: \"'' is not one of ['absolute', 'relative', 'grouped', 'stacked', None]\">),\n",
" ConfigValidationError(id=332, error=<ValidationError: \"'' is not one of ['absolute', 'relative', 'grouped', 'stacked', None]\">),\n",
" ConfigValidationError(id=334, error=<ValidationError: \"'' is not one of ['absolute', 'relative', 'grouped', 'stacked', None]\">),\n",
" ConfigValidationError(id=538, error=<ValidationError: \"'2015' is not one of ['latest']\">),\n",
" ConfigValidationError(id=796, error=<ValidationError: \"'2007' is not one of ['latest']\">)]"
]
},
"metadata": {},
"execution_count": 66
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 67,
"source": [
"[ item.error.absolute_path for item in failures ]"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[deque(['map', 'minYear']),\n",
" deque(['map', 'targetYear']),\n",
" deque(['map', 'targetYear']),\n",
" deque(['stackMode']),\n",
" deque(['stackMode']),\n",
" deque(['stackMode']),\n",
" deque(['map', 'defaultYear']),\n",
" deque(['map', 'defaultYear'])]"
]
},
"metadata": {},
"execution_count": 67
}
],
"metadata": {}
}
],
"metadata": {
"interpreter": {
"hash": "40c188d7f09e73b6baff016c76fe93234271a5f8c00107ec837ff90530bba4cc"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3.9.1 64-bit ('base': conda)"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment