Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gnthibault/fe76821105d5c2aacde3b331baadb14d to your computer and use it in GitHub Desktop.
Save gnthibault/fe76821105d5c2aacde3b331baadb14d to your computer and use it in GitHub Desktop.
DataFrame Validation with Pydantic (including self-validating DataFrames)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Validating DataFrames with [Pydantic](https://pydantic-docs.helpmanual.io/)\n",
"\n",
"\n",
"## Row Model and value choices via Enum\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"from enum import Enum\n",
"import json\n",
"from typing import Any, List, Optional, Type\n",
"\n",
"from IPython.display import display\n",
"import pandas as pd\n",
"from pydantic import BaseModel, HttpUrl, constr, validator, Field\n",
"\n",
"\n",
"\n",
"class SentimentLabelEnum(str, Enum):\n",
"    \"\"\"Closed vocabulary of sentiment labels accepted by ``ValidRow.label``.\n",
"\n",
"    Mixing in ``str`` makes every member compare equal to its plain string\n",
"    value, so members serialize to JSON as ordinary strings.\n",
"    \"\"\"\n",
"\n",
"    positive = \"positive\"\n",
"    neutral = \"neutral\"\n",
"    negative = \"negative\"\n",
"\n",
"\n",
"class ValidRow(BaseModel):\n",
"    \"\"\"One validated, immutable row of labelled text.\n",
"\n",
"    Identity is defined by ``url`` alone: two rows with the same url hash and\n",
"    compare equal even when their other fields differ, so sets and Counters\n",
"    collapse rows that share a url.\n",
"    \"\"\"\n",
"\n",
"    # Non-empty text, at most 1000 characters\n",
"    text: constr(min_length=1, max_length=1000)  # type: ignore\n",
"\n",
"    # Must be one of the SentimentLabelEnum values\n",
"    label: SentimentLabelEnum\n",
"\n",
"    # Exactly two characters (e.g. \"en\")\n",
"    language_code: constr(min_length=2, max_length=2)  # type: ignore\n",
"\n",
"    # Must parse as an http(s) URL\n",
"    url: HttpUrl\n",
"\n",
"    # Required; None is rejected\n",
"    author: str\n",
"\n",
"    # Free-form location; may be omitted, in which case it is None\n",
"    location: Optional[str] = None\n",
"\n",
"    class Config:\n",
"        # Reject attribute assignment on instances\n",
"        allow_mutation = False\n",
"\n",
"    def __hash__(self) -> int:\n",
"        \"\"\"Hash on the url only, consistent with ``__eq__``.\"\"\"\n",
"        return hash(self.url)\n",
"\n",
"    def __eq__(self, other: object) -> bool:\n",
"        \"\"\"Rows are equal when both are ``ValidRow`` and share a url.\"\"\"\n",
"        return isinstance(other, ValidRow) and self.url == other.url\n",
"\n",
"    def __repr__(self) -> str:  # type: ignore\n",
"        \"\"\"Short representation showing only the identifying url.\"\"\"\n",
"        return f\"<ValidRow url={self.url}>\"\n",
"\n",
"    def __str__(self) -> str:  # type: ignore\n",
"        \"\"\"Same text as ``repr`` so ``print(row)`` stays compact.\"\"\"\n",
"        return repr(self)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Example Validation"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<ValidRow url=https://www.some.url.com/1>\n",
"\n",
"{'text': 'Some Sample Text', 'label': <SentimentLabelEnum.negative: 'negative'>, 'language_code': 'en', 'url': HttpUrl('https://www.some.url.com/1', scheme='https', host='www.some.url.com', tld='com', host_type='domain', path='/1'), 'author': 'bob', 'location': None}\n",
"\n",
"{\n",
" \"text\": \"Some Sample Text\",\n",
" \"label\": \"negative\",\n",
" \"language_code\": \"en\",\n",
" \"url\": \"https://www.some.url.com/1\",\n",
" \"author\": \"bob\",\n",
" \"location\": null\n",
"}\n"
]
}
],
"source": [
"## Good Data\n",
"\n",
"row = ValidRow(text=\"Some Sample Text\", \n",
" language_code=\"en\", \n",
" author=\"bob\", \n",
" url=\"https://www.some.url.com/1\",\n",
" label=\"negative\",\n",
" )\n",
"\n",
"print(row)\n",
"print()\n",
"print(row.dict())\n",
"print()\n",
"# Serialize to JSON\n",
"print(row.json(indent=4))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Coercion Gotcha and alternate syntax for instantiation"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"text\": \"Some Sample Text\",\n",
" \"label\": \"negative\",\n",
" \"language_code\": \"en\",\n",
" \"url\": \"https://www.some.url.com/1\",\n",
" \"author\": \"bob\",\n",
" \"location\": \"5\"\n",
"}\n"
]
}
],
"source": [
"good_data = {\n",
" \"text\": \"Some Sample Text\",\n",
" \"label\": \"negative\",\n",
" \"language_code\": \"en\",\n",
" \"url\": \"https://www.some.url.com/1\",\n",
" \"author\": \"bob\",\n",
" \n",
" # Watch out for unintended type coercion (int -> str); use StrictStr to prevent it.\n",
" # https://pydantic-docs.helpmanual.io/usage/types/#strict-types\n",
" \"location\": 5\n",
"}\n",
"\n",
"\n",
"\n",
"\n",
"# use keyword unpacking operator on dictionary\n",
"row = ValidRow(**good_data)\n",
"\n",
"print(row.json(indent=4))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Bad Data with multiple error messages"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"ename": "ValidationError",
"evalue": "5 validation errors for ValidRow\ntext\n ensure this value has at least 1 characters (type=value_error.any_str.min_length; limit_value=1)\nlabel\n value is not a valid enumeration member; permitted: 'positive', 'neutral', 'negative' (type=type_error.enum; enum_values=[<SentimentLabelEnum.positive: 'positive'>, <SentimentLabelEnum.neutral: 'neutral'>, <SentimentLabelEnum.negative: 'negative'>])\nlanguage_code\n ensure this value has at most 2 characters (type=value_error.any_str.max_length; limit_value=2)\nurl\n invalid or missing URL scheme (type=value_error.url.scheme)\nauthor\n none is not an allowed value (type=type_error.none.not_allowed)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-07f8386a0c3c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mauthor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"some.url.com/1\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mlabel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Negative\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m )\n",
"\u001b[0;32m~/.virtualenvs/typed_pandas/lib/python3.7/site-packages/pydantic/main.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields_set\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidation_error\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalidate_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mvalidation_error\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 283\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mvalidation_error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 284\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__dict__'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__fields_set__'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields_set\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValidationError\u001b[0m: 5 validation errors for ValidRow\ntext\n ensure this value has at least 1 characters (type=value_error.any_str.min_length; limit_value=1)\nlabel\n value is not a valid enumeration member; permitted: 'positive', 'neutral', 'negative' (type=type_error.enum; enum_values=[<SentimentLabelEnum.positive: 'positive'>, <SentimentLabelEnum.neutral: 'neutral'>, <SentimentLabelEnum.negative: 'negative'>])\nlanguage_code\n ensure this value has at most 2 characters (type=value_error.any_str.max_length; limit_value=2)\nurl\n invalid or missing URL scheme (type=value_error.url.scheme)\nauthor\n none is not an allowed value (type=type_error.none.not_allowed)"
]
}
],
"source": [
"## Bad data and validation error messages\n",
"\n",
"invalid_row = ValidRow(text=\"\", \n",
" language_code=\"english\", \n",
" author=None, \n",
" url=\"some.url.com/1\",\n",
" label=\"Negative\",\n",
" )\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Validating many rows with nested models\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"class ValidCollection(BaseModel):\n",
"    \"\"\"A validated, non-empty collection of ``ValidRow`` items with unique urls.\"\"\"\n",
"\n",
"    # At least one row is required\n",
"    rows: List[ValidRow] = Field(..., min_items=1)\n",
"\n",
"    class Config:\n",
"        # Blocks re-assigning ``rows``; it does not stop in-place mutation\n",
"        # of the list itself (e.g. ``rows.append``)\n",
"        allow_mutation = False\n",
"\n",
"    @validator(\"rows\")\n",
"    def check_unique(cls, rows: List[ValidRow]) -> List[ValidRow]:\n",
"        \"\"\"Reject collections that contain the same row more than once.\n",
"\n",
"        Rows are compared through ``ValidRow.__hash__``/``__eq__`` (url based).\n",
"\n",
"        Raises:\n",
"            ValueError: listing each duplicated row with its occurrence count.\n",
"        \"\"\"\n",
"        # Tally once; fewer distinct keys than rows means something repeats\n",
"        counts = Counter(rows)\n",
"        if len(counts) != len(rows):\n",
"            duplicates = {\n",
"                row: count for row, count in counts.most_common() if count > 1\n",
"            }\n",
"            raise ValueError(f\"Duplicate Items found: {duplicates}\")\n",
"        return rows\n",
"\n",
"    def __repr__(self) -> str:  # type: ignore # pragma: no cover\n",
"        \"\"\"Abbreviated representation showing only the first row.\"\"\"\n",
"        return f\"<ValidCollection rows=['{self.rows[0].__repr__()}...]'>\"\n",
"\n",
"    def __str__(self) -> str:  # type: ignore\n",
"        \"\"\"Same text as ``repr``.\"\"\"\n",
"        return repr(self)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Example Validation"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<ValidCollection rows=['<ValidRow url=https://www.some.url.com/1>...]'>\n",
"\n",
"{'rows': [{'text': 'Some sample Text', 'label': <SentimentLabelEnum.positive: 'positive'>, 'language_code': 'en', 'url': HttpUrl('https://www.some.url.com/1', scheme='https', host='www.some.url.com', tld='com', host_type='domain', path='/1'), 'author': 'bob', 'location': None}, {'text': 'Some sample Text', 'label': <SentimentLabelEnum.negative: 'negative'>, 'language_code': 'en', 'url': HttpUrl('https://www.some.url.com/2', scheme='https', host='www.some.url.com', tld='com', host_type='domain', path='/2'), 'author': 'bob', 'location': 'USA'}]}\n",
"\n",
"{\n",
" \"rows\": [\n",
" {\n",
" \"text\": \"Some sample Text\",\n",
" \"label\": \"positive\",\n",
" \"language_code\": \"en\",\n",
" \"url\": \"https://www.some.url.com/1\",\n",
" \"author\": \"bob\",\n",
" \"location\": null\n",
" },\n",
" {\n",
" \"text\": \"Some sample Text\",\n",
" \"label\": \"negative\",\n",
" \"language_code\": \"en\",\n",
" \"url\": \"https://www.some.url.com/2\",\n",
" \"author\": \"bob\",\n",
" \"location\": \"USA\"\n",
" }\n",
" ]\n",
"}\n"
]
}
],
"source": [
"## Good Data\n",
"\n",
"good_data = [\n",
" {\"text\": \"Some sample Text\", \n",
" \"language_code\": \"en\", \n",
" \"label\": \"positive\", \n",
" \"author\": \"bob\", \n",
" \"url\":\"https://www.some.url.com/1\"},\n",
" {\"text\": \"Some sample Text\", \n",
" \"language_code\": \"en\", \n",
" \"label\": \"negative\", \n",
" \"author\": \"bob\", \n",
" \"url\":\"https://www.some.url.com/2\",\n",
" \"location\": \"USA\"},\n",
"]\n",
"\n",
"collection = ValidCollection(rows=good_data)\n",
"\n",
"print(collection)\n",
"print()\n",
"print(collection.dict())\n",
"print()\n",
"print(collection.json(indent=4))\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "ValidationError",
"evalue": "1 validation error for ValidCollection\nrows\n Duplicate Items found: {<ValidRow url=https://www.some.url.com/1>: 2} (type=value_error)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-7-f2f60c3dd851>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 15\u001b[0m ]\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0minvalid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mValidCollection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbad_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/.virtualenvs/typed_pandas/lib/python3.7/site-packages/pydantic/main.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields_set\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidation_error\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalidate_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mvalidation_error\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 283\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mvalidation_error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 284\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__dict__'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__fields_set__'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields_set\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValidationError\u001b[0m: 1 validation error for ValidCollection\nrows\n Duplicate Items found: {<ValidRow url=https://www.some.url.com/1>: 2} (type=value_error)"
]
}
],
"source": [
"# Bad Data Validation\n",
"\n",
"bad_data = [\n",
" {\"text\": \"Some sample Text\", \n",
" \"language_code\": \"en\", \n",
" \"label\": \"positive\", \n",
" \"author\": \"bob\", \n",
" \"url\":\"https://www.some.url.com/1\"},\n",
" {\"text\": \"Some sample Text\", \n",
" \"language_code\": \"en\", \n",
" \"label\": \"negative\", \n",
" \"author\": \"bob\", \n",
" \"url\":\"https://www.some.url.com/1\",\n",
" \"location\": \"USA\"},\n",
"]\n",
"\n",
"invalid = ValidCollection(rows=bad_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Self-Validating DataFrames"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"\n",
"class ValidDataframe(pd.DataFrame):\n",
"    \"\"\"DataFrame subclass whose rows are validated on instantiation.\n",
"\n",
"    Validation runs only inside ``__init__``; later in-place edits (such as\n",
"    assigning a new column) are not re-checked until an operation builds a\n",
"    new frame through ``_constructor``.\n",
"    \"\"\"\n",
"\n",
"    @property\n",
"    def _constructor(self) -> Type[\"ValidDataframe\"]:\n",
"        \"\"\"Make pandas operations that build a new frame return this type.\"\"\"\n",
"        return ValidDataframe\n",
"\n",
"    def __init__(self, *args: Any, verbose: bool = True, **kwargs: Any) -> None:\n",
"        \"\"\"Build the frame as usual, then validate every row with pydantic.\n",
"\n",
"        Args:\n",
"            *args: forwarded unchanged to ``pd.DataFrame.__init__``.\n",
"            verbose: print a message before and after validation.\n",
"            **kwargs: forwarded unchanged to ``pd.DataFrame.__init__``.\n",
"\n",
"        Raises:\n",
"            pydantic.ValidationError: if any row, or the collection as a\n",
"                whole, fails validation.\n",
"        \"\"\"\n",
"        super().__init__(*args, **kwargs)\n",
"\n",
"        if verbose:\n",
"            print(\"Validating The DATA!\")\n",
"\n",
"        # pandas stores missing cells as float NaN, which pydantic would\n",
"        # coerce to the string \"nan\" and let through even for required str\n",
"        # fields. Map NaN (the only float for which v != v) to None so a\n",
"        # missing value is validated as missing. This is done on the plain\n",
"        # records list rather than with DataFrame operations, which would\n",
"        # re-enter this __init__ through _constructor.\n",
"        records = [\n",
"            {\n",
"                key: None if isinstance(value, float) and value != value else value\n",
"                for key, value in record.items()\n",
"            }\n",
"            for record in self.to_dict(orient=\"records\")\n",
"        ]\n",
"        ValidCollection(rows=records)\n",
"\n",
"        if verbose:\n",
"            print(\"LOoks good!\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Same API as a normal pandas DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>language_code</th>\n",
" <th>label</th>\n",
" <th>author</th>\n",
" <th>url</th>\n",
" <th>location</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Some sample Text</td>\n",
" <td>en</td>\n",
" <td>positive</td>\n",
" <td>bob</td>\n",
" <td>https://www.some.url.com/1</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Some sample Text</td>\n",
" <td>en</td>\n",
" <td>negative</td>\n",
" <td>bob</td>\n",
" <td>https://www.some.url.com/2</td>\n",
" <td>USA</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text language_code label author \\\n",
"0 Some sample Text en positive bob \n",
"1 Some sample Text en negative bob \n",
"\n",
" url location \n",
"0 https://www.some.url.com/1 NaN \n",
"1 https://www.some.url.com/2 USA "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Validating The DATA!\n",
"LOoks good!\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>language_code</th>\n",
" <th>label</th>\n",
" <th>author</th>\n",
" <th>url</th>\n",
" <th>location</th>\n",
" <th>new_column</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Some sample Text</td>\n",
" <td>en</td>\n",
" <td>positive</td>\n",
" <td>bob</td>\n",
" <td>https://www.some.url.com/1</td>\n",
" <td>NaN</td>\n",
" <td>new_stuff_added</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Some sample Text</td>\n",
" <td>en</td>\n",
" <td>negative</td>\n",
" <td>bob</td>\n",
" <td>https://www.some.url.com/2</td>\n",
" <td>USA</td>\n",
" <td>new_stuff_added</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text language_code label author \\\n",
"0 Some sample Text en positive bob \n",
"1 Some sample Text en negative bob \n",
"\n",
" url location new_column \n",
"0 https://www.some.url.com/1 NaN new_stuff_added \n",
"1 https://www.some.url.com/2 USA new_stuff_added "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Here is a normal df\n",
"normal_df = pd.DataFrame(good_data)\n",
"display(normal_df)\n",
"\n",
"\n",
"# Here is a validated df\n",
"validated_df = ValidDataframe(good_data, verbose=True)\n",
"\n",
"\n",
"# add new column (does not rerun validation)\n",
"validated_df[\"new_column\"] = \"new_stuff_added\"\n",
"display(validated_df)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Validating The DATA!\n"
]
},
{
"ename": "ValidationError",
"evalue": "1 validation error for ValidCollection\nrows\n Duplicate Items found: {<ValidRow url=https://www.some.url.com/1>: 2} (type=value_error)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-13-d52ff45ebd06>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Bad Data throws an error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mValidDataframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbad_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-9-0b0498e37118>\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, verbose, *args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Validating The DATA!\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mValidCollection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0morient\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"records\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/typed_pandas/lib/python3.7/site-packages/pydantic/main.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields_set\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidation_error\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalidate_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mvalidation_error\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 283\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mvalidation_error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 284\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__dict__'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__fields_set__'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields_set\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValidationError\u001b[0m: 1 validation error for ValidCollection\nrows\n Duplicate Items found: {<ValidRow url=https://www.some.url.com/1>: 2} (type=value_error)"
]
}
],
"source": [
"# Bad Data throws an error\n",
"\n",
"ValidDataframe(bad_data, verbose=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Operations return validated dataframe"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Validating The DATA!\n",
"LOoks good!\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>language_code</th>\n",
" <th>label</th>\n",
" <th>author</th>\n",
" <th>url</th>\n",
" <th>location</th>\n",
" <th>new_column</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Some sample Text</td>\n",
" <td>en</td>\n",
" <td>negative</td>\n",
" <td>bob</td>\n",
" <td>https://www.some.url.com/2</td>\n",
" <td>USA</td>\n",
" <td>new_stuff_added</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text language_code label author \\\n",
"1 Some sample Text en negative bob \n",
"\n",
" url location new_column \n",
"1 https://www.some.url.com/2 USA new_stuff_added "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class '__main__.ValidDataframe'>\n"
]
}
],
"source": [
"\n",
"# filter to 1 row and revalidate\n",
"filtered_valid = validated_df[validated_df.location==\"USA\"]\n",
"\n",
"display(filtered_valid)\n",
"\n",
"print(type(filtered_valid))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Operations can invalidate the output"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Validating The DATA!\n"
]
},
{
"ename": "ValidationError",
"evalue": "1 validation error for ValidCollection\nrows\n Duplicate Items found: {<ValidRow url=https://www.some.url.com/1>: 2, <ValidRow url=https://www.some.url.com/2>: 2} (type=value_error)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-15-a7f8e21b0c85>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# concatenation causes duplicates\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mvalidated_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidated_df\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/.virtualenvs/typed_pandas/lib/python3.7/site-packages/pandas/core/reshape/concat.py\u001b[0m in \u001b[0;36mconcat\u001b[0;34m(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)\u001b[0m\n\u001b[1;32m 282\u001b[0m )\n\u001b[1;32m 283\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 284\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 285\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 286\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/typed_pandas/lib/python3.7/site-packages/pandas/core/reshape/concat.py\u001b[0m in \u001b[0;36mget_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 501\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 502\u001b[0m \u001b[0mcons\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobjs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 503\u001b[0;31m return cons._from_axes(new_data, self.new_axes).__finalize__(\n\u001b[0m\u001b[1;32m 504\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"concat\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 505\u001b[0m )\n",
"\u001b[0;32m~/.virtualenvs/typed_pandas/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m_from_axes\u001b[0;34m(cls, data, axes, **kwargs)\u001b[0m\n\u001b[1;32m 386\u001b[0m \u001b[0;31m# for construction from BlockManager\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 387\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBlockManager\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 388\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 389\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 390\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_AXIS_REVERSED\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-9-0b0498e37118>\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, verbose, *args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Validating The DATA!\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mValidCollection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0morient\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"records\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/typed_pandas/lib/python3.7/site-packages/pydantic/main.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields_set\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidation_error\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalidate_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mvalidation_error\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 283\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mvalidation_error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 284\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__dict__'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__fields_set__'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields_set\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValidationError\u001b[0m: 1 validation error for ValidCollection\nrows\n Duplicate Items found: {<ValidRow url=https://www.some.url.com/1>: 2, <ValidRow url=https://www.some.url.com/2>: 2} (type=value_error)"
]
}
],
"source": [
"# concatenation causes duplicates\n",
"pd.concat([validated_df, validated_df])"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Validating The DATA!\n"
]
},
{
"ename": "ValidationError",
"evalue": "1 validation error for ValidCollection\nrows\n ensure this value has at least 1 items (type=value_error.list.min_items; limit_value=1)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-17-95d5dc82f75f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# filter to zero rows is a validation error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mvalidated_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mvalidated_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlanguage_code\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;34m\"es\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/.virtualenvs/typed_pandas/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2789\u001b[0m \u001b[0;31m# Do we have a (boolean) 1d indexer?\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2790\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_bool_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2791\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_bool_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2792\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2793\u001b[0m \u001b[0;31m# We are left with two options: a single key, and a collection of keys,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/typed_pandas/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_getitem_bool_array\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2843\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_bool_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2844\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnonzero\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2845\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_take_with_is_copy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2846\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2847\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_getitem_multilevel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/typed_pandas/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m_take_with_is_copy\u001b[0;34m(self, indices, axis, **kwargs)\u001b[0m\n\u001b[1;32m 3407\u001b[0m \u001b[0mSee\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mdocstring\u001b[0m \u001b[0mof\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;31m`\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mfull\u001b[0m \u001b[0mexplanation\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mparameters\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3408\u001b[0m \"\"\"\n\u001b[0;32m-> 3409\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindices\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3410\u001b[0m \u001b[0;31m# Maybe set copy if we didn't actually change the index.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3411\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mequals\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/typed_pandas/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mtake\u001b[0;34m(self, indices, axis, is_copy, **kwargs)\u001b[0m\n\u001b[1;32m 3395\u001b[0m \u001b[0mindices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_block_manager_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverify\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3396\u001b[0m )\n\u001b[0;32m-> 3397\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__finalize__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3398\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3399\u001b[0m def _take_with_is_copy(\n",
"\u001b[0;32m<ipython-input-9-0b0498e37118>\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, verbose, *args, **kwargs)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Validating The DATA!\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mValidCollection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0morient\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"records\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/typed_pandas/lib/python3.7/site-packages/pydantic/main.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields_set\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidation_error\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalidate_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mvalidation_error\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 283\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mvalidation_error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 284\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__dict__'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__pydantic_self__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__fields_set__'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields_set\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValidationError\u001b[0m: 1 validation error for ValidCollection\nrows\n ensure this value has at least 1 items (type=value_error.list.min_items; limit_value=1)"
]
}
],
"source": [
"# filter to zero rows is a validation error\n",
"\n",
"validated_df[validated_df.language_code==\"es\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Model schema in JSON Schema (OpenAPI-compatible) format"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"title\": \"ValidCollection\",\n",
" \"description\": \"Model for a validated collection of rows\",\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"rows\": {\n",
" \"title\": \"Rows\",\n",
" \"type\": \"array\",\n",
" \"items\": {\n",
" \"$ref\": \"#/definitions/ValidRow\"\n",
" },\n",
" \"minItems\": 1\n",
" }\n",
" },\n",
" \"required\": [\n",
" \"rows\"\n",
" ],\n",
" \"definitions\": {\n",
" \"ValidRow\": {\n",
" \"title\": \"ValidRow\",\n",
" \"description\": \"Model for a validated Row of data\",\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"text\": {\n",
" \"title\": \"Text\",\n",
" \"type\": \"string\",\n",
" \"minLength\": 1,\n",
" \"maxLength\": 1000\n",
" },\n",
" \"label\": {\n",
" \"title\": \"Label\",\n",
" \"enum\": [\n",
" \"positive\",\n",
" \"neutral\",\n",
" \"negative\"\n",
" ],\n",
" \"type\": \"string\"\n",
" },\n",
" \"language_code\": {\n",
" \"title\": \"Language Code\",\n",
" \"type\": \"string\",\n",
" \"minLength\": 2,\n",
" \"maxLength\": 2\n",
" },\n",
" \"url\": {\n",
" \"title\": \"Url\",\n",
" \"type\": \"string\",\n",
" \"minLength\": 1,\n",
" \"maxLength\": 2083,\n",
" \"format\": \"uri\"\n",
" },\n",
" \"author\": {\n",
" \"title\": \"Author\",\n",
" \"type\": \"string\"\n",
" },\n",
" \"location\": {\n",
" \"title\": \"Location\",\n",
" \"type\": \"string\"\n",
" }\n",
" },\n",
" \"required\": [\n",
" \"text\",\n",
" \"label\",\n",
" \"language_code\",\n",
" \"url\",\n",
" \"author\"\n",
" ]\n",
" }\n",
" }\n",
"}\n"
]
}
],
"source": [
"print(ValidCollection.schema_json(indent=4))"
]
},
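{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Performance Costs\n",
"\n",
"Each comparison below times plain `pd.DataFrame` construction first as a baseline, then `ValidDataframe` construction, which also validates every row.\n",
"\n",
"### Build a 2-row dataframe 1000 times"
]
},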
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"885 ms ± 39 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"# Baseline: 1000 plain pandas constructions of good_data, with no validation step.\n",
"for _ in range(1000):\n",
"    baseline_df = pd.DataFrame(good_data)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.59 s ± 103 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"# 1000 constructions of the self-validating frame from good_data (verbose output off).\n",
"for _ in range(1000):\n",
"    checked_df = ValidDataframe(good_data, verbose=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Build a 1000-row dataframe 100 times: plain `pd.DataFrame` vs. validated `ValidDataframe`"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Build 1000 rows: copies of the first good row, each given a unique url suffix.\n",
"# A comprehension avoids leaving loop scratch variables behind in the kernel namespace.\n",
"\n",
"bigger_data = [\n",
"    {**good_data[0], \"url\": good_data[0][\"url\"] + \"/\" + str(row_number)}\n",
"    for row_number in range(1000)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"172 ms ± 4.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"# Baseline: 100 plain pandas constructions of bigger_data, with no validation step.\n",
"for _ in range(100):\n",
"    baseline_df = pd.DataFrame(bigger_data)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5.38 s ± 197 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"# 100 constructions of the self-validating frame from bigger_data (verbose output off).\n",
"for _ in range(100):\n",
"    checked_df = ValidDataframe(bigger_data, verbose=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment