@pjbull (last active April 4, 2021)

Can we detect if a class implementing __fspath__ is called with a writeable mode from open?
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Knowing if `open` called your `__fspath__`: A journey\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Let's write out a file to use"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"!rm -f hello.txt\n",
"!echo \"hi!\" >> hello.txt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## First pass is getting `code_context`:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import inspect\n",
"from pathlib import Path\n",
"\n",
"class C0:\n",
" def __fspath__(self): \n",
" caller_src = inspect.getframeinfo(inspect.stack()[1].frame).code_context\n",
" \n",
" print(caller_src)\n",
" \n",
" return str(Path(\"hello.txt\").resolve())\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['with open(cell_0, \"r\") as f:\\n']\n"
]
}
],
"source": [
"cell_0 = C0()\n",
" \n",
"with open(cell_0, \"r\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[' \"r\"\\n']\n"
]
}
],
"source": [
"with open(\n",
" cell_0,\n",
" \"r\"\n",
") as f:\n",
" assert f.read() == \"hi!\\n\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Well, that's painful.... what else?\n",
"\n",
"\n",
"## Let's try `inspect.getsource` so that we go beyond `code_context`"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"class C1:\n",
" def __fspath__(self): \n",
" caller_src = inspect.getsource(\n",
" inspect.stack()[1].frame\n",
" )\n",
" \n",
" print(caller_src)\n",
" \n",
" return str(Path(\"hello.txt\").resolve())\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cell_1 = C1()\n",
" \n",
"with open(cell_1, \"r\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"\n"
]
}
],
"source": [
"cell_1 = C1()\n",
" \n",
"with open(cell_1, \"r\") as f:\n",
" assert f.read() == \"hi!\\n\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# I'm weirdly spaced\n",
"with open(\n",
" cell_1,\n",
" \"r\"\n",
") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"\n"
]
}
],
"source": [
"# I'm weirdly spaced\n",
"with open(\n",
" cell_1,\n",
" \"r\"\n",
") as f:\n",
" assert f.read() == \"hi!\\n\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Woo! We got some code, now let's build a regex to match the open write modes:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"# thanks @jayqi for tracking all these down\n",
"WRITE_MODES = {\"r+\", \"w\", \"w+\", \"a\", \"a+\", \"rb+\", \"wb\", \"wb+\", \"ab\", \"ab+\"}\n",
"\n",
"# regex escape `+`\n",
"RE_WRITE_MODES = {s.replace(\"+\", \"\\+\") for s in WRITE_MODES}\n",
"\n",
"\n",
"pattern = re.compile(\n",
" \"open\\(\"\n",
" \"[^,]+\"\n",
" \"[^\\\"]*\"\n",
" \"[\\\"']\" \n",
" \"(?P<mode>\" +\n",
" \"|\".join(RE_WRITE_MODES) +\n",
" \")\"\n",
" \"[\\\"']\"\n",
" \"\\)\"\n",
")\n",
"\n",
"def _write_from_open_call(source):\n",
" m = re.search(\n",
" pattern,\n",
" source, \n",
" )\n",
" \n",
" return m is not None"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_write_from_open_call(\n",
"\"\"\"\n",
"with open(cell_1, \"wb+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"\"\"\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_write_from_open_call(\n",
"\"\"\"\n",
"with open(cell_1, \"r\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"\"\"\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_write_from_open_call(\n",
"\"\"\"\n",
"with close(cell_1, \"r\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"\"\"\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_write_from_open_call(\n",
"\"\"\"\n",
"with close(cell_1, \"r\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
" \n",
"with open(Path('not_a_C2'), 'w') as f2:\n",
" assert f.read() == \"hi!\\n\"\n",
"\"\"\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"That's all well and good, but what if the source context is a lot longer and we have multiple `open` and the `S3Path` version is a read, but some other one is a write? The last `True` above should be `False`...\n",
"\n",
"\n",
"## Down the rabbit-hole: parse the AST"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"import inspect\n",
"\n",
"\n",
"def _is_open_call_write_with_var(ast_node, var_names=None, var_type=None):\n",
" \"\"\" For a given AST node, check that the node is a `Call`, and that the\n",
" call is to a function with the name `open`, and that the last argument\n",
" \n",
" If passed, return True if the first argument is a variable with a name in var_names.\n",
" \n",
" If passed, return True if the first arg is a Call to instantiate var_type. \n",
" \"\"\"\n",
" if not isinstance(ast_node, ast.Call):\n",
" return False\n",
" if not hasattr(ast_node, \"func\"):\n",
" return False\n",
" if not hasattr(ast_node.func, \"id\"):\n",
" return False\n",
" if ast_node.func.id != \"open\":\n",
" return False\n",
" \n",
" # we are in an open call, get the path as first arg\n",
" path = ast_node.args[0]\n",
" \n",
" # get the mode as second arg or kwarg where arg==mode\n",
" mode = (\n",
" ast_node.args[1]\n",
" if len(ast_node.args) >= 2 else\n",
" [kwarg for kwarg in ast_node.keywords if kwarg.arg == \"mode\"][0].value\n",
" )\n",
" \n",
" # Ensure the path is either a call to instantiate var_type or\n",
" # the name of a variable we know is of the right type\n",
" path_is_of_type = (\n",
" (isinstance(path, ast.Call)\n",
" and path.func.id == var_type.__name__\n",
" )\n",
" or\n",
" (hasattr(path, \"id\") and (path.id in var_names))\n",
" )\n",
" \n",
" return (mode.s in WRITE_MODES) and path_is_of_type\n",
"\n",
"class C2:\n",
" def __fspath__(self):\n",
" # same getsource\n",
" caller_src = inspect.getsource(\n",
" inspect.stack()[1].frame\n",
" )\n",
"\n",
" # also get local variables in the frame\n",
" caller_local_variables = inspect.stack()[1].frame.f_locals\n",
" \n",
" # get all the instances in the previous frame of our class\n",
" instances_of_type = [\n",
" varname for varname, instance in caller_local_variables.items()\n",
" if isinstance(instance, type(self))\n",
" ]\n",
" \n",
" # Walk the AST of the previous frame source and see if\n",
" # open is called with a variable of our type...\n",
" print(\n",
" any(\n",
" _is_open_call_write_with_var(\n",
" n,\n",
" var_names=instances_of_type,\n",
" var_type=type(self)\n",
" ) for n in ast.walk(ast.parse(caller_src))\n",
" )\n",
" )\n",
"\n",
" return str(Path(\"hello.txt\").resolve())\n"
]
},
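{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before trying it out, here's a quick, purely illustrative peek at the AST nodes the checker walks, using `ast.dump` on a one-line `open` call:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative sketch: dump the parsed AST of a simple open call to see\n",
"# the ast.Call node (func.id == 'open') and its args (path, then mode),\n",
"# which are the pieces _is_open_call_write_with_var inspects\n",
"print(ast.dump(ast.parse('open(cell_2, \"r+\")')))"
]
},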
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
}
],
"source": [
"cell_2 = C2()\n",
" \n",
"# False = mode is r\n",
"with open(cell_2, \"r\") as f:\n",
" assert f.read() == \"hi!\\n\""
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n"
]
}
],
"source": [
"# True - with var `cell_2`, which is of type \n",
"with open(cell_2, \"r+\") as f:\n",
" assert f.read() == \"hi!\\n\""
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n"
]
}
],
"source": [
"# True - var `cell_2`, which is of type (mode is a kwarg)\n",
"with open(cell_2, mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n"
]
}
],
"source": [
"# True - weird spacing\n",
"with open(cell_2,\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\""
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n"
]
}
],
"source": [
"# True - weird spacing and direct call to C2\n",
"with open(C2(),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\""
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n",
"False\n"
]
}
],
"source": [
"# False - call variable is path even if there is a C2\n",
"\n",
"cell_2 = C2()\n",
"path = Path(\"hello.txt\")\n",
" \n",
"with open(path, \"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
" \n",
"# call fspath to make sure we are false for read\n",
"with open(cell_2, \"r\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
" \n",
"# call fspath to make sure we are false for read\n",
"with open(C2(), \"r\") as f:\n",
" assert f.read() == \"hi!\\n\""
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n",
"False\n"
]
}
],
"source": [
"# False - call variable is path\n",
"\n",
"cell_2 = C2()\n",
" \n",
"with open(Path(\"hello.txt\"), \"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
" \n",
"# False 1 - call fspath to make sure we are false for read\n",
"with open(cell_2, \"r\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
" \n",
"# False 2 - call fspath to make sure we are false for read\n",
"with open(C2(), \"r\") as f:\n",
" assert f.read() == \"hi!\\n\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Benchmarking"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# no raising or checking to compare for benchmarking\n",
"class Base:\n",
" def __fspath__(self):\n",
" return str(Path(\"hello.txt\").resolve())"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"class C1:\n",
" def __fspath__(self): \n",
" caller_src = inspect.getsource(\n",
" inspect.stack()[1].frame\n",
" )\n",
" \n",
" if _write_from_open_call(caller_src):\n",
" raise Exception(\"No writing!\")\n",
" \n",
" return str(Path(\"hello.txt\").resolve())\n",
"\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's redefine to actually raise so that we can compare. We'll add three different frame getting methods:\n",
"\n",
" - `inspect.stack` - known to be slow\n",
" - `inspect.currentframe` - should be faster\n",
" - `sys._getframe` - should be fastest, but CPython only + internal method"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"\n",
"class C2:\n",
" def __init__(self, method='inspect'):\n",
" self.method = method\n",
" \n",
" def __fspath__(self):\n",
" # different frame fetching methods have different properties\n",
" if self.method == 'inspect':\n",
" frame = inspect.stack()[1].frame\n",
" elif self.method == 'currentframe':\n",
" frame = inspect.currentframe().f_back\n",
" else:\n",
" frame = sys._getframe().f_back\n",
" \n",
" # same getsource\n",
" caller_src = inspect.getsource(frame)\n",
"\n",
" # also get local variables in the frame\n",
" caller_local_variables = frame.f_locals\n",
" \n",
" # get all the instances in the previous frame of our class\n",
" instances_of_type = [\n",
" varname for varname, instance in caller_local_variables.items()\n",
" if isinstance(instance, type(self))\n",
" ]\n",
" \n",
" # Walk the AST of the previous frame source and see if\n",
" # open is called with a variable of our type...\n",
" if any(\n",
" _is_open_call_write_with_var(\n",
" n,\n",
" var_names=instances_of_type,\n",
" var_type=type(self)\n",
" ) for n in ast.walk(ast.parse(caller_src))\n",
" ):\n",
" raise Exception(\"DEFINITELY no writing!\")\n",
" \n",
" \n",
" return str(Path(\"hello.txt\").resolve())\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"95.4 µs ± 1.55 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"try:\n",
" with open(Base(),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"except:\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6.88 ms ± 155 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"try:\n",
" with open(C1(),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"except:\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6.65 ms ± 127 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"try:\n",
" with open(C2(),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"except:\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"783 µs ± 11.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"try:\n",
" with open(C2(method=\"currentframe\"),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"except:\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"777 µs ± 13.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"try:\n",
" with open(C2(method=\"_getframe\"),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"except:\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"589 µs ± 7.71 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"with open(Path(\"test.txt\"), \"w\") as f:\n",
" f.write(\"\".join([\"a\"] * 50000))"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"48.828125\n"
]
}
],
"source": [
"# writing a 48KB file\n",
"print(Path(\"test.txt\").stat().st_size / (1024))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# reading is faster, so make larger file\n",
"with open(Path(\"test.txt\"), \"w\") as f:\n",
" f.write(\"\".join([\"a\"] * 1_700_000))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.621246337890625\n"
]
}
],
"source": [
"# reading a 1.6MB file\n",
"print(Path(\"test.txt\").stat().st_size / (1024 ** 2))"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"734 µs ± 11.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"with open(Path(\"test.txt\"), \"r\") as f:\n",
" data = f.read()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compare with scalene to see where is slow"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Scalene extension successfully loaded. Note: Scalene currently only\n",
"supports CPU+GPU profiling inside Jupyter notebooks. For full Scalene\n",
"profiling, use the command line version.\n"
]
}
],
"source": [
"%load_ext scalene"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\"> [22]: % of time = 99.87% out of 6.00s. </span>\n",
" ╷ ╷ ╷ ╷ ╷ \n",
" <span style=\"font-weight: bold\"> Line </span>│<span style=\"font-weight: bold\">Time % </span>│<span style=\"font-weight: bold\">Time % </span>│<span style=\"font-weight: bold\">Sys </span>│<span style=\"font-weight: bold\"> </span>│<span style=\"font-weight: bold\"> </span> \n",
" │<span style=\"font-weight: bold\">Python </span>│<span style=\"font-weight: bold\">native </span>│<span style=\"font-weight: bold\">% </span>│<span style=\"font-weight: bold\">[22] </span>│ \n",
"╺━━━━━━┿━━━━━━━┿━━━━━━━┿━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━╸\n",
" ... │ │ │ │ │ \n",
" 4 │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 89%</span> │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 11%</span> │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> inspect</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">stack()[</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">1</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">]</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">frame</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ... │ │ │ │ │ \n",
" │ │ │ │ │ \n",
"╶──────┼───────┼───────┼────┼─────────────────────────────────────────────────────────────────────────────────────────────────┼───╴\n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">f…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">s…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">f…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">&lt;…</span> \n",
" 2 │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 89%</span> │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 11%</span> │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">C1</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">__fspath__</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ╵ ╵ ╵ ╵ ╵ \n",
"</pre>\n"
],
"text/plain": [
"\u001b[3m [22]: % of time = 99.87% out of 6.00s. \u001b[0m\n",
" ╷ ╷ ╷ ╷ ╷ \n",
" \u001b[1m \u001b[0m\u001b[1mLine\u001b[0m\u001b[1m \u001b[0m│\u001b[1mTime %\u001b[0m\u001b[1m \u001b[0m│\u001b[1mTime %\u001b[0m\u001b[1m \u001b[0m│\u001b[1mSys\u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1m \u001b[0m \n",
" │\u001b[1mPython\u001b[0m\u001b[1m \u001b[0m│\u001b[1mnative\u001b[0m\u001b[1m \u001b[0m│\u001b[1m% \u001b[0m\u001b[1m \u001b[0m│\u001b[1m[22] \u001b[0m\u001b[1m \u001b[0m│ \n",
"╺━━━━━━┿━━━━━━━┿━━━━━━━┿━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━╸\n",
" ... │ │ │ │ │ \n",
" 4 │\u001b[1;31m 89%\u001b[0m │\u001b[1;31m 11%\u001b[0m │ │\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248minspect\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mstack\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m[\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m1\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m]\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mframe\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ... │ │ │ │ │ \n",
" │ │ │ │ │ \n",
"╶──────┼───────┼───────┼────┼─────────────────────────────────────────────────────────────────────────────────────────────────┼───╴\n",
" │ │ │ │ │\u001b[1;3mf…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3ms…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3mf…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3m<…\u001b[0m \n",
" 2 │\u001b[1;31m 89%\u001b[0m │\u001b[1;31m 11%\u001b[0m │ │\u001b[38;2;0;0;0;48;2;248;248;248mC1\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m__fspath__\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ╵ ╵ ╵ ╵ ╵ \n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">generated by the <a href=\"https://github.com/plasma-umass/scalene\"><span style=\"color: #0000ff; text-decoration-color: #0000ff\">scalene</span></a> profiler \n",
"</pre>\n"
],
"text/plain": [
"generated by the \u001b]8;id=1617518848.389377-908377;https://github.com/plasma-umass/scalene\u001b\\\u001b[94mscalene\u001b[0m\u001b]8;;\u001b\\ profiler \n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%scalene --reduced-profile --html\n",
"\n",
"for i in range(1000):\n",
" try:\n",
" with open(C1(),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
" except:\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\"> [23]: % of time = 99.56% out of 6.16s. </span>\n",
" ╷ ╷ ╷ ╷ ╷ \n",
" <span style=\"font-weight: bold\"> Line </span>│<span style=\"font-weight: bold\">Time % </span>│<span style=\"font-weight: bold\">Time % </span>│<span style=\"font-weight: bold\">Sys </span>│<span style=\"font-weight: bold\"> </span>│<span style=\"font-weight: bold\"> </span> \n",
" │<span style=\"font-weight: bold\">Python </span>│<span style=\"font-weight: bold\">native </span>│<span style=\"font-weight: bold\">% </span>│<span style=\"font-weight: bold\">[23] </span>│ \n",
"╺━━━━━━┿━━━━━━━┿━━━━━━━┿━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━╸\n",
" ... │ │ │ │ │ \n",
" 10 │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 79%</span> │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 10%</span> │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> frame </span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">=</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> inspect</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">stack()[</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">1</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">]</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">frame</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ... │ │ │ │ │ \n",
" 17 │ 7% │ │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> caller_src </span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">=</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> inspect</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">getsource(frame)</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ... │ │ │ │ │ \n",
" 35 │ 2% │ │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> ) </span><span style=\"color: #008000; text-decoration-color: #008000; background-color: #f8f8f8; font-weight: bold\">for</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> n </span><span style=\"color: #aa22ff; text-decoration-color: #aa22ff; background-color: #f8f8f8; font-weight: bold\">in</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> ast</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">walk(ast</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">parse(caller_src))</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ... │ │ │ │ │ \n",
" │ │ │ │ │ \n",
"╶──────┼───────┼───────┼────┼─────────────────────────────────────────────────────────────────────────────────────────────────┼───╴\n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">f…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">s…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">f…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">&lt;…</span> \n",
" 7 │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 88%</span> │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 11%</span> │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">C2</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">__fspath__</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ╵ ╵ ╵ ╵ ╵ \n",
"</pre>\n"
],
"text/plain": [
"\u001b[3m [23]: % of time = 99.56% out of 6.16s. \u001b[0m\n",
" ╷ ╷ ╷ ╷ ╷ \n",
" \u001b[1m \u001b[0m\u001b[1mLine\u001b[0m\u001b[1m \u001b[0m│\u001b[1mTime %\u001b[0m\u001b[1m \u001b[0m│\u001b[1mTime %\u001b[0m\u001b[1m \u001b[0m│\u001b[1mSys\u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1m \u001b[0m \n",
" │\u001b[1mPython\u001b[0m\u001b[1m \u001b[0m│\u001b[1mnative\u001b[0m\u001b[1m \u001b[0m│\u001b[1m% \u001b[0m\u001b[1m \u001b[0m│\u001b[1m[23] \u001b[0m\u001b[1m \u001b[0m│ \n",
"╺━━━━━━┿━━━━━━━┿━━━━━━━┿━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━╸\n",
" ... │ │ │ │ │ \n",
" 10 │\u001b[1;31m 79%\u001b[0m │\u001b[1;31m 10%\u001b[0m │ │\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mframe\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m=\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248minspect\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mstack\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m[\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m1\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m]\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mframe\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ... │ │ │ │ │ \n",
" 17 │ 7% │ │ │\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mcaller_src\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m=\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248minspect\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mgetsource\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mframe\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ... │ │ │ │ │ \n",
" 35 │ 2% │ │ │\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[1;38;2;0;128;0;48;2;248;248;248mfor\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mn\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[1;38;2;170;34;255;48;2;248;248;248min\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mast\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mwalk\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mast\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mparse\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mcaller_src\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ... │ │ │ │ │ \n",
" │ │ │ │ │ \n",
"╶──────┼───────┼───────┼────┼─────────────────────────────────────────────────────────────────────────────────────────────────┼───╴\n",
" │ │ │ │ │\u001b[1;3mf…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3ms…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3mf…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3m<…\u001b[0m \n",
" 7 │\u001b[1;31m 88%\u001b[0m │\u001b[1;31m 11%\u001b[0m │ │\u001b[38;2;0;0;0;48;2;248;248;248mC2\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m__fspath__\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ╵ ╵ ╵ ╵ ╵ \n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">generated by the <a href=\"https://github.com/plasma-umass/scalene\"><span style=\"color: #0000ff; text-decoration-color: #0000ff\">scalene</span></a> profiler \n",
"</pre>\n"
],
"text/plain": [
"generated by the \u001b]8;id=1617518854.577183-510445;https://github.com/plasma-umass/scalene\u001b\\\u001b[94mscalene\u001b[0m\u001b]8;;\u001b\\ profiler \n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%scalene --reduced-profile --html\n",
"\n",
"for i in range(1000):\n",
" try:\n",
" with open(C2(),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
" except:\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\"> [23]: % of time = 97.30% out of 0.55s. </span>\n",
" ╷ ╷ ╷ ╷ ╷ \n",
" <span style=\"font-weight: bold\"> Line </span>│<span style=\"font-weight: bold\">Time % </span>│<span style=\"font-weight: bold\">Time % </span>│<span style=\"font-weight: bold\">Sys </span>│<span style=\"font-weight: bold\"> </span>│<span style=\"font-weight: bold\"> </span> \n",
" │<span style=\"font-weight: bold\">Python </span>│<span style=\"font-weight: bold\">native </span>│<span style=\"font-weight: bold\">% </span>│<span style=\"font-weight: bold\">[23] </span>│ \n",
"╺━━━━━━┿━━━━━━━┿━━━━━━━┿━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━╸\n",
" ... │ │ │ │ │ \n",
" 17 │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 76%</span> │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 10%</span> │ 2% │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> caller_src </span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">=</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> inspect</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">getsource(frame)</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ... │ │ │ │ │ \n",
" 24 │ 2% │ │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> varname </span><span style=\"color: #008000; text-decoration-color: #008000; background-color: #f8f8f8; font-weight: bold\">for</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> varname, instance </span><span style=\"color: #aa22ff; text-decoration-color: #aa22ff; background-color: #f8f8f8; font-weight: bold\">in</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> caller_local_variables</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">items()</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" 25 │ 3% │ │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> </span><span style=\"color: #008000; text-decoration-color: #008000; background-color: #f8f8f8; font-weight: bold\">if</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> </span><span style=\"color: #008000; text-decoration-color: #008000; background-color: #f8f8f8\">isinstance</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">(instance, </span><span style=\"color: #008000; text-decoration-color: #008000; background-color: #f8f8f8\">type</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">(</span><span style=\"color: #008000; text-decoration-color: #008000; background-color: #f8f8f8\">self</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">))</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ... │ │ │ │ │ \n",
" 31 │ 4% │ │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> _is_open_call_write_with_var(</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ... │ │ │ │ │ \n",
" 35 │ 2% │ │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> ) </span><span style=\"color: #008000; text-decoration-color: #008000; background-color: #f8f8f8; font-weight: bold\">for</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> n </span><span style=\"color: #aa22ff; text-decoration-color: #aa22ff; background-color: #f8f8f8; font-weight: bold\">in</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> ast</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">walk(ast</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">parse(caller_src))</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ... │ │ │ │ │ \n",
" │ │ │ │ │ \n",
"╶──────┼───────┼───────┼─────┼─────────────────────────────────────────────────────────────────────────────────────────────────┼──╴\n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">…</span> \n",
" 7 │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 86%</span> │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 11%</span> │ 2% │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">C2</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">__fspath__</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ╵ ╵ ╵ ╵ ╵ \n",
"</pre>\n"
],
"text/plain": [
"\u001b[3m [23]: % of time = 97.30% out of 0.55s. \u001b[0m\n",
" ╷ ╷ ╷ ╷ ╷ \n",
" \u001b[1m \u001b[0m\u001b[1mLine\u001b[0m\u001b[1m \u001b[0m│\u001b[1mTime %\u001b[0m\u001b[1m \u001b[0m│\u001b[1mTime %\u001b[0m\u001b[1m \u001b[0m│\u001b[1mSys \u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1m \u001b[0m \n",
" │\u001b[1mPython\u001b[0m\u001b[1m \u001b[0m│\u001b[1mnative\u001b[0m\u001b[1m \u001b[0m│\u001b[1m% \u001b[0m\u001b[1m \u001b[0m│\u001b[1m[23] \u001b[0m\u001b[1m \u001b[0m│ \n",
"╺━━━━━━┿━━━━━━━┿━━━━━━━┿━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━╸\n",
" ... │ │ │ │ │ \n",
" 17 │\u001b[1;31m 76%\u001b[0m │\u001b[1;31m 10%\u001b[0m │ 2% │\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mcaller_src\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m=\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248minspect\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mgetsource\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mframe\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ... │ │ │ │ │ \n",
" 24 │ 2% │ │ │\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mvarname\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[1;38;2;0;128;0;48;2;248;248;248mfor\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mvarname\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m,\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248minstance\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[1;38;2;170;34;255;48;2;248;248;248min\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mcaller_local_variables\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mitems\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" 25 │ 3% │ │ │\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[1;38;2;0;128;0;48;2;248;248;248mif\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;128;0;48;2;248;248;248misinstance\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248minstance\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m,\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;128;0;48;2;248;248;248mtype\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;128;0;48;2;248;248;248mself\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ... │ │ │ │ │ \n",
" 31 │ 4% │ │ │\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m_is_open_call_write_with_var\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ... │ │ │ │ │ \n",
" 35 │ 2% │ │ │\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[1;38;2;0;128;0;48;2;248;248;248mfor\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mn\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[1;38;2;170;34;255;48;2;248;248;248min\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mast\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mwalk\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mast\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mparse\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mcaller_src\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ... │ │ │ │ │ \n",
" │ │ │ │ │ \n",
"╶──────┼───────┼───────┼─────┼─────────────────────────────────────────────────────────────────────────────────────────────────┼──╴\n",
" │ │ │ │ │\u001b[1;3m…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3m…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3m…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3m…\u001b[0m \n",
" 7 │\u001b[1;31m 86%\u001b[0m │\u001b[1;31m 11%\u001b[0m │ 2% │\u001b[38;2;0;0;0;48;2;248;248;248mC2\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m__fspath__\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ╵ ╵ ╵ ╵ ╵ \n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\"> [13]: % of time = 2.70% out of 0.55s. </span>\n",
" ╷ ╷ ╷ ╷ ╷ \n",
" <span style=\"font-weight: bold\"> Line </span>│<span style=\"font-weight: bold\">Time % </span>│<span style=\"font-weight: bold\">Time % </span>│<span style=\"font-weight: bold\">Sys </span>│<span style=\"font-weight: bold\"> </span>│<span style=\"font-weight: bold\"> </span> \n",
" │<span style=\"font-weight: bold\">Python </span>│<span style=\"font-weight: bold\">native </span>│<span style=\"font-weight: bold\">% </span>│<span style=\"font-weight: bold\">[13] </span>│ \n",
"╺━━━━━━┿━━━━━━━┿━━━━━━━┿━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━╸\n",
" ... │ │ │ │ │ \n",
" 13 │ 2% │ │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> </span><span style=\"color: #008000; text-decoration-color: #008000; background-color: #f8f8f8; font-weight: bold\">if</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> </span><span style=\"color: #aa22ff; text-decoration-color: #aa22ff; background-color: #f8f8f8; font-weight: bold\">not</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> </span><span style=\"color: #008000; text-decoration-color: #008000; background-color: #f8f8f8\">isinstance</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">(ast_node, ast</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">Call):</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ... │ │ │ │ │ \n",
" │ │ │ │ │ \n",
"╶──────┼───────┼───────┼────┼─────────────────────────────────────────────────────────────────────────────────────────────────┼───╴\n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">f…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">s…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">f…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">&lt;…</span> \n",
" 5 │ 2% │ │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">C2</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">_is_open_call_write_with_var</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ╵ ╵ ╵ ╵ ╵ \n",
"</pre>\n"
],
"text/plain": [
"\u001b[3m [13]: % of time = 2.70% out of 0.55s. \u001b[0m\n",
" ╷ ╷ ╷ ╷ ╷ \n",
" \u001b[1m \u001b[0m\u001b[1mLine\u001b[0m\u001b[1m \u001b[0m│\u001b[1mTime %\u001b[0m\u001b[1m \u001b[0m│\u001b[1mTime %\u001b[0m\u001b[1m \u001b[0m│\u001b[1mSys\u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1m \u001b[0m \n",
" │\u001b[1mPython\u001b[0m\u001b[1m \u001b[0m│\u001b[1mnative\u001b[0m\u001b[1m \u001b[0m│\u001b[1m% \u001b[0m\u001b[1m \u001b[0m│\u001b[1m[13] \u001b[0m\u001b[1m \u001b[0m│ \n",
"╺━━━━━━┿━━━━━━━┿━━━━━━━┿━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━╸\n",
" ... │ │ │ │ │ \n",
" 13 │ 2% │ │ │\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[1;38;2;0;128;0;48;2;248;248;248mif\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[1;38;2;170;34;255;48;2;248;248;248mnot\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;128;0;48;2;248;248;248misinstance\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mast_node\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m,\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mast\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mCall\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m:\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ... │ │ │ │ │ \n",
" │ │ │ │ │ \n",
"╶──────┼───────┼───────┼────┼─────────────────────────────────────────────────────────────────────────────────────────────────┼───╴\n",
" │ │ │ │ │\u001b[1;3mf…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3ms…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3mf…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3m<…\u001b[0m \n",
" 5 │ 2% │ │ │\u001b[38;2;0;0;0;48;2;248;248;248mC2\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m_is_open_call_write_with_var\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ╵ ╵ ╵ ╵ ╵ \n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">generated by the <a href=\"https://github.com/plasma-umass/scalene\"><span style=\"color: #0000ff; text-decoration-color: #0000ff\">scalene</span></a> profiler \n",
"</pre>\n"
],
"text/plain": [
"generated by the \u001b]8;id=1617518855.1870432-91386;https://github.com/plasma-umass/scalene\u001b\\\u001b[94mscalene\u001b[0m\u001b]8;;\u001b\\ profiler \n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%scalene --reduced-profile --html\n",
"\n",
"for i in range(1000):\n",
" try:\n",
" with open(C2(method=\"currentfrmae\"),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
" except:\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\"> [23]: % of time = 100.00% out of 0.56s. </span>\n",
" ╷ ╷ ╷ ╷ ╷ \n",
" <span style=\"font-weight: bold\"> Line </span>│<span style=\"font-weight: bold\">Time % </span>│<span style=\"font-weight: bold\">Time % </span>│<span style=\"font-weight: bold\">Sys </span>│<span style=\"font-weight: bold\"> </span>│<span style=\"font-weight: bold\"> </span> \n",
" │<span style=\"font-weight: bold\">Python </span>│<span style=\"font-weight: bold\">native </span>│<span style=\"font-weight: bold\">% </span>│<span style=\"font-weight: bold\">[23] </span>│ \n",
"╺━━━━━━┿━━━━━━━┿━━━━━━━┿━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━╸\n",
" ... │ │ │ │ │ \n",
" 7 │ 2% │ │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> </span><span style=\"color: #008000; text-decoration-color: #008000; background-color: #f8f8f8; font-weight: bold\">def</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> </span><span style=\"color: #0000ff; text-decoration-color: #0000ff; background-color: #f8f8f8\">__fspath__</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">(</span><span style=\"color: #008000; text-decoration-color: #008000; background-color: #f8f8f8\">self</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">):</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ... │ │ │ │ │ \n",
" 17 │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 72%</span> │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 9%</span> │ 2% │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> caller_src </span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">=</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> inspect</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">getsource(frame)</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ... │ │ │ │ │ \n",
" 31 │ 1% │ │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> _is_open_call_write_with_var(</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ... │ │ │ │ │ \n",
" 35 │ 14% │ 2% │ │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> ) </span><span style=\"color: #008000; text-decoration-color: #008000; background-color: #f8f8f8; font-weight: bold\">for</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> n </span><span style=\"color: #aa22ff; text-decoration-color: #aa22ff; background-color: #f8f8f8; font-weight: bold\">in</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\"> ast</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">walk(ast</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">parse(caller_src))</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ... │ │ │ │ │ \n",
" │ │ │ │ │ \n",
"╶──────┼───────┼───────┼─────┼─────────────────────────────────────────────────────────────────────────────────────────────────┼──╴\n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">…</span> \n",
" │ │ │ │ │<span style=\"font-weight: bold; font-style: italic\">…</span> \n",
" 7 │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 89%</span> │<span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\"> 11%</span> │ 2% │<span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">C2</span><span style=\"color: #666666; text-decoration-color: #666666; background-color: #f8f8f8\">.</span><span style=\"color: #000000; text-decoration-color: #000000; background-color: #f8f8f8\">__fspath__</span><span style=\"background-color: #f8f8f8\"> </span> │ \n",
" ╵ ╵ ╵ ╵ ╵ \n",
"</pre>\n"
],
"text/plain": [
"\u001b[3m [23]: % of time = 100.00% out of 0.56s. \u001b[0m\n",
" ╷ ╷ ╷ ╷ ╷ \n",
" \u001b[1m \u001b[0m\u001b[1mLine\u001b[0m\u001b[1m \u001b[0m│\u001b[1mTime %\u001b[0m\u001b[1m \u001b[0m│\u001b[1mTime %\u001b[0m\u001b[1m \u001b[0m│\u001b[1mSys \u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1m \u001b[0m \n",
" │\u001b[1mPython\u001b[0m\u001b[1m \u001b[0m│\u001b[1mnative\u001b[0m\u001b[1m \u001b[0m│\u001b[1m% \u001b[0m\u001b[1m \u001b[0m│\u001b[1m[23] \u001b[0m\u001b[1m \u001b[0m│ \n",
"╺━━━━━━┿━━━━━━━┿━━━━━━━┿━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━╸\n",
" ... │ │ │ │ │ \n",
" 7 │ 2% │ │ │\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[1;38;2;0;128;0;48;2;248;248;248mdef\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;255;48;2;248;248;248m__fspath__\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;128;0;48;2;248;248;248mself\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m:\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ... │ │ │ │ │ \n",
" 17 │\u001b[1;31m 72%\u001b[0m │\u001b[1;31m 9%\u001b[0m │ 2% │\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mcaller_src\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m=\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248minspect\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mgetsource\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mframe\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ... │ │ │ │ │ \n",
" 31 │ 1% │ │ │\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m_is_open_call_write_with_var\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ... │ │ │ │ │ \n",
" 35 │ 14% │ 2% │ │\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[1;38;2;0;128;0;48;2;248;248;248mfor\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mn\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[1;38;2;170;34;255;48;2;248;248;248min\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m \u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mast\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mwalk\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mast\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mparse\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m(\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248mcaller_src\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m)\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ... │ │ │ │ │ \n",
" │ │ │ │ │ \n",
"╶──────┼───────┼───────┼─────┼─────────────────────────────────────────────────────────────────────────────────────────────────┼──╴\n",
" │ │ │ │ │\u001b[1;3m…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3m…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3m…\u001b[0m \n",
" │ │ │ │ │\u001b[1;3m…\u001b[0m \n",
" 7 │\u001b[1;31m 89%\u001b[0m │\u001b[1;31m 11%\u001b[0m │ 2% │\u001b[38;2;0;0;0;48;2;248;248;248mC2\u001b[0m\u001b[38;2;102;102;102;48;2;248;248;248m.\u001b[0m\u001b[38;2;0;0;0;48;2;248;248;248m__fspath__\u001b[0m\u001b[48;2;248;248;248m \u001b[0m │ \n",
" ╵ ╵ ╵ ╵ ╵ \n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">generated by the <a href=\"https://github.com/plasma-umass/scalene\"><span style=\"color: #0000ff; text-decoration-color: #0000ff\">scalene</span></a> profiler \n",
"</pre>\n"
],
"text/plain": [
"generated by the \u001b]8;id=1617518855.7806919-754780;https://github.com/plasma-umass/scalene\u001b\\\u001b[94mscalene\u001b[0m\u001b]8;;\u001b\\ profiler \n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%scalene --reduced-profile --html\n",
"\n",
"for i in range(1000):\n",
" try:\n",
" with open(C2(method=\"_getframe\"),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
" except:\n",
" pass"
]
},
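{
"cell_type": "markdown",
"metadata": {},
"source": [
"Roughly 80% of the time in that profile lands on line 17, the `inspect.getsource(frame)` call, which has to read the entire file containing the calling frame before it can return the block. Here is a minimal standalone sketch of how that cost grows with the size of the caller's file. It is a hypothetical harness written for illustration (the temp-module helper, module name, and padding sizes are made up), not part of the experiment below.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch, not from the original experiment: time\n",
"# inspect.getsource on a frame that lives in files of increasing size.\n",
"import importlib.util\n",
"import tempfile\n",
"import time\n",
"from pathlib import Path\n",
"\n",
"def make_module(n_pad_lines):\n",
"    # Write a module whose one function fetches its own source, then import it.\n",
"    src = (\n",
"        \"import inspect\\n\"\n",
"        \"def f():\\n\"\n",
"        \"    return inspect.getsource(inspect.currentframe())\\n\"\n",
"    ) + \"pad = 0\\n\" * n_pad_lines\n",
"    path = Path(tempfile.mkdtemp()) / \"padded_mod.py\"\n",
"    path.write_text(src)\n",
"    spec = importlib.util.spec_from_file_location(\"padded_mod\", str(path))\n",
"    module = importlib.util.module_from_spec(spec)\n",
"    spec.loader.exec_module(module)\n",
"    return module\n",
"\n",
"for n in (10, 1_000, 100_000):\n",
"    module = make_module(n)\n",
"    start = time.perf_counter()\n",
"    module.f()  # cold call: getsource reads the whole file to find the block\n",
"    elapsed_ms = (time.perf_counter() - start) * 1e3\n",
"    print(f\"{n:>7} pad lines: {elapsed_ms:.2f} ms\")\n"
]
},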
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## What if we make the context huge by adding an entire spellchecker based on Peter Norvig's approach?\n",
"\n",
"https://github.com/barrust/pyspellchecker"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"160 µs ± 7.37 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"\"\"\" Additional utility functions \"\"\"\n",
"import contextlib\n",
"import gzip\n",
"import functools\n",
"import re\n",
"import warnings\n",
"\n",
"\n",
"def fail_after(version):\n",
" \"\"\" Decorator to add to tests to ensure that they fail if a deprecated\n",
" feature is not removed before the specified version\n",
"\n",
" Args:\n",
" version (str): The version to check against \"\"\"\n",
"\n",
" def decorator_wrapper(func):\n",
" @functools.wraps(func)\n",
" def test_inner(*args, **kwargs):\n",
" if [int(x) for x in version.split(\".\")] <= [\n",
" int(x) for x in __version__.split(\".\")\n",
" ]:\n",
" msg = \"The function {} must be fully removed as it is depricated and must be removed by version {}\".format(\n",
" func.__name__, version\n",
" )\n",
" raise AssertionError(msg)\n",
" return func(*args, **kwargs)\n",
"\n",
" return test_inner\n",
"\n",
" return decorator_wrapper\n",
"\n",
"\n",
"def deprecated(message=\"\"):\n",
" \"\"\" A simplistic decorator to mark functions as deprecated. The function\n",
" will pass a message to the user on the first use of the function\n",
"\n",
" Args:\n",
" message (str): The message to display if the function is deprecated\n",
" \"\"\"\n",
"\n",
" def decorator_wrapper(func):\n",
" @functools.wraps(func)\n",
" def function_wrapper(*args, **kwargs):\n",
" func_name = func.__name__\n",
" if func_name not in function_wrapper.deprecated_items:\n",
" msg = \"Function {} is now deprecated! {}\".format(func.__name__, message)\n",
" warnings.warn(msg, category=DeprecationWarning, stacklevel=2)\n",
" function_wrapper.deprecated_items.add(func_name)\n",
"\n",
" return func(*args, **kwargs)\n",
"\n",
" # set this up the first time the decorator is called\n",
" function_wrapper.deprecated_items = set()\n",
"\n",
" return function_wrapper\n",
"\n",
" return decorator_wrapper\n",
"\n",
"\n",
"def ensure_unicode(_str, encoding=\"utf-8\"):\n",
" \"\"\" Simplify checking if passed in data are bytes or a string and decode\n",
" bytes into unicode.\n",
"\n",
" Args:\n",
" _str (str): The input string (possibly bytes)\n",
" encoding (str): The encoding to use if input is bytes\n",
" Returns:\n",
" str: The encoded string\n",
" \"\"\"\n",
" if isinstance(_str, bytes):\n",
" return _str.decode(encoding)\n",
" return _str\n",
"\n",
"\n",
"@contextlib.contextmanager\n",
"def __gzip_read(filename, mode=\"rb\", encoding=\"UTF-8\"):\n",
" \"\"\" Context manager to correctly handle the decoding of the output of \\\n",
" the gzip file\n",
"\n",
" Args:\n",
" filename (str): The filename to open\n",
" mode (str): The mode to read the data\n",
" encoding (str): The file encoding to use\n",
" Yields:\n",
" str: The string data from the gzip file read\n",
" \"\"\"\n",
" with gzip.open(filename, mode=mode, encoding=encoding) as fobj:\n",
" yield fobj.read()\n",
"\n",
"\n",
"@contextlib.contextmanager\n",
"def load_file(filename, encoding):\n",
" \"\"\" Context manager to handle opening a gzip or text file correctly and\n",
" reading all the data\n",
"\n",
" Args:\n",
" filename (str): The filename to open\n",
" encoding (str): The file encoding to use\n",
" Yields:\n",
" str: The string data from the file read\n",
" \"\"\"\n",
" if filename[-3:].lower() == \".gz\":\n",
" with __gzip_read(filename, mode=\"rt\", encoding=encoding) as data:\n",
" yield data\n",
" else:\n",
" with open(filename, mode=\"r\", encoding=encoding) as fobj:\n",
" yield fobj.read()\n",
"\n",
"\n",
"def write_file(filepath, encoding, gzipped, data):\n",
" \"\"\" Write the data to file either as a gzip file or text based on the\n",
" gzipped parameter\n",
"\n",
" Args:\n",
" filepath (str): The filename to open\n",
" encoding (str): The file encoding to use\n",
" gzipped (bool): Whether the file should be gzipped or not\n",
" data (str): The data to be written out\n",
" \"\"\"\n",
" if gzipped:\n",
" with gzip.open(filepath, \"wt\") as fobj:\n",
" fobj.write(data)\n",
" else:\n",
" with open(filepath, \"w\", encoding=encoding) as fobj:\n",
" fobj.write(data)\n",
"\n",
"\n",
"def _parse_into_words(text):\n",
" \"\"\" Parse the text into words; currently removes punctuation except for\n",
" apostrophies.\n",
"\n",
" Args:\n",
" text (str): The text to split into words\n",
" \"\"\"\n",
" # see: https://stackoverflow.com/a/12705513\n",
" return re.findall(r\"(\\w[\\w']*\\w|\\w)\", text)\n",
"\n",
"\n",
"\"\"\" SpellChecker Module; simple, intuitive spell checker based on the post by\n",
" Peter Norvig. See: https://norvig.com/spell-correct.html \"\"\"\n",
"import gzip\n",
"import json\n",
"import pkgutil\n",
"import string\n",
"from collections import Counter\n",
"\n",
"\n",
"class SpellChecker(object):\n",
" \"\"\" The SpellChecker class encapsulates the basics needed to accomplish a\n",
" simple spell checking algorithm. It is based on the work by\n",
" Peter Norvig (https://norvig.com/spell-correct.html)\n",
"\n",
" Args:\n",
" language (str): The language of the dictionary to load or None \\\n",
" for no dictionary. Supported languages are `en`, `es`, `de`, `fr`, \\\n",
" `pt` and `ru`. Defaults to `en`. A list of languages may be \\\n",
" provided and all languages will be loaded.\n",
" local_dictionary (str): The path to a locally stored word \\\n",
" frequency dictionary; if provided, no language will be loaded\n",
" distance (int): The edit distance to use. Defaults to 2.\n",
" case_sensitive (bool): Flag to use a case sensitive dictionary or \\\n",
" not, only available when not using a language dictionary.\n",
" Note:\n",
" Using a case sensitive dictionary can be slow to correct words.\"\"\"\n",
"\n",
" __slots__ = [\"_distance\", \"_word_frequency\", \"_tokenizer\", \"_case_sensitive\"]\n",
"\n",
" def __init__(\n",
" self,\n",
" language=\"en\",\n",
" local_dictionary=None,\n",
" distance=2,\n",
" tokenizer=None,\n",
" case_sensitive=False,\n",
" ):\n",
" self._distance = None\n",
" self.distance = distance # use the setter value check\n",
"\n",
" self._tokenizer = _parse_into_words\n",
" if tokenizer is not None:\n",
" self._tokenizer = tokenizer\n",
"\n",
" self._case_sensitive = case_sensitive if not language else False\n",
" self._word_frequency = WordFrequency(self._tokenizer, self._case_sensitive)\n",
"\n",
" if local_dictionary:\n",
" self._word_frequency.load_dictionary(local_dictionary)\n",
" elif language:\n",
" if not isinstance(language, list):\n",
" language = [language]\n",
" for lang in language:\n",
" filename = \"resources/{}.json.gz\".format(lang.lower())\n",
" try:\n",
" json_open = pkgutil.get_data(\"spellchecker\", filename)\n",
" except FileNotFoundError:\n",
" msg = (\n",
" \"The provided dictionary language ({}) does not \" \"exist!\"\n",
" ).format(lang.lower())\n",
" raise ValueError(msg)\n",
"\n",
" lang_dict = json.loads(gzip.decompress(json_open).decode(\"utf-8\"))\n",
" self._word_frequency.load_json(lang_dict)\n",
"\n",
" def __contains__(self, key):\n",
" \"\"\" setup easier known checks \"\"\"\n",
" key = ensure_unicode(key)\n",
" return key in self._word_frequency\n",
"\n",
" def __getitem__(self, key):\n",
" \"\"\" setup easier frequency checks \"\"\"\n",
" key = ensure_unicode(key)\n",
" return self._word_frequency[key]\n",
"\n",
" def __iter__(self):\n",
" \"\"\" setup iter support \"\"\"\n",
" for word in self._word_frequency.dictionary:\n",
" yield word\n",
"\n",
" @property\n",
" def word_frequency(self):\n",
" \"\"\" WordFrequency: An encapsulation of the word frequency `dictionary`\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._word_frequency\n",
"\n",
" @property\n",
" def distance(self):\n",
" \"\"\" int: The maximum edit distance to calculate\n",
"\n",
" Note:\n",
" Valid values are 1 or 2; if an invalid value is passed, \\\n",
" defaults to 2 \"\"\"\n",
" return self._distance\n",
"\n",
" @distance.setter\n",
" def distance(self, val):\n",
" \"\"\" set the distance parameter \"\"\"\n",
" tmp = 2\n",
" try:\n",
" int(val)\n",
" if val > 0 and val <= 2:\n",
" tmp = val\n",
" except (ValueError, TypeError):\n",
" pass\n",
" self._distance = tmp\n",
"\n",
" def split_words(self, text):\n",
" \"\"\" Split text into individual `words` using either a simple whitespace\n",
" regex or the passed in tokenizer\n",
"\n",
" Args:\n",
" text (str): The text to split into individual words\n",
" Returns:\n",
" list(str): A listing of all words in the provided text \"\"\"\n",
" text = ensure_unicode(text)\n",
" return self._tokenizer(text)\n",
"\n",
" def export(self, filepath, encoding=\"utf-8\", gzipped=True):\n",
" \"\"\" Export the word frequency list for import in the future\n",
"\n",
" Args:\n",
" filepath (str): The filepath to the exported dictionary\n",
" encoding (str): The encoding of the resulting output\n",
" gzipped (bool): Whether to gzip the dictionary or not \"\"\"\n",
" data = json.dumps(self.word_frequency.dictionary, sort_keys=True)\n",
" write_file(filepath, encoding, gzipped, data)\n",
"\n",
" def word_usage_frequency(self, word, total_words=None):\n",
" \"\"\" Calculate the frequency to the `word` provided as seen across the\n",
" entire dictionary\n",
"\n",
" Args:\n",
" word (str): The word for which the word probability is \\\n",
" calculated\n",
" total_words (int): The total number of words to use in the \\\n",
" calculation; use the default for using the whole word \\\n",
" frequency\n",
" Returns:\n",
" float: The probability that the word is the correct word \"\"\"\n",
" if not total_words:\n",
" total_words = self._word_frequency.total_words\n",
" word = ensure_unicode(word)\n",
" return self._word_frequency.dictionary[word] / total_words\n",
"\n",
" @deprecated(\"Deprecated as of version 0.6.1; use word_usage_frequency instead\")\n",
" def word_probability(self, word, total_words=None):\n",
" \"\"\" Calculate the frequency to the `word` provided as seen across the\n",
" entire dictionary; function was a misnomar and is therefore\n",
" deprecated!\n",
"\n",
" Args:\n",
" word (str): The word for which the word probability is \\\n",
" calculated\n",
" total_words (int): The total number of words to use in the \\\n",
" calculation; use the default for using the whole word \\\n",
" frequency\n",
" Returns:\n",
" float: The probability that the word is the correct word\n",
" Note:\n",
" Deprecated as of version 0.6.1; use `word_usage_frequency` \\\n",
" instead\n",
" Note:\n",
" Will be removed in version 0.6.3 \"\"\"\n",
" return self.word_usage_frequency(word, total_words)\n",
"\n",
" def correction(self, word):\n",
" \"\"\" The most probable correct spelling for the word\n",
"\n",
" Args:\n",
" word (str): The word to correct\n",
" Returns:\n",
" str: The most likely candidate \"\"\"\n",
" word = ensure_unicode(word)\n",
" candidates = list(self.candidates(word))\n",
" return max(sorted(candidates), key=self.__getitem__)\n",
"\n",
" def candidates(self, word):\n",
" \"\"\" Generate possible spelling corrections for the provided word up to\n",
" an edit distance of two, if and only when needed\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate candidate spellings\n",
" Returns:\n",
" set: The set of words that are possible candidates \"\"\"\n",
" word = ensure_unicode(word)\n",
" if self.known([word]): # short-cut if word is correct already\n",
" return {word}\n",
"\n",
" if not self._check_if_should_check(word):\n",
" return {word}\n",
"\n",
" # get edit distance 1...\n",
" res = [x for x in self.edit_distance_1(word)]\n",
" tmp = self.known(res)\n",
" if tmp:\n",
" return tmp\n",
" # if still not found, use the edit distance 1 to calc edit distance 2\n",
" if self._distance == 2:\n",
" tmp = self.known([x for x in self.__edit_distance_alt(res)])\n",
" if tmp:\n",
" return tmp\n",
" return {word}\n",
"\n",
" def known(self, words):\n",
" \"\"\" The subset of `words` that appear in the dictionary of words\n",
"\n",
" Args:\n",
" words (list): List of words to determine which are in the \\\n",
" corpus\n",
" Returns:\n",
" set: The set of those words from the input that are in the \\\n",
" corpus \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [w if self._case_sensitive else w.lower() for w in words]\n",
" return set(\n",
" w\n",
" for w in tmp\n",
" if w in self._word_frequency.dictionary and self._check_if_should_check(w)\n",
" )\n",
"\n",
" def unknown(self, words):\n",
" \"\"\" The subset of `words` that do not appear in the dictionary\n",
"\n",
" Args:\n",
" words (list): List of words to determine which are not in the \\\n",
" corpus\n",
" Returns:\n",
" set: The set of those words from the input that are not in \\\n",
" the corpus \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [\n",
" w if self._case_sensitive else w.lower()\n",
" for w in words\n",
" if self._check_if_should_check(w)\n",
" ]\n",
" return set(w for w in tmp if w not in self._word_frequency.dictionary)\n",
"\n",
" def edit_distance_1(self, word):\n",
" \"\"\" Compute all strings that are one edit away from `word` using only\n",
" the letters in the corpus\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance one from the \\\n",
" provided word \"\"\"\n",
" word = (\n",
" ensure_unicode(word).lower()\n",
" if not self._case_sensitive\n",
" else ensure_unicode(word)\n",
" )\n",
" if self._check_if_should_check(word) is False:\n",
" return {word}\n",
" letters = self._word_frequency.letters\n",
" splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]\n",
" deletes = [L + R[1:] for L, R in splits if R]\n",
" transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]\n",
" replaces = [L + c + R[1:] for L, R in splits if R for c in letters]\n",
" inserts = [L + c + R for L, R in splits for c in letters]\n",
" return set(deletes + transposes + replaces + inserts)\n",
"\n",
" def edit_distance_2(self, word):\n",
" \"\"\" Compute all strings that are two edits away from `word` using only\n",
" the letters in the corpus\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance two from the \\\n",
" provided word \"\"\"\n",
" word = (\n",
" ensure_unicode(word).lower()\n",
" if not self._case_sensitive\n",
" else ensure_unicode(word)\n",
" )\n",
" return [\n",
" e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)\n",
" ]\n",
"\n",
" def __edit_distance_alt(self, words):\n",
" \"\"\" Compute all strings that are 1 edits away from all the words using\n",
" only the letters in the corpus\n",
"\n",
" Args:\n",
" words (list): The words for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance two from the \\\n",
" provided words \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [\n",
" w if self._case_sensitive else w.lower()\n",
" for w in words\n",
" if self._check_if_should_check(w)\n",
" ]\n",
" return [e2 for e1 in tmp for e2 in self.known(self.edit_distance_1(e1))]\n",
"\n",
" def _check_if_should_check(self, word):\n",
" if len(word) == 1 and word in string.punctuation:\n",
" return False\n",
" if (\n",
" len(word) > self._word_frequency.longest_word_length + 3\n",
" ): # magic number to allow removal of up to 2 letters.\n",
" return False\n",
" try: # check if it is a number (int, float, etc)\n",
" float(word)\n",
" return False\n",
" except ValueError:\n",
" pass\n",
"\n",
" return True\n",
"\n",
"\n",
"class WordFrequency(object):\n",
" \"\"\" Store the `dictionary` as a word frequency list while allowing for\n",
" different methods to load the data and update over time \"\"\"\n",
"\n",
" __slots__ = [\n",
" \"_dictionary\",\n",
" \"_total_words\",\n",
" \"_unique_words\",\n",
" \"_letters\",\n",
" \"_tokenizer\",\n",
" \"_case_sensitive\",\n",
" \"_longest_word_length\",\n",
" ]\n",
"\n",
" def __init__(self, tokenizer=None, case_sensitive=False):\n",
" self._dictionary = Counter()\n",
" self._total_words = 0\n",
" self._unique_words = 0\n",
" self._letters = set()\n",
" self._case_sensitive = case_sensitive\n",
" self._longest_word_length = 0\n",
"\n",
" self._tokenizer = _parse_into_words\n",
" if tokenizer is not None:\n",
" self._tokenizer = tokenizer\n",
"\n",
" def __contains__(self, key):\n",
" \"\"\" turn on contains \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return key in self._dictionary\n",
"\n",
" def __getitem__(self, key):\n",
" \"\"\" turn on getitem \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return self._dictionary[key]\n",
"\n",
" def __iter__(self):\n",
" \"\"\" turn on iter support \"\"\"\n",
" for word in self._dictionary:\n",
" yield word\n",
"\n",
" def pop(self, key, default=None):\n",
" \"\"\" Remove the key and return the associated value or default if not\n",
" found\n",
"\n",
" Args:\n",
" key (str): The key to remove\n",
" default (obj): The value to return if key is not present \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return self._dictionary.pop(key, default)\n",
"\n",
" @property\n",
" def dictionary(self):\n",
" \"\"\" Counter: A counting dictionary of all words in the corpus and the \\\n",
" number of times each has been seen\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._dictionary\n",
"\n",
" @property\n",
" def total_words(self):\n",
" \"\"\" int: The sum of all word occurances in the word frequency \\\n",
" dictionary\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._total_words\n",
"\n",
" @property\n",
" def unique_words(self):\n",
" \"\"\" int: The total number of unique words in the word frequency list\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._unique_words\n",
"\n",
" @property\n",
" def letters(self):\n",
" \"\"\" str: The listing of all letters found within the corpus\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._letters\n",
"\n",
" @property\n",
" def longest_word_length(self):\n",
" \"\"\" int: The longest word length in the dictionary\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._longest_word_length\n",
"\n",
" def tokenize(self, text):\n",
" \"\"\" Tokenize the provided string object into individual words\n",
"\n",
" Args:\n",
" text (str): The string object to tokenize\n",
" Yields:\n",
" str: The next `word` in the tokenized string\n",
" Note:\n",
" This is the same as the `spellchecker.split_words()` unless \\\n",
" a tokenizer function was provided. \"\"\"\n",
" text = ensure_unicode(text)\n",
" for word in self._tokenizer(text):\n",
" yield word if self._case_sensitive else word.lower()\n",
"\n",
" def keys(self):\n",
" \"\"\" Iterator over the key of the dictionary\n",
"\n",
" Yields:\n",
" str: The next key in the dictionary\n",
" Note:\n",
" This is the same as `spellchecker.words()` \"\"\"\n",
" for key in self._dictionary.keys():\n",
" yield key\n",
"\n",
" def words(self):\n",
" \"\"\" Iterator over the words in the dictionary\n",
"\n",
" Yields:\n",
" str: The next word in the dictionary\n",
" Note:\n",
" This is the same as `spellchecker.keys()` \"\"\"\n",
" for word in self._dictionary.keys():\n",
" yield word\n",
"\n",
" def items(self):\n",
" \"\"\" Iterator over the words in the dictionary\n",
"\n",
" Yields:\n",
" str: The next word in the dictionary\n",
" int: The number of instances in the dictionary\n",
" Note:\n",
" This is the same as `dict.items()` \"\"\"\n",
" for word in self._dictionary.keys():\n",
" yield word, self._dictionary[word]\n",
"\n",
" def load_dictionary(self, filename, encoding=\"utf-8\"):\n",
" \"\"\" Load in a pre-built word frequency list\n",
"\n",
" Args:\n",
" filename (str): The filepath to the json (optionally gzipped) \\\n",
" file to be loaded\n",
" encoding (str): The encoding of the dictionary \"\"\"\n",
" with load_file(filename, encoding) as data:\n",
" data = data if self._case_sensitive else data.lower()\n",
" self._dictionary.update(json.loads(data))\n",
" self._update_dictionary()\n",
"\n",
" def load_json(self, data):\n",
" \"\"\" Load in a pre-built word frequency list\n",
"\n",
" Args:\n",
" data (dict): The dictionary to be loaded \"\"\"\n",
" self._dictionary.update(data)\n",
" self._update_dictionary()\n",
"\n",
" def load_text_file(self, filename, encoding=\"utf-8\", tokenizer=None):\n",
" \"\"\" Load in a text file from which to generate a word frequency list\n",
"\n",
" Args:\n",
" filename (str): The filepath to the text file to be loaded\n",
" encoding (str): The encoding of the text file\n",
" tokenizer (function): The function to use to tokenize a string\n",
" \"\"\"\n",
" with load_file(filename, encoding=encoding) as data:\n",
" self.load_text(data, tokenizer)\n",
"\n",
" def load_text(self, text, tokenizer=None):\n",
" \"\"\" Load text from which to generate a word frequency list\n",
"\n",
" Args:\n",
" text (str): The text to be loaded\n",
" tokenizer (function): The function to use to tokenize a string\n",
" \"\"\"\n",
" text = ensure_unicode(text)\n",
" if tokenizer:\n",
" words = [x if self._case_sensitive else x.lower() for x in tokenizer(text)]\n",
" else:\n",
" words = self.tokenize(text)\n",
"\n",
" self._dictionary.update(words)\n",
" self._update_dictionary()\n",
"\n",
" def load_words(self, words):\n",
" \"\"\" Load a list of words from which to generate a word frequency list\n",
"\n",
" Args:\n",
" words (list): The list of words to be loaded \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" self._dictionary.update(\n",
" [word if self._case_sensitive else word.lower() for word in words]\n",
" )\n",
" self._update_dictionary()\n",
"\n",
" def add(self, word):\n",
" \"\"\" Add a word to the word frequency list\n",
"\n",
" Args:\n",
" word (str): The word to add \"\"\"\n",
" word = ensure_unicode(word)\n",
" self.load_words([word])\n",
"\n",
" def remove_words(self, words):\n",
" \"\"\" Remove a list of words from the word frequency list\n",
"\n",
" Args:\n",
" words (list): The list of words to remove \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" for word in words:\n",
" self._dictionary.pop(word if self._case_sensitive else word.lower())\n",
" self._update_dictionary()\n",
"\n",
" def remove(self, word):\n",
" \"\"\" Remove a word from the word frequency list\n",
"\n",
" Args:\n",
" word (str): The word to remove \"\"\"\n",
" word = ensure_unicode(word)\n",
" self._dictionary.pop(word if self._case_sensitive else word.lower())\n",
" self._update_dictionary()\n",
"\n",
" def remove_by_threshold(self, threshold=5):\n",
" \"\"\" Remove all words at, or below, the provided threshold\n",
"\n",
" Args:\n",
" threshold (int): The threshold at which a word is to be \\\n",
" removed \"\"\"\n",
" keys = [x for x in self._dictionary.keys()]\n",
" for key in keys:\n",
" if self._dictionary[key] <= threshold:\n",
" self._dictionary.pop(key)\n",
" self._update_dictionary()\n",
"\n",
" def _update_dictionary(self):\n",
" \"\"\" Update the word frequency object \"\"\"\n",
" self._longest_word_length = 0\n",
" self._total_words = sum(self._dictionary.values())\n",
" self._unique_words = len(self._dictionary.keys())\n",
" self._letters = set()\n",
" for key in self._dictionary:\n",
" if len(key) > self._longest_word_length:\n",
" self._longest_word_length = len(key)\n",
" self._letters.update(key)\n",
" \n",
"\n",
"try:\n",
" with open(Base(),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"except:\n",
" pass\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7.12 ms ± 37.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"\"\"\" Additional utility functions \"\"\"\n",
"import contextlib\n",
"import gzip\n",
"import functools\n",
"import re\n",
"import warnings\n",
"\n",
"\n",
"def fail_after(version):\n",
" \"\"\" Decorator to add to tests to ensure that they fail if a deprecated\n",
" feature is not removed before the specified version\n",
"\n",
" Args:\n",
" version (str): The version to check against \"\"\"\n",
"\n",
" def decorator_wrapper(func):\n",
" @functools.wraps(func)\n",
" def test_inner(*args, **kwargs):\n",
" if [int(x) for x in version.split(\".\")] <= [\n",
" int(x) for x in __version__.split(\".\")\n",
" ]:\n",
" msg = \"The function {} must be fully removed as it is depricated and must be removed by version {}\".format(\n",
" func.__name__, version\n",
" )\n",
" raise AssertionError(msg)\n",
" return func(*args, **kwargs)\n",
"\n",
" return test_inner\n",
"\n",
" return decorator_wrapper\n",
"\n",
"\n",
"def deprecated(message=\"\"):\n",
" \"\"\" A simplistic decorator to mark functions as deprecated. The function\n",
" will pass a message to the user on the first use of the function\n",
"\n",
" Args:\n",
" message (str): The message to display if the function is deprecated\n",
" \"\"\"\n",
"\n",
" def decorator_wrapper(func):\n",
" @functools.wraps(func)\n",
" def function_wrapper(*args, **kwargs):\n",
" func_name = func.__name__\n",
" if func_name not in function_wrapper.deprecated_items:\n",
" msg = \"Function {} is now deprecated! {}\".format(func.__name__, message)\n",
" warnings.warn(msg, category=DeprecationWarning, stacklevel=2)\n",
" function_wrapper.deprecated_items.add(func_name)\n",
"\n",
" return func(*args, **kwargs)\n",
"\n",
" # set this up the first time the decorator is called\n",
" function_wrapper.deprecated_items = set()\n",
"\n",
" return function_wrapper\n",
"\n",
" return decorator_wrapper\n",
"\n",
"\n",
"def ensure_unicode(_str, encoding=\"utf-8\"):\n",
" \"\"\" Simplify checking if passed in data are bytes or a string and decode\n",
" bytes into unicode.\n",
"\n",
" Args:\n",
" _str (str): The input string (possibly bytes)\n",
" encoding (str): The encoding to use if input is bytes\n",
" Returns:\n",
" str: The encoded string\n",
" \"\"\"\n",
" if isinstance(_str, bytes):\n",
" return _str.decode(encoding)\n",
" return _str\n",
"\n",
"\n",
"@contextlib.contextmanager\n",
"def __gzip_read(filename, mode=\"rb\", encoding=\"UTF-8\"):\n",
" \"\"\" Context manager to correctly handle the decoding of the output of \\\n",
" the gzip file\n",
"\n",
" Args:\n",
" filename (str): The filename to open\n",
" mode (str): The mode to read the data\n",
" encoding (str): The file encoding to use\n",
" Yields:\n",
" str: The string data from the gzip file read\n",
" \"\"\"\n",
" with gzip.open(filename, mode=mode, encoding=encoding) as fobj:\n",
" yield fobj.read()\n",
"\n",
"\n",
"@contextlib.contextmanager\n",
"def load_file(filename, encoding):\n",
" \"\"\" Context manager to handle opening a gzip or text file correctly and\n",
" reading all the data\n",
"\n",
" Args:\n",
" filename (str): The filename to open\n",
" encoding (str): The file encoding to use\n",
" Yields:\n",
" str: The string data from the file read\n",
" \"\"\"\n",
" if filename[-3:].lower() == \".gz\":\n",
" with __gzip_read(filename, mode=\"rt\", encoding=encoding) as data:\n",
" yield data\n",
" else:\n",
" with open(filename, mode=\"r\", encoding=encoding) as fobj:\n",
" yield fobj.read()\n",
"\n",
"\n",
"def write_file(filepath, encoding, gzipped, data):\n",
" \"\"\" Write the data to file either as a gzip file or text based on the\n",
" gzipped parameter\n",
"\n",
" Args:\n",
" filepath (str): The filename to open\n",
" encoding (str): The file encoding to use\n",
" gzipped (bool): Whether the file should be gzipped or not\n",
" data (str): The data to be written out\n",
" \"\"\"\n",
" if gzipped:\n",
" with gzip.open(filepath, \"wt\") as fobj:\n",
" fobj.write(data)\n",
" else:\n",
" with open(filepath, \"w\", encoding=encoding) as fobj:\n",
" fobj.write(data)\n",
"\n",
"\n",
"def _parse_into_words(text):\n",
" \"\"\" Parse the text into words; currently removes punctuation except for\n",
" apostrophies.\n",
"\n",
" Args:\n",
" text (str): The text to split into words\n",
" \"\"\"\n",
" # see: https://stackoverflow.com/a/12705513\n",
" return re.findall(r\"(\\w[\\w']*\\w|\\w)\", text)\n",
"\n",
"\n",
"\"\"\" SpellChecker Module; simple, intuitive spell checker based on the post by\n",
" Peter Norvig. See: https://norvig.com/spell-correct.html \"\"\"\n",
"import gzip\n",
"import json\n",
"import pkgutil\n",
"import string\n",
"from collections import Counter\n",
"\n",
"\n",
"class SpellChecker(object):\n",
" \"\"\" The SpellChecker class encapsulates the basics needed to accomplish a\n",
" simple spell checking algorithm. It is based on the work by\n",
" Peter Norvig (https://norvig.com/spell-correct.html)\n",
"\n",
" Args:\n",
" language (str): The language of the dictionary to load or None \\\n",
" for no dictionary. Supported languages are `en`, `es`, `de`, `fr`, \\\n",
" `pt` and `ru`. Defaults to `en`. A list of languages may be \\\n",
" provided and all languages will be loaded.\n",
" local_dictionary (str): The path to a locally stored word \\\n",
" frequency dictionary; if provided, no language will be loaded\n",
" distance (int): The edit distance to use. Defaults to 2.\n",
" case_sensitive (bool): Flag to use a case sensitive dictionary or \\\n",
" not, only available when not using a language dictionary.\n",
" Note:\n",
" Using a case sensitive dictionary can be slow to correct words.\"\"\"\n",
"\n",
" __slots__ = [\"_distance\", \"_word_frequency\", \"_tokenizer\", \"_case_sensitive\"]\n",
"\n",
" def __init__(\n",
" self,\n",
" language=\"en\",\n",
" local_dictionary=None,\n",
" distance=2,\n",
" tokenizer=None,\n",
" case_sensitive=False,\n",
" ):\n",
" self._distance = None\n",
" self.distance = distance # use the setter value check\n",
"\n",
" self._tokenizer = _parse_into_words\n",
" if tokenizer is not None:\n",
" self._tokenizer = tokenizer\n",
"\n",
" self._case_sensitive = case_sensitive if not language else False\n",
" self._word_frequency = WordFrequency(self._tokenizer, self._case_sensitive)\n",
"\n",
" if local_dictionary:\n",
" self._word_frequency.load_dictionary(local_dictionary)\n",
" elif language:\n",
" if not isinstance(language, list):\n",
" language = [language]\n",
" for lang in language:\n",
" filename = \"resources/{}.json.gz\".format(lang.lower())\n",
" try:\n",
" json_open = pkgutil.get_data(\"spellchecker\", filename)\n",
" except FileNotFoundError:\n",
" msg = (\n",
" \"The provided dictionary language ({}) does not \" \"exist!\"\n",
" ).format(lang.lower())\n",
" raise ValueError(msg)\n",
"\n",
" lang_dict = json.loads(gzip.decompress(json_open).decode(\"utf-8\"))\n",
" self._word_frequency.load_json(lang_dict)\n",
"\n",
" def __contains__(self, key):\n",
" \"\"\" setup easier known checks \"\"\"\n",
" key = ensure_unicode(key)\n",
" return key in self._word_frequency\n",
"\n",
" def __getitem__(self, key):\n",
" \"\"\" setup easier frequency checks \"\"\"\n",
" key = ensure_unicode(key)\n",
" return self._word_frequency[key]\n",
"\n",
" def __iter__(self):\n",
" \"\"\" setup iter support \"\"\"\n",
" for word in self._word_frequency.dictionary:\n",
" yield word\n",
"\n",
" @property\n",
" def word_frequency(self):\n",
" \"\"\" WordFrequency: An encapsulation of the word frequency `dictionary`\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._word_frequency\n",
"\n",
" @property\n",
" def distance(self):\n",
" \"\"\" int: The maximum edit distance to calculate\n",
"\n",
" Note:\n",
" Valid values are 1 or 2; if an invalid value is passed, \\\n",
" defaults to 2 \"\"\"\n",
" return self._distance\n",
"\n",
" @distance.setter\n",
" def distance(self, val):\n",
" \"\"\" set the distance parameter \"\"\"\n",
" tmp = 2\n",
" try:\n",
" int(val)\n",
" if val > 0 and val <= 2:\n",
" tmp = val\n",
" except (ValueError, TypeError):\n",
" pass\n",
" self._distance = tmp\n",
"\n",
" def split_words(self, text):\n",
" \"\"\" Split text into individual `words` using either a simple whitespace\n",
" regex or the passed in tokenizer\n",
"\n",
" Args:\n",
" text (str): The text to split into individual words\n",
" Returns:\n",
" list(str): A listing of all words in the provided text \"\"\"\n",
" text = ensure_unicode(text)\n",
" return self._tokenizer(text)\n",
"\n",
" def export(self, filepath, encoding=\"utf-8\", gzipped=True):\n",
" \"\"\" Export the word frequency list for import in the future\n",
"\n",
" Args:\n",
" filepath (str): The filepath to the exported dictionary\n",
" encoding (str): The encoding of the resulting output\n",
" gzipped (bool): Whether to gzip the dictionary or not \"\"\"\n",
" data = json.dumps(self.word_frequency.dictionary, sort_keys=True)\n",
" write_file(filepath, encoding, gzipped, data)\n",
"\n",
" def word_usage_frequency(self, word, total_words=None):\n",
" \"\"\" Calculate the frequency to the `word` provided as seen across the\n",
" entire dictionary\n",
"\n",
" Args:\n",
" word (str): The word for which the word probability is \\\n",
" calculated\n",
" total_words (int): The total number of words to use in the \\\n",
" calculation; use the default for using the whole word \\\n",
" frequency\n",
" Returns:\n",
" float: The probability that the word is the correct word \"\"\"\n",
" if not total_words:\n",
" total_words = self._word_frequency.total_words\n",
" word = ensure_unicode(word)\n",
" return self._word_frequency.dictionary[word] / total_words\n",
"\n",
" @deprecated(\"Deprecated as of version 0.6.1; use word_usage_frequency instead\")\n",
" def word_probability(self, word, total_words=None):\n",
" \"\"\" Calculate the frequency to the `word` provided as seen across the\n",
" entire dictionary; function was a misnomar and is therefore\n",
" deprecated!\n",
"\n",
" Args:\n",
" word (str): The word for which the word probability is \\\n",
" calculated\n",
" total_words (int): The total number of words to use in the \\\n",
" calculation; use the default for using the whole word \\\n",
" frequency\n",
" Returns:\n",
" float: The probability that the word is the correct word\n",
" Note:\n",
" Deprecated as of version 0.6.1; use `word_usage_frequency` \\\n",
" instead\n",
" Note:\n",
" Will be removed in version 0.6.3 \"\"\"\n",
" return self.word_usage_frequency(word, total_words)\n",
"\n",
" def correction(self, word):\n",
" \"\"\" The most probable correct spelling for the word\n",
"\n",
" Args:\n",
" word (str): The word to correct\n",
" Returns:\n",
" str: The most likely candidate \"\"\"\n",
" word = ensure_unicode(word)\n",
" candidates = list(self.candidates(word))\n",
" return max(sorted(candidates), key=self.__getitem__)\n",
"\n",
" def candidates(self, word):\n",
" \"\"\" Generate possible spelling corrections for the provided word up to\n",
" an edit distance of two, if and only when needed\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate candidate spellings\n",
" Returns:\n",
" set: The set of words that are possible candidates \"\"\"\n",
" word = ensure_unicode(word)\n",
" if self.known([word]): # short-cut if word is correct already\n",
" return {word}\n",
"\n",
" if not self._check_if_should_check(word):\n",
" return {word}\n",
"\n",
" # get edit distance 1...\n",
" res = [x for x in self.edit_distance_1(word)]\n",
" tmp = self.known(res)\n",
" if tmp:\n",
" return tmp\n",
" # if still not found, use the edit distance 1 to calc edit distance 2\n",
" if self._distance == 2:\n",
" tmp = self.known([x for x in self.__edit_distance_alt(res)])\n",
" if tmp:\n",
" return tmp\n",
" return {word}\n",
"\n",
" def known(self, words):\n",
" \"\"\" The subset of `words` that appear in the dictionary of words\n",
"\n",
" Args:\n",
" words (list): List of words to determine which are in the \\\n",
" corpus\n",
" Returns:\n",
" set: The set of those words from the input that are in the \\\n",
" corpus \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [w if self._case_sensitive else w.lower() for w in words]\n",
" return set(\n",
" w\n",
" for w in tmp\n",
" if w in self._word_frequency.dictionary and self._check_if_should_check(w)\n",
" )\n",
"\n",
" def unknown(self, words):\n",
" \"\"\" The subset of `words` that do not appear in the dictionary\n",
"\n",
" Args:\n",
" words (list): List of words to determine which are not in the \\\n",
" corpus\n",
" Returns:\n",
" set: The set of those words from the input that are not in \\\n",
" the corpus \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [\n",
" w if self._case_sensitive else w.lower()\n",
" for w in words\n",
" if self._check_if_should_check(w)\n",
" ]\n",
" return set(w for w in tmp if w not in self._word_frequency.dictionary)\n",
"\n",
" def edit_distance_1(self, word):\n",
" \"\"\" Compute all strings that are one edit away from `word` using only\n",
" the letters in the corpus\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance one from the \\\n",
" provided word \"\"\"\n",
" word = (\n",
" ensure_unicode(word).lower()\n",
" if not self._case_sensitive\n",
" else ensure_unicode(word)\n",
" )\n",
" if self._check_if_should_check(word) is False:\n",
" return {word}\n",
" letters = self._word_frequency.letters\n",
" splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]\n",
" deletes = [L + R[1:] for L, R in splits if R]\n",
" transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]\n",
" replaces = [L + c + R[1:] for L, R in splits if R for c in letters]\n",
" inserts = [L + c + R for L, R in splits for c in letters]\n",
" return set(deletes + transposes + replaces + inserts)\n",
"\n",
" def edit_distance_2(self, word):\n",
" \"\"\" Compute all strings that are two edits away from `word` using only\n",
" the letters in the corpus\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance two from the \\\n",
" provided word \"\"\"\n",
" word = (\n",
" ensure_unicode(word).lower()\n",
" if not self._case_sensitive\n",
" else ensure_unicode(word)\n",
" )\n",
" return [\n",
" e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)\n",
" ]\n",
"\n",
" def __edit_distance_alt(self, words):\n",
" \"\"\" Compute all strings that are 1 edits away from all the words using\n",
" only the letters in the corpus\n",
"\n",
" Args:\n",
" words (list): The words for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance two from the \\\n",
" provided words \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [\n",
" w if self._case_sensitive else w.lower()\n",
" for w in words\n",
" if self._check_if_should_check(w)\n",
" ]\n",
" return [e2 for e1 in tmp for e2 in self.known(self.edit_distance_1(e1))]\n",
"\n",
" def _check_if_should_check(self, word):\n",
" if len(word) == 1 and word in string.punctuation:\n",
" return False\n",
" if (\n",
" len(word) > self._word_frequency.longest_word_length + 3\n",
" ): # magic number to allow removal of up to 2 letters.\n",
" return False\n",
" try: # check if it is a number (int, float, etc)\n",
" float(word)\n",
" return False\n",
" except ValueError:\n",
" pass\n",
"\n",
" return True\n",
"\n",
"\n",
"class WordFrequency(object):\n",
" \"\"\" Store the `dictionary` as a word frequency list while allowing for\n",
" different methods to load the data and update over time \"\"\"\n",
"\n",
" __slots__ = [\n",
" \"_dictionary\",\n",
" \"_total_words\",\n",
" \"_unique_words\",\n",
" \"_letters\",\n",
" \"_tokenizer\",\n",
" \"_case_sensitive\",\n",
" \"_longest_word_length\",\n",
" ]\n",
"\n",
" def __init__(self, tokenizer=None, case_sensitive=False):\n",
" self._dictionary = Counter()\n",
" self._total_words = 0\n",
" self._unique_words = 0\n",
" self._letters = set()\n",
" self._case_sensitive = case_sensitive\n",
" self._longest_word_length = 0\n",
"\n",
" self._tokenizer = _parse_into_words\n",
" if tokenizer is not None:\n",
" self._tokenizer = tokenizer\n",
"\n",
" def __contains__(self, key):\n",
" \"\"\" turn on contains \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return key in self._dictionary\n",
"\n",
" def __getitem__(self, key):\n",
" \"\"\" turn on getitem \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return self._dictionary[key]\n",
"\n",
" def __iter__(self):\n",
" \"\"\" turn on iter support \"\"\"\n",
" for word in self._dictionary:\n",
" yield word\n",
"\n",
" def pop(self, key, default=None):\n",
" \"\"\" Remove the key and return the associated value or default if not\n",
" found\n",
"\n",
" Args:\n",
" key (str): The key to remove\n",
" default (obj): The value to return if key is not present \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return self._dictionary.pop(key, default)\n",
"\n",
" @property\n",
" def dictionary(self):\n",
" \"\"\" Counter: A counting dictionary of all words in the corpus and the \\\n",
" number of times each has been seen\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._dictionary\n",
"\n",
" @property\n",
" def total_words(self):\n",
" \"\"\" int: The sum of all word occurances in the word frequency \\\n",
" dictionary\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._total_words\n",
"\n",
" @property\n",
" def unique_words(self):\n",
" \"\"\" int: The total number of unique words in the word frequency list\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._unique_words\n",
"\n",
" @property\n",
" def letters(self):\n",
" \"\"\" str: The listing of all letters found within the corpus\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._letters\n",
"\n",
" @property\n",
" def longest_word_length(self):\n",
" \"\"\" int: The longest word length in the dictionary\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._longest_word_length\n",
"\n",
" def tokenize(self, text):\n",
" \"\"\" Tokenize the provided string object into individual words\n",
"\n",
" Args:\n",
" text (str): The string object to tokenize\n",
" Yields:\n",
" str: The next `word` in the tokenized string\n",
" Note:\n",
" This is the same as the `spellchecker.split_words()` unless \\\n",
" a tokenizer function was provided. \"\"\"\n",
" text = ensure_unicode(text)\n",
" for word in self._tokenizer(text):\n",
" yield word if self._case_sensitive else word.lower()\n",
"\n",
" def keys(self):\n",
" \"\"\" Iterator over the key of the dictionary\n",
"\n",
" Yields:\n",
" str: The next key in the dictionary\n",
" Note:\n",
" This is the same as `spellchecker.words()` \"\"\"\n",
" for key in self._dictionary.keys():\n",
" yield key\n",
"\n",
" def words(self):\n",
" \"\"\" Iterator over the words in the dictionary\n",
"\n",
" Yields:\n",
" str: The next word in the dictionary\n",
" Note:\n",
" This is the same as `spellchecker.keys()` \"\"\"\n",
" for word in self._dictionary.keys():\n",
" yield word\n",
"\n",
" def items(self):\n",
" \"\"\" Iterator over the words in the dictionary\n",
"\n",
" Yields:\n",
" str: The next word in the dictionary\n",
" int: The number of instances in the dictionary\n",
" Note:\n",
" This is the same as `dict.items()` \"\"\"\n",
" for word in self._dictionary.keys():\n",
" yield word, self._dictionary[word]\n",
"\n",
" def load_dictionary(self, filename, encoding=\"utf-8\"):\n",
" \"\"\" Load in a pre-built word frequency list\n",
"\n",
" Args:\n",
" filename (str): The filepath to the json (optionally gzipped) \\\n",
" file to be loaded\n",
" encoding (str): The encoding of the dictionary \"\"\"\n",
" with load_file(filename, encoding) as data:\n",
" data = data if self._case_sensitive else data.lower()\n",
" self._dictionary.update(json.loads(data))\n",
" self._update_dictionary()\n",
"\n",
" def load_json(self, data):\n",
" \"\"\" Load in a pre-built word frequency list\n",
"\n",
" Args:\n",
" data (dict): The dictionary to be loaded \"\"\"\n",
" self._dictionary.update(data)\n",
" self._update_dictionary()\n",
"\n",
" def load_text_file(self, filename, encoding=\"utf-8\", tokenizer=None):\n",
" \"\"\" Load in a text file from which to generate a word frequency list\n",
"\n",
" Args:\n",
" filename (str): The filepath to the text file to be loaded\n",
" encoding (str): The encoding of the text file\n",
" tokenizer (function): The function to use to tokenize a string\n",
" \"\"\"\n",
" with load_file(filename, encoding=encoding) as data:\n",
" self.load_text(data, tokenizer)\n",
"\n",
" def load_text(self, text, tokenizer=None):\n",
" \"\"\" Load text from which to generate a word frequency list\n",
"\n",
" Args:\n",
" text (str): The text to be loaded\n",
" tokenizer (function): The function to use to tokenize a string\n",
" \"\"\"\n",
" text = ensure_unicode(text)\n",
" if tokenizer:\n",
" words = [x if self._case_sensitive else x.lower() for x in tokenizer(text)]\n",
" else:\n",
" words = self.tokenize(text)\n",
"\n",
" self._dictionary.update(words)\n",
" self._update_dictionary()\n",
"\n",
" def load_words(self, words):\n",
" \"\"\" Load a list of words from which to generate a word frequency list\n",
"\n",
" Args:\n",
" words (list): The list of words to be loaded \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" self._dictionary.update(\n",
" [word if self._case_sensitive else word.lower() for word in words]\n",
" )\n",
" self._update_dictionary()\n",
"\n",
" def add(self, word):\n",
" \"\"\" Add a word to the word frequency list\n",
"\n",
" Args:\n",
" word (str): The word to add \"\"\"\n",
" word = ensure_unicode(word)\n",
" self.load_words([word])\n",
"\n",
" def remove_words(self, words):\n",
" \"\"\" Remove a list of words from the word frequency list\n",
"\n",
" Args:\n",
" words (list): The list of words to remove \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" for word in words:\n",
" self._dictionary.pop(word if self._case_sensitive else word.lower())\n",
" self._update_dictionary()\n",
"\n",
" def remove(self, word):\n",
" \"\"\" Remove a word from the word frequency list\n",
"\n",
" Args:\n",
" word (str): The word to remove \"\"\"\n",
" word = ensure_unicode(word)\n",
" self._dictionary.pop(word if self._case_sensitive else word.lower())\n",
" self._update_dictionary()\n",
"\n",
" def remove_by_threshold(self, threshold=5):\n",
" \"\"\" Remove all words at, or below, the provided threshold\n",
"\n",
" Args:\n",
" threshold (int): The threshold at which a word is to be \\\n",
" removed \"\"\"\n",
" keys = [x for x in self._dictionary.keys()]\n",
" for key in keys:\n",
" if self._dictionary[key] <= threshold:\n",
" self._dictionary.pop(key)\n",
" self._update_dictionary()\n",
"\n",
" def _update_dictionary(self):\n",
" \"\"\" Update the word frequency object \"\"\"\n",
" self._longest_word_length = 0\n",
" self._total_words = sum(self._dictionary.values())\n",
" self._unique_words = len(self._dictionary.keys())\n",
" self._letters = set()\n",
" for key in self._dictionary:\n",
" if len(key) > self._longest_word_length:\n",
" self._longest_word_length = len(key)\n",
" self._letters.update(key)\n",
" \n",
" \n",
"try:\n",
" with open(C1(),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"except:\n",
" pass\n"
]
},
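{
"cell_type": "markdown",
"metadata": {},
"source": [
"Side by side: with the same giant payload in the cell, the `Base` run comes in at about 160 µs per loop while the `C1` run takes about 7.1 ms. Assuming `Base.__fspath__` (defined earlier) skips the source inspection, the `inspect.getsource`-based check is adding roughly 7 ms per `open` call here, a ~45x slowdown.\n"
]
},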
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7.31 ms ± 164 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"\n",
"\"\"\" Additional utility functions \"\"\"\n",
"import contextlib\n",
"import gzip\n",
"import functools\n",
"import re\n",
"import warnings\n",
"\n",
"\n",
"def fail_after(version):\n",
" \"\"\" Decorator to add to tests to ensure that they fail if a deprecated\n",
" feature is not removed before the specified version\n",
"\n",
" Args:\n",
" version (str): The version to check against \"\"\"\n",
"\n",
" def decorator_wrapper(func):\n",
" @functools.wraps(func)\n",
" def test_inner(*args, **kwargs):\n",
" if [int(x) for x in version.split(\".\")] <= [\n",
" int(x) for x in __version__.split(\".\")\n",
" ]:\n",
" msg = \"The function {} must be fully removed as it is depricated and must be removed by version {}\".format(\n",
" func.__name__, version\n",
" )\n",
" raise AssertionError(msg)\n",
" return func(*args, **kwargs)\n",
"\n",
" return test_inner\n",
"\n",
" return decorator_wrapper\n",
"\n",
"\n",
"def deprecated(message=\"\"):\n",
" \"\"\" A simplistic decorator to mark functions as deprecated. The function\n",
" will pass a message to the user on the first use of the function\n",
"\n",
" Args:\n",
" message (str): The message to display if the function is deprecated\n",
" \"\"\"\n",
"\n",
" def decorator_wrapper(func):\n",
" @functools.wraps(func)\n",
" def function_wrapper(*args, **kwargs):\n",
" func_name = func.__name__\n",
" if func_name not in function_wrapper.deprecated_items:\n",
" msg = \"Function {} is now deprecated! {}\".format(func.__name__, message)\n",
" warnings.warn(msg, category=DeprecationWarning, stacklevel=2)\n",
" function_wrapper.deprecated_items.add(func_name)\n",
"\n",
" return func(*args, **kwargs)\n",
"\n",
" # set this up the first time the decorator is called\n",
" function_wrapper.deprecated_items = set()\n",
"\n",
" return function_wrapper\n",
"\n",
" return decorator_wrapper\n",
"\n",
"\n",
"def ensure_unicode(_str, encoding=\"utf-8\"):\n",
" \"\"\" Simplify checking if passed in data are bytes or a string and decode\n",
" bytes into unicode.\n",
"\n",
" Args:\n",
" _str (str): The input string (possibly bytes)\n",
" encoding (str): The encoding to use if input is bytes\n",
" Returns:\n",
" str: The encoded string\n",
" \"\"\"\n",
" if isinstance(_str, bytes):\n",
" return _str.decode(encoding)\n",
" return _str\n",
"\n",
"\n",
"@contextlib.contextmanager\n",
"def __gzip_read(filename, mode=\"rb\", encoding=\"UTF-8\"):\n",
" \"\"\" Context manager to correctly handle the decoding of the output of \\\n",
" the gzip file\n",
"\n",
" Args:\n",
" filename (str): The filename to open\n",
" mode (str): The mode to read the data\n",
" encoding (str): The file encoding to use\n",
" Yields:\n",
" str: The string data from the gzip file read\n",
" \"\"\"\n",
" with gzip.open(filename, mode=mode, encoding=encoding) as fobj:\n",
" yield fobj.read()\n",
"\n",
"\n",
"@contextlib.contextmanager\n",
"def load_file(filename, encoding):\n",
" \"\"\" Context manager to handle opening a gzip or text file correctly and\n",
" reading all the data\n",
"\n",
" Args:\n",
" filename (str): The filename to open\n",
" encoding (str): The file encoding to use\n",
" Yields:\n",
" str: The string data from the file read\n",
" \"\"\"\n",
" if filename[-3:].lower() == \".gz\":\n",
" with __gzip_read(filename, mode=\"rt\", encoding=encoding) as data:\n",
" yield data\n",
" else:\n",
" with open(filename, mode=\"r\", encoding=encoding) as fobj:\n",
" yield fobj.read()\n",
"\n",
"\n",
"def write_file(filepath, encoding, gzipped, data):\n",
" \"\"\" Write the data to file either as a gzip file or text based on the\n",
" gzipped parameter\n",
"\n",
" Args:\n",
" filepath (str): The filename to open\n",
" encoding (str): The file encoding to use\n",
" gzipped (bool): Whether the file should be gzipped or not\n",
" data (str): The data to be written out\n",
" \"\"\"\n",
" if gzipped:\n",
" with gzip.open(filepath, \"wt\") as fobj:\n",
" fobj.write(data)\n",
" else:\n",
" with open(filepath, \"w\", encoding=encoding) as fobj:\n",
" fobj.write(data)\n",
"\n",
"\n",
"def _parse_into_words(text):\n",
" \"\"\" Parse the text into words; currently removes punctuation except for\n",
" apostrophies.\n",
"\n",
" Args:\n",
" text (str): The text to split into words\n",
" \"\"\"\n",
" # see: https://stackoverflow.com/a/12705513\n",
" return re.findall(r\"(\\w[\\w']*\\w|\\w)\", text)\n",
"\n",
"\n",
"\"\"\" SpellChecker Module; simple, intuitive spell checker based on the post by\n",
" Peter Norvig. See: https://norvig.com/spell-correct.html \"\"\"\n",
"import gzip\n",
"import json\n",
"import pkgutil\n",
"import string\n",
"from collections import Counter\n",
"\n",
"\n",
"class SpellChecker(object):\n",
" \"\"\" The SpellChecker class encapsulates the basics needed to accomplish a\n",
" simple spell checking algorithm. It is based on the work by\n",
" Peter Norvig (https://norvig.com/spell-correct.html)\n",
"\n",
" Args:\n",
" language (str): The language of the dictionary to load or None \\\n",
" for no dictionary. Supported languages are `en`, `es`, `de`, `fr`, \\\n",
" `pt` and `ru`. Defaults to `en`. A list of languages may be \\\n",
" provided and all languages will be loaded.\n",
" local_dictionary (str): The path to a locally stored word \\\n",
" frequency dictionary; if provided, no language will be loaded\n",
" distance (int): The edit distance to use. Defaults to 2.\n",
" case_sensitive (bool): Flag to use a case sensitive dictionary or \\\n",
" not, only available when not using a language dictionary.\n",
" Note:\n",
" Using a case sensitive dictionary can be slow to correct words.\"\"\"\n",
"\n",
" __slots__ = [\"_distance\", \"_word_frequency\", \"_tokenizer\", \"_case_sensitive\"]\n",
"\n",
" def __init__(\n",
" self,\n",
" language=\"en\",\n",
" local_dictionary=None,\n",
" distance=2,\n",
" tokenizer=None,\n",
" case_sensitive=False,\n",
" ):\n",
" self._distance = None\n",
" self.distance = distance # use the setter value check\n",
"\n",
" self._tokenizer = _parse_into_words\n",
" if tokenizer is not None:\n",
" self._tokenizer = tokenizer\n",
"\n",
" self._case_sensitive = case_sensitive if not language else False\n",
" self._word_frequency = WordFrequency(self._tokenizer, self._case_sensitive)\n",
"\n",
" if local_dictionary:\n",
" self._word_frequency.load_dictionary(local_dictionary)\n",
" elif language:\n",
" if not isinstance(language, list):\n",
" language = [language]\n",
" for lang in language:\n",
" filename = \"resources/{}.json.gz\".format(lang.lower())\n",
" try:\n",
" json_open = pkgutil.get_data(\"spellchecker\", filename)\n",
" except FileNotFoundError:\n",
" msg = (\n",
" \"The provided dictionary language ({}) does not \" \"exist!\"\n",
" ).format(lang.lower())\n",
" raise ValueError(msg)\n",
"\n",
" lang_dict = json.loads(gzip.decompress(json_open).decode(\"utf-8\"))\n",
" self._word_frequency.load_json(lang_dict)\n",
"\n",
" def __contains__(self, key):\n",
" \"\"\" setup easier known checks \"\"\"\n",
" key = ensure_unicode(key)\n",
" return key in self._word_frequency\n",
"\n",
" def __getitem__(self, key):\n",
" \"\"\" setup easier frequency checks \"\"\"\n",
" key = ensure_unicode(key)\n",
" return self._word_frequency[key]\n",
"\n",
" def __iter__(self):\n",
" \"\"\" setup iter support \"\"\"\n",
" for word in self._word_frequency.dictionary:\n",
" yield word\n",
"\n",
" @property\n",
" def word_frequency(self):\n",
" \"\"\" WordFrequency: An encapsulation of the word frequency `dictionary`\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._word_frequency\n",
"\n",
" @property\n",
" def distance(self):\n",
" \"\"\" int: The maximum edit distance to calculate\n",
"\n",
" Note:\n",
" Valid values are 1 or 2; if an invalid value is passed, \\\n",
" defaults to 2 \"\"\"\n",
" return self._distance\n",
"\n",
" @distance.setter\n",
" def distance(self, val):\n",
" \"\"\" set the distance parameter \"\"\"\n",
" tmp = 2\n",
" try:\n",
" int(val)\n",
" if val > 0 and val <= 2:\n",
" tmp = val\n",
" except (ValueError, TypeError):\n",
" pass\n",
" self._distance = tmp\n",
"\n",
" def split_words(self, text):\n",
" \"\"\" Split text into individual `words` using either a simple whitespace\n",
" regex or the passed in tokenizer\n",
"\n",
" Args:\n",
" text (str): The text to split into individual words\n",
" Returns:\n",
" list(str): A listing of all words in the provided text \"\"\"\n",
" text = ensure_unicode(text)\n",
" return self._tokenizer(text)\n",
"\n",
" def export(self, filepath, encoding=\"utf-8\", gzipped=True):\n",
" \"\"\" Export the word frequency list for import in the future\n",
"\n",
" Args:\n",
" filepath (str): The filepath to the exported dictionary\n",
" encoding (str): The encoding of the resulting output\n",
" gzipped (bool): Whether to gzip the dictionary or not \"\"\"\n",
" data = json.dumps(self.word_frequency.dictionary, sort_keys=True)\n",
" write_file(filepath, encoding, gzipped, data)\n",
"\n",
" def word_usage_frequency(self, word, total_words=None):\n",
" \"\"\" Calculate the frequency to the `word` provided as seen across the\n",
" entire dictionary\n",
"\n",
" Args:\n",
" word (str): The word for which the word probability is \\\n",
" calculated\n",
" total_words (int): The total number of words to use in the \\\n",
" calculation; use the default for using the whole word \\\n",
" frequency\n",
" Returns:\n",
" float: The probability that the word is the correct word \"\"\"\n",
" if not total_words:\n",
" total_words = self._word_frequency.total_words\n",
" word = ensure_unicode(word)\n",
" return self._word_frequency.dictionary[word] / total_words\n",
"\n",
" @deprecated(\"Deprecated as of version 0.6.1; use word_usage_frequency instead\")\n",
" def word_probability(self, word, total_words=None):\n",
" \"\"\" Calculate the frequency to the `word` provided as seen across the\n",
" entire dictionary; function was a misnomar and is therefore\n",
" deprecated!\n",
"\n",
" Args:\n",
" word (str): The word for which the word probability is \\\n",
" calculated\n",
" total_words (int): The total number of words to use in the \\\n",
" calculation; use the default for using the whole word \\\n",
" frequency\n",
" Returns:\n",
" float: The probability that the word is the correct word\n",
" Note:\n",
" Deprecated as of version 0.6.1; use `word_usage_frequency` \\\n",
" instead\n",
" Note:\n",
" Will be removed in version 0.6.3 \"\"\"\n",
" return self.word_usage_frequency(word, total_words)\n",
"\n",
" def correction(self, word):\n",
" \"\"\" The most probable correct spelling for the word\n",
"\n",
" Args:\n",
" word (str): The word to correct\n",
" Returns:\n",
" str: The most likely candidate \"\"\"\n",
" word = ensure_unicode(word)\n",
" candidates = list(self.candidates(word))\n",
" return max(sorted(candidates), key=self.__getitem__)\n",
"\n",
" def candidates(self, word):\n",
" \"\"\" Generate possible spelling corrections for the provided word up to\n",
" an edit distance of two, if and only when needed\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate candidate spellings\n",
" Returns:\n",
" set: The set of words that are possible candidates \"\"\"\n",
" word = ensure_unicode(word)\n",
" if self.known([word]): # short-cut if word is correct already\n",
" return {word}\n",
"\n",
" if not self._check_if_should_check(word):\n",
" return {word}\n",
"\n",
" # get edit distance 1...\n",
" res = [x for x in self.edit_distance_1(word)]\n",
" tmp = self.known(res)\n",
" if tmp:\n",
" return tmp\n",
" # if still not found, use the edit distance 1 to calc edit distance 2\n",
" if self._distance == 2:\n",
" tmp = self.known([x for x in self.__edit_distance_alt(res)])\n",
" if tmp:\n",
" return tmp\n",
" return {word}\n",
"\n",
" def known(self, words):\n",
" \"\"\" The subset of `words` that appear in the dictionary of words\n",
"\n",
" Args:\n",
" words (list): List of words to determine which are in the \\\n",
" corpus\n",
" Returns:\n",
" set: The set of those words from the input that are in the \\\n",
" corpus \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [w if self._case_sensitive else w.lower() for w in words]\n",
" return set(\n",
" w\n",
" for w in tmp\n",
" if w in self._word_frequency.dictionary and self._check_if_should_check(w)\n",
" )\n",
"\n",
" def unknown(self, words):\n",
" \"\"\" The subset of `words` that do not appear in the dictionary\n",
"\n",
" Args:\n",
" words (list): List of words to determine which are not in the \\\n",
" corpus\n",
" Returns:\n",
" set: The set of those words from the input that are not in \\\n",
" the corpus \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [\n",
" w if self._case_sensitive else w.lower()\n",
" for w in words\n",
" if self._check_if_should_check(w)\n",
" ]\n",
" return set(w for w in tmp if w not in self._word_frequency.dictionary)\n",
"\n",
" def edit_distance_1(self, word):\n",
" \"\"\" Compute all strings that are one edit away from `word` using only\n",
" the letters in the corpus\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance one from the \\\n",
" provided word \"\"\"\n",
" word = (\n",
" ensure_unicode(word).lower()\n",
" if not self._case_sensitive\n",
" else ensure_unicode(word)\n",
" )\n",
" if self._check_if_should_check(word) is False:\n",
" return {word}\n",
" letters = self._word_frequency.letters\n",
" splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]\n",
" deletes = [L + R[1:] for L, R in splits if R]\n",
" transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]\n",
" replaces = [L + c + R[1:] for L, R in splits if R for c in letters]\n",
" inserts = [L + c + R for L, R in splits for c in letters]\n",
" return set(deletes + transposes + replaces + inserts)\n",
"\n",
" def edit_distance_2(self, word):\n",
" \"\"\" Compute all strings that are two edits away from `word` using only\n",
" the letters in the corpus\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance two from the \\\n",
" provided word \"\"\"\n",
" word = (\n",
" ensure_unicode(word).lower()\n",
" if not self._case_sensitive\n",
" else ensure_unicode(word)\n",
" )\n",
" return [\n",
" e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)\n",
" ]\n",
"\n",
" def __edit_distance_alt(self, words):\n",
" \"\"\" Compute all strings that are 1 edits away from all the words using\n",
" only the letters in the corpus\n",
"\n",
" Args:\n",
" words (list): The words for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance two from the \\\n",
" provided words \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [\n",
" w if self._case_sensitive else w.lower()\n",
" for w in words\n",
" if self._check_if_should_check(w)\n",
" ]\n",
" return [e2 for e1 in tmp for e2 in self.known(self.edit_distance_1(e1))]\n",
"\n",
" def _check_if_should_check(self, word):\n",
" if len(word) == 1 and word in string.punctuation:\n",
" return False\n",
" if (\n",
" len(word) > self._word_frequency.longest_word_length + 3\n",
" ): # magic number to allow removal of up to 2 letters.\n",
" return False\n",
" try: # check if it is a number (int, float, etc)\n",
" float(word)\n",
" return False\n",
" except ValueError:\n",
" pass\n",
"\n",
" return True\n",
"\n",
"\n",
"class WordFrequency(object):\n",
" \"\"\" Store the `dictionary` as a word frequency list while allowing for\n",
" different methods to load the data and update over time \"\"\"\n",
"\n",
" __slots__ = [\n",
" \"_dictionary\",\n",
" \"_total_words\",\n",
" \"_unique_words\",\n",
" \"_letters\",\n",
" \"_tokenizer\",\n",
" \"_case_sensitive\",\n",
" \"_longest_word_length\",\n",
" ]\n",
"\n",
" def __init__(self, tokenizer=None, case_sensitive=False):\n",
" self._dictionary = Counter()\n",
" self._total_words = 0\n",
" self._unique_words = 0\n",
" self._letters = set()\n",
" self._case_sensitive = case_sensitive\n",
" self._longest_word_length = 0\n",
"\n",
" self._tokenizer = _parse_into_words\n",
" if tokenizer is not None:\n",
" self._tokenizer = tokenizer\n",
"\n",
" def __contains__(self, key):\n",
" \"\"\" turn on contains \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return key in self._dictionary\n",
"\n",
" def __getitem__(self, key):\n",
" \"\"\" turn on getitem \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return self._dictionary[key]\n",
"\n",
" def __iter__(self):\n",
" \"\"\" turn on iter support \"\"\"\n",
" for word in self._dictionary:\n",
" yield word\n",
"\n",
" def pop(self, key, default=None):\n",
" \"\"\" Remove the key and return the associated value or default if not\n",
" found\n",
"\n",
" Args:\n",
" key (str): The key to remove\n",
" default (obj): The value to return if key is not present \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return self._dictionary.pop(key, default)\n",
"\n",
" @property\n",
" def dictionary(self):\n",
" \"\"\" Counter: A counting dictionary of all words in the corpus and the \\\n",
" number of times each has been seen\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._dictionary\n",
"\n",
" @property\n",
" def total_words(self):\n",
" \"\"\" int: The sum of all word occurances in the word frequency \\\n",
" dictionary\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._total_words\n",
"\n",
" @property\n",
" def unique_words(self):\n",
" \"\"\" int: The total number of unique words in the word frequency list\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._unique_words\n",
"\n",
" @property\n",
" def letters(self):\n",
" \"\"\" str: The listing of all letters found within the corpus\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._letters\n",
"\n",
" @property\n",
" def longest_word_length(self):\n",
" \"\"\" int: The longest word length in the dictionary\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._longest_word_length\n",
"\n",
" def tokenize(self, text):\n",
" \"\"\" Tokenize the provided string object into individual words\n",
"\n",
" Args:\n",
" text (str): The string object to tokenize\n",
" Yields:\n",
" str: The next `word` in the tokenized string\n",
" Note:\n",
" This is the same as the `spellchecker.split_words()` unless \\\n",
" a tokenizer function was provided. \"\"\"\n",
" text = ensure_unicode(text)\n",
" for word in self._tokenizer(text):\n",
" yield word if self._case_sensitive else word.lower()\n",
"\n",
" def keys(self):\n",
" \"\"\" Iterator over the key of the dictionary\n",
"\n",
" Yields:\n",
" str: The next key in the dictionary\n",
" Note:\n",
" This is the same as `spellchecker.words()` \"\"\"\n",
" for key in self._dictionary.keys():\n",
" yield key\n",
"\n",
" def words(self):\n",
" \"\"\" Iterator over the words in the dictionary\n",
"\n",
" Yields:\n",
" str: The next word in the dictionary\n",
" Note:\n",
" This is the same as `spellchecker.keys()` \"\"\"\n",
" for word in self._dictionary.keys():\n",
" yield word\n",
"\n",
" def items(self):\n",
" \"\"\" Iterator over the words in the dictionary\n",
"\n",
" Yields:\n",
" str: The next word in the dictionary\n",
" int: The number of instances in the dictionary\n",
" Note:\n",
" This is the same as `dict.items()` \"\"\"\n",
" for word in self._dictionary.keys():\n",
" yield word, self._dictionary[word]\n",
"\n",
" def load_dictionary(self, filename, encoding=\"utf-8\"):\n",
" \"\"\" Load in a pre-built word frequency list\n",
"\n",
" Args:\n",
" filename (str): The filepath to the json (optionally gzipped) \\\n",
" file to be loaded\n",
" encoding (str): The encoding of the dictionary \"\"\"\n",
" with load_file(filename, encoding) as data:\n",
" data = data if self._case_sensitive else data.lower()\n",
" self._dictionary.update(json.loads(data))\n",
" self._update_dictionary()\n",
"\n",
" def load_json(self, data):\n",
" \"\"\" Load in a pre-built word frequency list\n",
"\n",
" Args:\n",
" data (dict): The dictionary to be loaded \"\"\"\n",
" self._dictionary.update(data)\n",
" self._update_dictionary()\n",
"\n",
" def load_text_file(self, filename, encoding=\"utf-8\", tokenizer=None):\n",
" \"\"\" Load in a text file from which to generate a word frequency list\n",
"\n",
" Args:\n",
" filename (str): The filepath to the text file to be loaded\n",
" encoding (str): The encoding of the text file\n",
" tokenizer (function): The function to use to tokenize a string\n",
" \"\"\"\n",
" with load_file(filename, encoding=encoding) as data:\n",
" self.load_text(data, tokenizer)\n",
"\n",
" def load_text(self, text, tokenizer=None):\n",
" \"\"\" Load text from which to generate a word frequency list\n",
"\n",
" Args:\n",
" text (str): The text to be loaded\n",
" tokenizer (function): The function to use to tokenize a string\n",
" \"\"\"\n",
" text = ensure_unicode(text)\n",
" if tokenizer:\n",
" words = [x if self._case_sensitive else x.lower() for x in tokenizer(text)]\n",
" else:\n",
" words = self.tokenize(text)\n",
"\n",
" self._dictionary.update(words)\n",
" self._update_dictionary()\n",
"\n",
" def load_words(self, words):\n",
" \"\"\" Load a list of words from which to generate a word frequency list\n",
"\n",
" Args:\n",
" words (list): The list of words to be loaded \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" self._dictionary.update(\n",
" [word if self._case_sensitive else word.lower() for word in words]\n",
" )\n",
" self._update_dictionary()\n",
"\n",
" def add(self, word):\n",
" \"\"\" Add a word to the word frequency list\n",
"\n",
" Args:\n",
" word (str): The word to add \"\"\"\n",
" word = ensure_unicode(word)\n",
" self.load_words([word])\n",
"\n",
" def remove_words(self, words):\n",
" \"\"\" Remove a list of words from the word frequency list\n",
"\n",
" Args:\n",
" words (list): The list of words to remove \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" for word in words:\n",
" self._dictionary.pop(word if self._case_sensitive else word.lower())\n",
" self._update_dictionary()\n",
"\n",
" def remove(self, word):\n",
" \"\"\" Remove a word from the word frequency list\n",
"\n",
" Args:\n",
" word (str): The word to remove \"\"\"\n",
" word = ensure_unicode(word)\n",
" self._dictionary.pop(word if self._case_sensitive else word.lower())\n",
" self._update_dictionary()\n",
"\n",
" def remove_by_threshold(self, threshold=5):\n",
" \"\"\" Remove all words at, or below, the provided threshold\n",
"\n",
" Args:\n",
" threshold (int): The threshold at which a word is to be \\\n",
" removed \"\"\"\n",
" keys = [x for x in self._dictionary.keys()]\n",
" for key in keys:\n",
" if self._dictionary[key] <= threshold:\n",
" self._dictionary.pop(key)\n",
" self._update_dictionary()\n",
"\n",
" def _update_dictionary(self):\n",
" \"\"\" Update the word frequency object \"\"\"\n",
" self._longest_word_length = 0\n",
" self._total_words = sum(self._dictionary.values())\n",
" self._unique_words = len(self._dictionary.keys())\n",
" self._letters = set()\n",
" for key in self._dictionary:\n",
" if len(key) > self._longest_word_length:\n",
" self._longest_word_length = len(key)\n",
" self._letters.update(key)\n",
"\n",
"\n",
"try:\n",
" with open(C2(),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"except:\n",
" pass\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"936 µs ± 12.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"\n",
"\"\"\" Additional utility functions \"\"\"\n",
"import contextlib\n",
"import gzip\n",
"import functools\n",
"import re\n",
"import warnings\n",
"\n",
"\n",
"def fail_after(version):\n",
" \"\"\" Decorator to add to tests to ensure that they fail if a deprecated\n",
" feature is not removed before the specified version\n",
"\n",
" Args:\n",
" version (str): The version to check against \"\"\"\n",
"\n",
" def decorator_wrapper(func):\n",
" @functools.wraps(func)\n",
" def test_inner(*args, **kwargs):\n",
" if [int(x) for x in version.split(\".\")] <= [\n",
" int(x) for x in __version__.split(\".\")\n",
" ]:\n",
" msg = \"The function {} must be fully removed as it is depricated and must be removed by version {}\".format(\n",
" func.__name__, version\n",
" )\n",
" raise AssertionError(msg)\n",
" return func(*args, **kwargs)\n",
"\n",
" return test_inner\n",
"\n",
" return decorator_wrapper\n",
"\n",
"\n",
"def deprecated(message=\"\"):\n",
" \"\"\" A simplistic decorator to mark functions as deprecated. The function\n",
" will pass a message to the user on the first use of the function\n",
"\n",
" Args:\n",
" message (str): The message to display if the function is deprecated\n",
" \"\"\"\n",
"\n",
" def decorator_wrapper(func):\n",
" @functools.wraps(func)\n",
" def function_wrapper(*args, **kwargs):\n",
" func_name = func.__name__\n",
" if func_name not in function_wrapper.deprecated_items:\n",
" msg = \"Function {} is now deprecated! {}\".format(func.__name__, message)\n",
" warnings.warn(msg, category=DeprecationWarning, stacklevel=2)\n",
" function_wrapper.deprecated_items.add(func_name)\n",
"\n",
" return func(*args, **kwargs)\n",
"\n",
" # set this up the first time the decorator is called\n",
" function_wrapper.deprecated_items = set()\n",
"\n",
" return function_wrapper\n",
"\n",
" return decorator_wrapper\n",
"\n",
"\n",
"def ensure_unicode(_str, encoding=\"utf-8\"):\n",
" \"\"\" Simplify checking if passed in data are bytes or a string and decode\n",
" bytes into unicode.\n",
"\n",
" Args:\n",
" _str (str): The input string (possibly bytes)\n",
" encoding (str): The encoding to use if input is bytes\n",
" Returns:\n",
" str: The encoded string\n",
" \"\"\"\n",
" if isinstance(_str, bytes):\n",
" return _str.decode(encoding)\n",
" return _str\n",
"\n",
"\n",
"@contextlib.contextmanager\n",
"def __gzip_read(filename, mode=\"rb\", encoding=\"UTF-8\"):\n",
" \"\"\" Context manager to correctly handle the decoding of the output of \\\n",
" the gzip file\n",
"\n",
" Args:\n",
" filename (str): The filename to open\n",
" mode (str): The mode to read the data\n",
" encoding (str): The file encoding to use\n",
" Yields:\n",
" str: The string data from the gzip file read\n",
" \"\"\"\n",
" with gzip.open(filename, mode=mode, encoding=encoding) as fobj:\n",
" yield fobj.read()\n",
"\n",
"\n",
"@contextlib.contextmanager\n",
"def load_file(filename, encoding):\n",
" \"\"\" Context manager to handle opening a gzip or text file correctly and\n",
" reading all the data\n",
"\n",
" Args:\n",
" filename (str): The filename to open\n",
" encoding (str): The file encoding to use\n",
" Yields:\n",
" str: The string data from the file read\n",
" \"\"\"\n",
" if filename[-3:].lower() == \".gz\":\n",
" with __gzip_read(filename, mode=\"rt\", encoding=encoding) as data:\n",
" yield data\n",
" else:\n",
" with open(filename, mode=\"r\", encoding=encoding) as fobj:\n",
" yield fobj.read()\n",
"\n",
"\n",
"def write_file(filepath, encoding, gzipped, data):\n",
" \"\"\" Write the data to file either as a gzip file or text based on the\n",
" gzipped parameter\n",
"\n",
" Args:\n",
" filepath (str): The filename to open\n",
" encoding (str): The file encoding to use\n",
" gzipped (bool): Whether the file should be gzipped or not\n",
" data (str): The data to be written out\n",
" \"\"\"\n",
" if gzipped:\n",
" with gzip.open(filepath, \"wt\") as fobj:\n",
" fobj.write(data)\n",
" else:\n",
" with open(filepath, \"w\", encoding=encoding) as fobj:\n",
" fobj.write(data)\n",
"\n",
"\n",
"def _parse_into_words(text):\n",
" \"\"\" Parse the text into words; currently removes punctuation except for\n",
" apostrophies.\n",
"\n",
" Args:\n",
" text (str): The text to split into words\n",
" \"\"\"\n",
" # see: https://stackoverflow.com/a/12705513\n",
" return re.findall(r\"(\\w[\\w']*\\w|\\w)\", text)\n",
"\n",
"\n",
"\"\"\" SpellChecker Module; simple, intuitive spell checker based on the post by\n",
" Peter Norvig. See: https://norvig.com/spell-correct.html \"\"\"\n",
"import gzip\n",
"import json\n",
"import pkgutil\n",
"import string\n",
"from collections import Counter\n",
"\n",
"\n",
"class SpellChecker(object):\n",
" \"\"\" The SpellChecker class encapsulates the basics needed to accomplish a\n",
" simple spell checking algorithm. It is based on the work by\n",
" Peter Norvig (https://norvig.com/spell-correct.html)\n",
"\n",
" Args:\n",
" language (str): The language of the dictionary to load or None \\\n",
" for no dictionary. Supported languages are `en`, `es`, `de`, `fr`, \\\n",
" `pt` and `ru`. Defaults to `en`. A list of languages may be \\\n",
" provided and all languages will be loaded.\n",
" local_dictionary (str): The path to a locally stored word \\\n",
" frequency dictionary; if provided, no language will be loaded\n",
" distance (int): The edit distance to use. Defaults to 2.\n",
" case_sensitive (bool): Flag to use a case sensitive dictionary or \\\n",
" not, only available when not using a language dictionary.\n",
" Note:\n",
" Using a case sensitive dictionary can be slow to correct words.\"\"\"\n",
"\n",
" __slots__ = [\"_distance\", \"_word_frequency\", \"_tokenizer\", \"_case_sensitive\"]\n",
"\n",
" def __init__(\n",
" self,\n",
" language=\"en\",\n",
" local_dictionary=None,\n",
" distance=2,\n",
" tokenizer=None,\n",
" case_sensitive=False,\n",
" ):\n",
" self._distance = None\n",
" self.distance = distance # use the setter value check\n",
"\n",
" self._tokenizer = _parse_into_words\n",
" if tokenizer is not None:\n",
" self._tokenizer = tokenizer\n",
"\n",
" self._case_sensitive = case_sensitive if not language else False\n",
" self._word_frequency = WordFrequency(self._tokenizer, self._case_sensitive)\n",
"\n",
" if local_dictionary:\n",
" self._word_frequency.load_dictionary(local_dictionary)\n",
" elif language:\n",
" if not isinstance(language, list):\n",
" language = [language]\n",
" for lang in language:\n",
" filename = \"resources/{}.json.gz\".format(lang.lower())\n",
" try:\n",
" json_open = pkgutil.get_data(\"spellchecker\", filename)\n",
" except FileNotFoundError:\n",
" msg = (\n",
" \"The provided dictionary language ({}) does not \" \"exist!\"\n",
" ).format(lang.lower())\n",
" raise ValueError(msg)\n",
"\n",
" lang_dict = json.loads(gzip.decompress(json_open).decode(\"utf-8\"))\n",
" self._word_frequency.load_json(lang_dict)\n",
"\n",
" def __contains__(self, key):\n",
" \"\"\" setup easier known checks \"\"\"\n",
" key = ensure_unicode(key)\n",
" return key in self._word_frequency\n",
"\n",
" def __getitem__(self, key):\n",
" \"\"\" setup easier frequency checks \"\"\"\n",
" key = ensure_unicode(key)\n",
" return self._word_frequency[key]\n",
"\n",
" def __iter__(self):\n",
" \"\"\" setup iter support \"\"\"\n",
" for word in self._word_frequency.dictionary:\n",
" yield word\n",
"\n",
" @property\n",
" def word_frequency(self):\n",
" \"\"\" WordFrequency: An encapsulation of the word frequency `dictionary`\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._word_frequency\n",
"\n",
" @property\n",
" def distance(self):\n",
" \"\"\" int: The maximum edit distance to calculate\n",
"\n",
" Note:\n",
" Valid values are 1 or 2; if an invalid value is passed, \\\n",
" defaults to 2 \"\"\"\n",
" return self._distance\n",
"\n",
" @distance.setter\n",
" def distance(self, val):\n",
" \"\"\" set the distance parameter \"\"\"\n",
" tmp = 2\n",
" try:\n",
" int(val)\n",
" if val > 0 and val <= 2:\n",
" tmp = val\n",
" except (ValueError, TypeError):\n",
" pass\n",
" self._distance = tmp\n",
"\n",
" def split_words(self, text):\n",
" \"\"\" Split text into individual `words` using either a simple whitespace\n",
" regex or the passed in tokenizer\n",
"\n",
" Args:\n",
" text (str): The text to split into individual words\n",
" Returns:\n",
" list(str): A listing of all words in the provided text \"\"\"\n",
" text = ensure_unicode(text)\n",
" return self._tokenizer(text)\n",
"\n",
" def export(self, filepath, encoding=\"utf-8\", gzipped=True):\n",
" \"\"\" Export the word frequency list for import in the future\n",
"\n",
" Args:\n",
" filepath (str): The filepath to the exported dictionary\n",
" encoding (str): The encoding of the resulting output\n",
" gzipped (bool): Whether to gzip the dictionary or not \"\"\"\n",
" data = json.dumps(self.word_frequency.dictionary, sort_keys=True)\n",
" write_file(filepath, encoding, gzipped, data)\n",
"\n",
" def word_usage_frequency(self, word, total_words=None):\n",
" \"\"\" Calculate the frequency to the `word` provided as seen across the\n",
" entire dictionary\n",
"\n",
" Args:\n",
" word (str): The word for which the word probability is \\\n",
" calculated\n",
" total_words (int): The total number of words to use in the \\\n",
" calculation; use the default for using the whole word \\\n",
" frequency\n",
" Returns:\n",
" float: The probability that the word is the correct word \"\"\"\n",
" if not total_words:\n",
" total_words = self._word_frequency.total_words\n",
" word = ensure_unicode(word)\n",
" return self._word_frequency.dictionary[word] / total_words\n",
"\n",
" @deprecated(\"Deprecated as of version 0.6.1; use word_usage_frequency instead\")\n",
" def word_probability(self, word, total_words=None):\n",
" \"\"\" Calculate the frequency to the `word` provided as seen across the\n",
" entire dictionary; function was a misnomar and is therefore\n",
" deprecated!\n",
"\n",
" Args:\n",
" word (str): The word for which the word probability is \\\n",
" calculated\n",
" total_words (int): The total number of words to use in the \\\n",
" calculation; use the default for using the whole word \\\n",
" frequency\n",
" Returns:\n",
" float: The probability that the word is the correct word\n",
" Note:\n",
" Deprecated as of version 0.6.1; use `word_usage_frequency` \\\n",
" instead\n",
" Note:\n",
" Will be removed in version 0.6.3 \"\"\"\n",
" return self.word_usage_frequency(word, total_words)\n",
"\n",
" def correction(self, word):\n",
" \"\"\" The most probable correct spelling for the word\n",
"\n",
" Args:\n",
" word (str): The word to correct\n",
" Returns:\n",
" str: The most likely candidate \"\"\"\n",
" word = ensure_unicode(word)\n",
" candidates = list(self.candidates(word))\n",
" return max(sorted(candidates), key=self.__getitem__)\n",
"\n",
" def candidates(self, word):\n",
" \"\"\" Generate possible spelling corrections for the provided word up to\n",
" an edit distance of two, if and only when needed\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate candidate spellings\n",
" Returns:\n",
" set: The set of words that are possible candidates \"\"\"\n",
" word = ensure_unicode(word)\n",
" if self.known([word]): # short-cut if word is correct already\n",
" return {word}\n",
"\n",
" if not self._check_if_should_check(word):\n",
" return {word}\n",
"\n",
" # get edit distance 1...\n",
" res = [x for x in self.edit_distance_1(word)]\n",
" tmp = self.known(res)\n",
" if tmp:\n",
" return tmp\n",
" # if still not found, use the edit distance 1 to calc edit distance 2\n",
" if self._distance == 2:\n",
" tmp = self.known([x for x in self.__edit_distance_alt(res)])\n",
" if tmp:\n",
" return tmp\n",
" return {word}\n",
"\n",
" def known(self, words):\n",
" \"\"\" The subset of `words` that appear in the dictionary of words\n",
"\n",
" Args:\n",
" words (list): List of words to determine which are in the \\\n",
" corpus\n",
" Returns:\n",
" set: The set of those words from the input that are in the \\\n",
" corpus \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [w if self._case_sensitive else w.lower() for w in words]\n",
" return set(\n",
" w\n",
" for w in tmp\n",
" if w in self._word_frequency.dictionary and self._check_if_should_check(w)\n",
" )\n",
"\n",
" def unknown(self, words):\n",
" \"\"\" The subset of `words` that do not appear in the dictionary\n",
"\n",
" Args:\n",
" words (list): List of words to determine which are not in the \\\n",
" corpus\n",
" Returns:\n",
" set: The set of those words from the input that are not in \\\n",
" the corpus \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [\n",
" w if self._case_sensitive else w.lower()\n",
" for w in words\n",
" if self._check_if_should_check(w)\n",
" ]\n",
" return set(w for w in tmp if w not in self._word_frequency.dictionary)\n",
"\n",
" def edit_distance_1(self, word):\n",
" \"\"\" Compute all strings that are one edit away from `word` using only\n",
" the letters in the corpus\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance one from the \\\n",
" provided word \"\"\"\n",
" word = (\n",
" ensure_unicode(word).lower()\n",
" if not self._case_sensitive\n",
" else ensure_unicode(word)\n",
" )\n",
" if self._check_if_should_check(word) is False:\n",
" return {word}\n",
" letters = self._word_frequency.letters\n",
" splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]\n",
" deletes = [L + R[1:] for L, R in splits if R]\n",
" transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]\n",
" replaces = [L + c + R[1:] for L, R in splits if R for c in letters]\n",
" inserts = [L + c + R for L, R in splits for c in letters]\n",
" return set(deletes + transposes + replaces + inserts)\n",
"\n",
" def edit_distance_2(self, word):\n",
" \"\"\" Compute all strings that are two edits away from `word` using only\n",
" the letters in the corpus\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance two from the \\\n",
" provided word \"\"\"\n",
" word = (\n",
" ensure_unicode(word).lower()\n",
" if not self._case_sensitive\n",
" else ensure_unicode(word)\n",
" )\n",
" return [\n",
" e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)\n",
" ]\n",
"\n",
" def __edit_distance_alt(self, words):\n",
" \"\"\" Compute all strings that are 1 edits away from all the words using\n",
" only the letters in the corpus\n",
"\n",
" Args:\n",
" words (list): The words for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance two from the \\\n",
" provided words \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [\n",
" w if self._case_sensitive else w.lower()\n",
" for w in words\n",
" if self._check_if_should_check(w)\n",
" ]\n",
" return [e2 for e1 in tmp for e2 in self.known(self.edit_distance_1(e1))]\n",
"\n",
" def _check_if_should_check(self, word):\n",
" if len(word) == 1 and word in string.punctuation:\n",
" return False\n",
" if (\n",
" len(word) > self._word_frequency.longest_word_length + 3\n",
" ): # magic number to allow removal of up to 2 letters.\n",
" return False\n",
" try: # check if it is a number (int, float, etc)\n",
" float(word)\n",
" return False\n",
" except ValueError:\n",
" pass\n",
"\n",
" return True\n",
"\n",
"\n",
"class WordFrequency(object):\n",
" \"\"\" Store the `dictionary` as a word frequency list while allowing for\n",
" different methods to load the data and update over time \"\"\"\n",
"\n",
" __slots__ = [\n",
" \"_dictionary\",\n",
" \"_total_words\",\n",
" \"_unique_words\",\n",
" \"_letters\",\n",
" \"_tokenizer\",\n",
" \"_case_sensitive\",\n",
" \"_longest_word_length\",\n",
" ]\n",
"\n",
" def __init__(self, tokenizer=None, case_sensitive=False):\n",
" self._dictionary = Counter()\n",
" self._total_words = 0\n",
" self._unique_words = 0\n",
" self._letters = set()\n",
" self._case_sensitive = case_sensitive\n",
" self._longest_word_length = 0\n",
"\n",
" self._tokenizer = _parse_into_words\n",
" if tokenizer is not None:\n",
" self._tokenizer = tokenizer\n",
"\n",
" def __contains__(self, key):\n",
" \"\"\" turn on contains \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return key in self._dictionary\n",
"\n",
" def __getitem__(self, key):\n",
" \"\"\" turn on getitem \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return self._dictionary[key]\n",
"\n",
" def __iter__(self):\n",
" \"\"\" turn on iter support \"\"\"\n",
" for word in self._dictionary:\n",
" yield word\n",
"\n",
" def pop(self, key, default=None):\n",
" \"\"\" Remove the key and return the associated value or default if not\n",
" found\n",
"\n",
" Args:\n",
" key (str): The key to remove\n",
" default (obj): The value to return if key is not present \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return self._dictionary.pop(key, default)\n",
"\n",
" @property\n",
" def dictionary(self):\n",
" \"\"\" Counter: A counting dictionary of all words in the corpus and the \\\n",
" number of times each has been seen\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._dictionary\n",
"\n",
" @property\n",
" def total_words(self):\n",
" \"\"\" int: The sum of all word occurances in the word frequency \\\n",
" dictionary\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._total_words\n",
"\n",
" @property\n",
" def unique_words(self):\n",
" \"\"\" int: The total number of unique words in the word frequency list\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._unique_words\n",
"\n",
" @property\n",
" def letters(self):\n",
" \"\"\" str: The listing of all letters found within the corpus\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._letters\n",
"\n",
" @property\n",
" def longest_word_length(self):\n",
" \"\"\" int: The longest word length in the dictionary\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._longest_word_length\n",
"\n",
" def tokenize(self, text):\n",
" \"\"\" Tokenize the provided string object into individual words\n",
"\n",
" Args:\n",
" text (str): The string object to tokenize\n",
" Yields:\n",
" str: The next `word` in the tokenized string\n",
" Note:\n",
" This is the same as the `spellchecker.split_words()` unless \\\n",
" a tokenizer function was provided. \"\"\"\n",
" text = ensure_unicode(text)\n",
" for word in self._tokenizer(text):\n",
" yield word if self._case_sensitive else word.lower()\n",
"\n",
" def keys(self):\n",
" \"\"\" Iterator over the key of the dictionary\n",
"\n",
" Yields:\n",
" str: The next key in the dictionary\n",
" Note:\n",
" This is the same as `spellchecker.words()` \"\"\"\n",
" for key in self._dictionary.keys():\n",
" yield key\n",
"\n",
" def words(self):\n",
" \"\"\" Iterator over the words in the dictionary\n",
"\n",
" Yields:\n",
" str: The next word in the dictionary\n",
" Note:\n",
" This is the same as `spellchecker.keys()` \"\"\"\n",
" for word in self._dictionary.keys():\n",
" yield word\n",
"\n",
" def items(self):\n",
" \"\"\" Iterator over the words in the dictionary\n",
"\n",
" Yields:\n",
" str: The next word in the dictionary\n",
" int: The number of instances in the dictionary\n",
" Note:\n",
" This is the same as `dict.items()` \"\"\"\n",
" for word in self._dictionary.keys():\n",
" yield word, self._dictionary[word]\n",
"\n",
" def load_dictionary(self, filename, encoding=\"utf-8\"):\n",
" \"\"\" Load in a pre-built word frequency list\n",
"\n",
" Args:\n",
" filename (str): The filepath to the json (optionally gzipped) \\\n",
" file to be loaded\n",
" encoding (str): The encoding of the dictionary \"\"\"\n",
" with load_file(filename, encoding) as data:\n",
" data = data if self._case_sensitive else data.lower()\n",
" self._dictionary.update(json.loads(data))\n",
" self._update_dictionary()\n",
"\n",
" def load_json(self, data):\n",
" \"\"\" Load in a pre-built word frequency list\n",
"\n",
" Args:\n",
" data (dict): The dictionary to be loaded \"\"\"\n",
" self._dictionary.update(data)\n",
" self._update_dictionary()\n",
"\n",
" def load_text_file(self, filename, encoding=\"utf-8\", tokenizer=None):\n",
" \"\"\" Load in a text file from which to generate a word frequency list\n",
"\n",
" Args:\n",
" filename (str): The filepath to the text file to be loaded\n",
" encoding (str): The encoding of the text file\n",
" tokenizer (function): The function to use to tokenize a string\n",
" \"\"\"\n",
" with load_file(filename, encoding=encoding) as data:\n",
" self.load_text(data, tokenizer)\n",
"\n",
" def load_text(self, text, tokenizer=None):\n",
" \"\"\" Load text from which to generate a word frequency list\n",
"\n",
" Args:\n",
" text (str): The text to be loaded\n",
" tokenizer (function): The function to use to tokenize a string\n",
" \"\"\"\n",
" text = ensure_unicode(text)\n",
" if tokenizer:\n",
" words = [x if self._case_sensitive else x.lower() for x in tokenizer(text)]\n",
" else:\n",
" words = self.tokenize(text)\n",
"\n",
" self._dictionary.update(words)\n",
" self._update_dictionary()\n",
"\n",
" def load_words(self, words):\n",
" \"\"\" Load a list of words from which to generate a word frequency list\n",
"\n",
" Args:\n",
" words (list): The list of words to be loaded \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" self._dictionary.update(\n",
" [word if self._case_sensitive else word.lower() for word in words]\n",
" )\n",
" self._update_dictionary()\n",
"\n",
" def add(self, word):\n",
" \"\"\" Add a word to the word frequency list\n",
"\n",
" Args:\n",
" word (str): The word to add \"\"\"\n",
" word = ensure_unicode(word)\n",
" self.load_words([word])\n",
"\n",
" def remove_words(self, words):\n",
" \"\"\" Remove a list of words from the word frequency list\n",
"\n",
" Args:\n",
" words (list): The list of words to remove \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" for word in words:\n",
" self._dictionary.pop(word if self._case_sensitive else word.lower())\n",
" self._update_dictionary()\n",
"\n",
" def remove(self, word):\n",
" \"\"\" Remove a word from the word frequency list\n",
"\n",
" Args:\n",
" word (str): The word to remove \"\"\"\n",
" word = ensure_unicode(word)\n",
" self._dictionary.pop(word if self._case_sensitive else word.lower())\n",
" self._update_dictionary()\n",
"\n",
" def remove_by_threshold(self, threshold=5):\n",
" \"\"\" Remove all words at, or below, the provided threshold\n",
"\n",
" Args:\n",
" threshold (int): The threshold at which a word is to be \\\n",
" removed \"\"\"\n",
" keys = [x for x in self._dictionary.keys()]\n",
" for key in keys:\n",
" if self._dictionary[key] <= threshold:\n",
" self._dictionary.pop(key)\n",
" self._update_dictionary()\n",
"\n",
" def _update_dictionary(self):\n",
" \"\"\" Update the word frequency object \"\"\"\n",
" self._longest_word_length = 0\n",
" self._total_words = sum(self._dictionary.values())\n",
" self._unique_words = len(self._dictionary.keys())\n",
" self._letters = set()\n",
" for key in self._dictionary:\n",
" if len(key) > self._longest_word_length:\n",
" self._longest_word_length = len(key)\n",
" self._letters.update(key)\n",
"\n",
"\n",
"try:\n",
" with open(C2(method=\"currentframe\"),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"except:\n",
" pass\n"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"924 µs ± 4.18 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"\n",
"\n",
"\"\"\" Additional utility functions \"\"\"\n",
"import contextlib\n",
"import gzip\n",
"import functools\n",
"import re\n",
"import warnings\n",
"\n",
"\n",
"def fail_after(version):\n",
" \"\"\" Decorator to add to tests to ensure that they fail if a deprecated\n",
" feature is not removed before the specified version\n",
"\n",
" Args:\n",
" version (str): The version to check against \"\"\"\n",
"\n",
" def decorator_wrapper(func):\n",
" @functools.wraps(func)\n",
" def test_inner(*args, **kwargs):\n",
" if [int(x) for x in version.split(\".\")] <= [\n",
" int(x) for x in __version__.split(\".\")\n",
" ]:\n",
" msg = \"The function {} must be fully removed as it is depricated and must be removed by version {}\".format(\n",
" func.__name__, version\n",
" )\n",
" raise AssertionError(msg)\n",
" return func(*args, **kwargs)\n",
"\n",
" return test_inner\n",
"\n",
" return decorator_wrapper\n",
"\n",
"\n",
"def deprecated(message=\"\"):\n",
" \"\"\" A simplistic decorator to mark functions as deprecated. The function\n",
" will pass a message to the user on the first use of the function\n",
"\n",
" Args:\n",
" message (str): The message to display if the function is deprecated\n",
" \"\"\"\n",
"\n",
" def decorator_wrapper(func):\n",
" @functools.wraps(func)\n",
" def function_wrapper(*args, **kwargs):\n",
" func_name = func.__name__\n",
" if func_name not in function_wrapper.deprecated_items:\n",
" msg = \"Function {} is now deprecated! {}\".format(func.__name__, message)\n",
" warnings.warn(msg, category=DeprecationWarning, stacklevel=2)\n",
" function_wrapper.deprecated_items.add(func_name)\n",
"\n",
" return func(*args, **kwargs)\n",
"\n",
" # set this up the first time the decorator is called\n",
" function_wrapper.deprecated_items = set()\n",
"\n",
" return function_wrapper\n",
"\n",
" return decorator_wrapper\n",
"\n",
"\n",
"def ensure_unicode(_str, encoding=\"utf-8\"):\n",
" \"\"\" Simplify checking if passed in data are bytes or a string and decode\n",
" bytes into unicode.\n",
"\n",
" Args:\n",
" _str (str): The input string (possibly bytes)\n",
" encoding (str): The encoding to use if input is bytes\n",
" Returns:\n",
" str: The encoded string\n",
" \"\"\"\n",
" if isinstance(_str, bytes):\n",
" return _str.decode(encoding)\n",
" return _str\n",
"\n",
"\n",
"@contextlib.contextmanager\n",
"def __gzip_read(filename, mode=\"rb\", encoding=\"UTF-8\"):\n",
" \"\"\" Context manager to correctly handle the decoding of the output of \\\n",
" the gzip file\n",
"\n",
" Args:\n",
" filename (str): The filename to open\n",
" mode (str): The mode to read the data\n",
" encoding (str): The file encoding to use\n",
" Yields:\n",
" str: The string data from the gzip file read\n",
" \"\"\"\n",
" with gzip.open(filename, mode=mode, encoding=encoding) as fobj:\n",
" yield fobj.read()\n",
"\n",
"\n",
"@contextlib.contextmanager\n",
"def load_file(filename, encoding):\n",
" \"\"\" Context manager to handle opening a gzip or text file correctly and\n",
" reading all the data\n",
"\n",
" Args:\n",
" filename (str): The filename to open\n",
" encoding (str): The file encoding to use\n",
" Yields:\n",
" str: The string data from the file read\n",
" \"\"\"\n",
" if filename[-3:].lower() == \".gz\":\n",
" with __gzip_read(filename, mode=\"rt\", encoding=encoding) as data:\n",
" yield data\n",
" else:\n",
" with open(filename, mode=\"r\", encoding=encoding) as fobj:\n",
" yield fobj.read()\n",
"\n",
"\n",
"def write_file(filepath, encoding, gzipped, data):\n",
" \"\"\" Write the data to file either as a gzip file or text based on the\n",
" gzipped parameter\n",
"\n",
" Args:\n",
" filepath (str): The filename to open\n",
" encoding (str): The file encoding to use\n",
" gzipped (bool): Whether the file should be gzipped or not\n",
" data (str): The data to be written out\n",
" \"\"\"\n",
" if gzipped:\n",
" with gzip.open(filepath, \"wt\") as fobj:\n",
" fobj.write(data)\n",
" else:\n",
" with open(filepath, \"w\", encoding=encoding) as fobj:\n",
" fobj.write(data)\n",
"\n",
"\n",
"def _parse_into_words(text):\n",
" \"\"\" Parse the text into words; currently removes punctuation except for\n",
" apostrophies.\n",
"\n",
" Args:\n",
" text (str): The text to split into words\n",
" \"\"\"\n",
" # see: https://stackoverflow.com/a/12705513\n",
" return re.findall(r\"(\\w[\\w']*\\w|\\w)\", text)\n",
"\n",
"\n",
"\"\"\" SpellChecker Module; simple, intuitive spell checker based on the post by\n",
" Peter Norvig. See: https://norvig.com/spell-correct.html \"\"\"\n",
"import gzip\n",
"import json\n",
"import pkgutil\n",
"import string\n",
"from collections import Counter\n",
"\n",
"\n",
"class SpellChecker(object):\n",
" \"\"\" The SpellChecker class encapsulates the basics needed to accomplish a\n",
" simple spell checking algorithm. It is based on the work by\n",
" Peter Norvig (https://norvig.com/spell-correct.html)\n",
"\n",
" Args:\n",
" language (str): The language of the dictionary to load or None \\\n",
" for no dictionary. Supported languages are `en`, `es`, `de`, `fr`, \\\n",
" `pt` and `ru`. Defaults to `en`. A list of languages may be \\\n",
" provided and all languages will be loaded.\n",
" local_dictionary (str): The path to a locally stored word \\\n",
" frequency dictionary; if provided, no language will be loaded\n",
" distance (int): The edit distance to use. Defaults to 2.\n",
" case_sensitive (bool): Flag to use a case sensitive dictionary or \\\n",
" not, only available when not using a language dictionary.\n",
" Note:\n",
" Using a case sensitive dictionary can be slow to correct words.\"\"\"\n",
"\n",
" __slots__ = [\"_distance\", \"_word_frequency\", \"_tokenizer\", \"_case_sensitive\"]\n",
"\n",
" def __init__(\n",
" self,\n",
" language=\"en\",\n",
" local_dictionary=None,\n",
" distance=2,\n",
" tokenizer=None,\n",
" case_sensitive=False,\n",
" ):\n",
" self._distance = None\n",
" self.distance = distance # use the setter value check\n",
"\n",
" self._tokenizer = _parse_into_words\n",
" if tokenizer is not None:\n",
" self._tokenizer = tokenizer\n",
"\n",
" self._case_sensitive = case_sensitive if not language else False\n",
" self._word_frequency = WordFrequency(self._tokenizer, self._case_sensitive)\n",
"\n",
" if local_dictionary:\n",
" self._word_frequency.load_dictionary(local_dictionary)\n",
" elif language:\n",
" if not isinstance(language, list):\n",
" language = [language]\n",
" for lang in language:\n",
" filename = \"resources/{}.json.gz\".format(lang.lower())\n",
" try:\n",
" json_open = pkgutil.get_data(\"spellchecker\", filename)\n",
" except FileNotFoundError:\n",
" msg = (\n",
" \"The provided dictionary language ({}) does not \" \"exist!\"\n",
" ).format(lang.lower())\n",
" raise ValueError(msg)\n",
"\n",
" lang_dict = json.loads(gzip.decompress(json_open).decode(\"utf-8\"))\n",
" self._word_frequency.load_json(lang_dict)\n",
"\n",
" def __contains__(self, key):\n",
" \"\"\" setup easier known checks \"\"\"\n",
" key = ensure_unicode(key)\n",
" return key in self._word_frequency\n",
"\n",
" def __getitem__(self, key):\n",
" \"\"\" setup easier frequency checks \"\"\"\n",
" key = ensure_unicode(key)\n",
" return self._word_frequency[key]\n",
"\n",
" def __iter__(self):\n",
" \"\"\" setup iter support \"\"\"\n",
" for word in self._word_frequency.dictionary:\n",
" yield word\n",
"\n",
" @property\n",
" def word_frequency(self):\n",
" \"\"\" WordFrequency: An encapsulation of the word frequency `dictionary`\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._word_frequency\n",
"\n",
" @property\n",
" def distance(self):\n",
" \"\"\" int: The maximum edit distance to calculate\n",
"\n",
" Note:\n",
" Valid values are 1 or 2; if an invalid value is passed, \\\n",
" defaults to 2 \"\"\"\n",
" return self._distance\n",
"\n",
" @distance.setter\n",
" def distance(self, val):\n",
" \"\"\" set the distance parameter \"\"\"\n",
" tmp = 2\n",
" try:\n",
" int(val)\n",
" if val > 0 and val <= 2:\n",
" tmp = val\n",
" except (ValueError, TypeError):\n",
" pass\n",
" self._distance = tmp\n",
"\n",
" def split_words(self, text):\n",
" \"\"\" Split text into individual `words` using either a simple whitespace\n",
" regex or the passed in tokenizer\n",
"\n",
" Args:\n",
" text (str): The text to split into individual words\n",
" Returns:\n",
" list(str): A listing of all words in the provided text \"\"\"\n",
" text = ensure_unicode(text)\n",
" return self._tokenizer(text)\n",
"\n",
" def export(self, filepath, encoding=\"utf-8\", gzipped=True):\n",
" \"\"\" Export the word frequency list for import in the future\n",
"\n",
" Args:\n",
" filepath (str): The filepath to the exported dictionary\n",
" encoding (str): The encoding of the resulting output\n",
" gzipped (bool): Whether to gzip the dictionary or not \"\"\"\n",
" data = json.dumps(self.word_frequency.dictionary, sort_keys=True)\n",
" write_file(filepath, encoding, gzipped, data)\n",
"\n",
" def word_usage_frequency(self, word, total_words=None):\n",
" \"\"\" Calculate the frequency to the `word` provided as seen across the\n",
" entire dictionary\n",
"\n",
" Args:\n",
" word (str): The word for which the word probability is \\\n",
" calculated\n",
" total_words (int): The total number of words to use in the \\\n",
" calculation; use the default for using the whole word \\\n",
" frequency\n",
" Returns:\n",
" float: The probability that the word is the correct word \"\"\"\n",
" if not total_words:\n",
" total_words = self._word_frequency.total_words\n",
" word = ensure_unicode(word)\n",
" return self._word_frequency.dictionary[word] / total_words\n",
"\n",
" @deprecated(\"Deprecated as of version 0.6.1; use word_usage_frequency instead\")\n",
" def word_probability(self, word, total_words=None):\n",
" \"\"\" Calculate the frequency to the `word` provided as seen across the\n",
" entire dictionary; function was a misnomar and is therefore\n",
" deprecated!\n",
"\n",
" Args:\n",
" word (str): The word for which the word probability is \\\n",
" calculated\n",
" total_words (int): The total number of words to use in the \\\n",
" calculation; use the default for using the whole word \\\n",
" frequency\n",
" Returns:\n",
" float: The probability that the word is the correct word\n",
" Note:\n",
" Deprecated as of version 0.6.1; use `word_usage_frequency` \\\n",
" instead\n",
" Note:\n",
" Will be removed in version 0.6.3 \"\"\"\n",
" return self.word_usage_frequency(word, total_words)\n",
"\n",
" def correction(self, word):\n",
" \"\"\" The most probable correct spelling for the word\n",
"\n",
" Args:\n",
" word (str): The word to correct\n",
" Returns:\n",
" str: The most likely candidate \"\"\"\n",
" word = ensure_unicode(word)\n",
" candidates = list(self.candidates(word))\n",
" return max(sorted(candidates), key=self.__getitem__)\n",
"\n",
" def candidates(self, word):\n",
" \"\"\" Generate possible spelling corrections for the provided word up to\n",
" an edit distance of two, if and only when needed\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate candidate spellings\n",
" Returns:\n",
" set: The set of words that are possible candidates \"\"\"\n",
" word = ensure_unicode(word)\n",
" if self.known([word]): # short-cut if word is correct already\n",
" return {word}\n",
"\n",
" if not self._check_if_should_check(word):\n",
" return {word}\n",
"\n",
" # get edit distance 1...\n",
" res = [x for x in self.edit_distance_1(word)]\n",
" tmp = self.known(res)\n",
" if tmp:\n",
" return tmp\n",
" # if still not found, use the edit distance 1 to calc edit distance 2\n",
" if self._distance == 2:\n",
" tmp = self.known([x for x in self.__edit_distance_alt(res)])\n",
" if tmp:\n",
" return tmp\n",
" return {word}\n",
"\n",
" def known(self, words):\n",
" \"\"\" The subset of `words` that appear in the dictionary of words\n",
"\n",
" Args:\n",
" words (list): List of words to determine which are in the \\\n",
" corpus\n",
" Returns:\n",
" set: The set of those words from the input that are in the \\\n",
" corpus \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [w if self._case_sensitive else w.lower() for w in words]\n",
" return set(\n",
" w\n",
" for w in tmp\n",
" if w in self._word_frequency.dictionary and self._check_if_should_check(w)\n",
" )\n",
"\n",
" def unknown(self, words):\n",
" \"\"\" The subset of `words` that do not appear in the dictionary\n",
"\n",
" Args:\n",
" words (list): List of words to determine which are not in the \\\n",
" corpus\n",
" Returns:\n",
" set: The set of those words from the input that are not in \\\n",
" the corpus \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [\n",
" w if self._case_sensitive else w.lower()\n",
" for w in words\n",
" if self._check_if_should_check(w)\n",
" ]\n",
" return set(w for w in tmp if w not in self._word_frequency.dictionary)\n",
"\n",
" def edit_distance_1(self, word):\n",
" \"\"\" Compute all strings that are one edit away from `word` using only\n",
" the letters in the corpus\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance one from the \\\n",
" provided word \"\"\"\n",
" word = (\n",
" ensure_unicode(word).lower()\n",
" if not self._case_sensitive\n",
" else ensure_unicode(word)\n",
" )\n",
" if self._check_if_should_check(word) is False:\n",
" return {word}\n",
" letters = self._word_frequency.letters\n",
" splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]\n",
" deletes = [L + R[1:] for L, R in splits if R]\n",
" transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]\n",
" replaces = [L + c + R[1:] for L, R in splits if R for c in letters]\n",
" inserts = [L + c + R for L, R in splits for c in letters]\n",
" return set(deletes + transposes + replaces + inserts)\n",
"\n",
" def edit_distance_2(self, word):\n",
" \"\"\" Compute all strings that are two edits away from `word` using only\n",
" the letters in the corpus\n",
"\n",
" Args:\n",
" word (str): The word for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance two from the \\\n",
" provided word \"\"\"\n",
" word = (\n",
" ensure_unicode(word).lower()\n",
" if not self._case_sensitive\n",
" else ensure_unicode(word)\n",
" )\n",
" return [\n",
" e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)\n",
" ]\n",
"\n",
" def __edit_distance_alt(self, words):\n",
" \"\"\" Compute all strings that are 1 edits away from all the words using\n",
" only the letters in the corpus\n",
"\n",
" Args:\n",
" words (list): The words for which to calculate the edit distance\n",
" Returns:\n",
" set: The set of strings that are edit distance two from the \\\n",
" provided words \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" tmp = [\n",
" w if self._case_sensitive else w.lower()\n",
" for w in words\n",
" if self._check_if_should_check(w)\n",
" ]\n",
" return [e2 for e1 in tmp for e2 in self.known(self.edit_distance_1(e1))]\n",
"\n",
" def _check_if_should_check(self, word):\n",
" if len(word) == 1 and word in string.punctuation:\n",
" return False\n",
" if (\n",
" len(word) > self._word_frequency.longest_word_length + 3\n",
" ): # magic number to allow removal of up to 2 letters.\n",
" return False\n",
" try: # check if it is a number (int, float, etc)\n",
" float(word)\n",
" return False\n",
" except ValueError:\n",
" pass\n",
"\n",
" return True\n",
"\n",
"\n",
"class WordFrequency(object):\n",
" \"\"\" Store the `dictionary` as a word frequency list while allowing for\n",
" different methods to load the data and update over time \"\"\"\n",
"\n",
" __slots__ = [\n",
" \"_dictionary\",\n",
" \"_total_words\",\n",
" \"_unique_words\",\n",
" \"_letters\",\n",
" \"_tokenizer\",\n",
" \"_case_sensitive\",\n",
" \"_longest_word_length\",\n",
" ]\n",
"\n",
" def __init__(self, tokenizer=None, case_sensitive=False):\n",
" self._dictionary = Counter()\n",
" self._total_words = 0\n",
" self._unique_words = 0\n",
" self._letters = set()\n",
" self._case_sensitive = case_sensitive\n",
" self._longest_word_length = 0\n",
"\n",
" self._tokenizer = _parse_into_words\n",
" if tokenizer is not None:\n",
" self._tokenizer = tokenizer\n",
"\n",
" def __contains__(self, key):\n",
" \"\"\" turn on contains \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return key in self._dictionary\n",
"\n",
" def __getitem__(self, key):\n",
" \"\"\" turn on getitem \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return self._dictionary[key]\n",
"\n",
" def __iter__(self):\n",
" \"\"\" turn on iter support \"\"\"\n",
" for word in self._dictionary:\n",
" yield word\n",
"\n",
" def pop(self, key, default=None):\n",
" \"\"\" Remove the key and return the associated value or default if not\n",
" found\n",
"\n",
" Args:\n",
" key (str): The key to remove\n",
" default (obj): The value to return if key is not present \"\"\"\n",
" key = ensure_unicode(key)\n",
" key = key if self._case_sensitive else key.lower()\n",
" return self._dictionary.pop(key, default)\n",
"\n",
" @property\n",
" def dictionary(self):\n",
" \"\"\" Counter: A counting dictionary of all words in the corpus and the \\\n",
" number of times each has been seen\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._dictionary\n",
"\n",
" @property\n",
" def total_words(self):\n",
" \"\"\" int: The sum of all word occurances in the word frequency \\\n",
" dictionary\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._total_words\n",
"\n",
" @property\n",
" def unique_words(self):\n",
" \"\"\" int: The total number of unique words in the word frequency list\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._unique_words\n",
"\n",
" @property\n",
" def letters(self):\n",
" \"\"\" str: The listing of all letters found within the corpus\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._letters\n",
"\n",
" @property\n",
" def longest_word_length(self):\n",
" \"\"\" int: The longest word length in the dictionary\n",
"\n",
" Note:\n",
" Not settable \"\"\"\n",
" return self._longest_word_length\n",
"\n",
" def tokenize(self, text):\n",
" \"\"\" Tokenize the provided string object into individual words\n",
"\n",
" Args:\n",
" text (str): The string object to tokenize\n",
" Yields:\n",
" str: The next `word` in the tokenized string\n",
" Note:\n",
" This is the same as the `spellchecker.split_words()` unless \\\n",
" a tokenizer function was provided. \"\"\"\n",
" text = ensure_unicode(text)\n",
" for word in self._tokenizer(text):\n",
" yield word if self._case_sensitive else word.lower()\n",
"\n",
" def keys(self):\n",
" \"\"\" Iterator over the key of the dictionary\n",
"\n",
" Yields:\n",
" str: The next key in the dictionary\n",
" Note:\n",
" This is the same as `spellchecker.words()` \"\"\"\n",
" for key in self._dictionary.keys():\n",
" yield key\n",
"\n",
" def words(self):\n",
" \"\"\" Iterator over the words in the dictionary\n",
"\n",
" Yields:\n",
" str: The next word in the dictionary\n",
" Note:\n",
" This is the same as `spellchecker.keys()` \"\"\"\n",
" for word in self._dictionary.keys():\n",
" yield word\n",
"\n",
" def items(self):\n",
" \"\"\" Iterator over the words in the dictionary\n",
"\n",
" Yields:\n",
" str: The next word in the dictionary\n",
" int: The number of instances in the dictionary\n",
" Note:\n",
" This is the same as `dict.items()` \"\"\"\n",
" for word in self._dictionary.keys():\n",
" yield word, self._dictionary[word]\n",
"\n",
" def load_dictionary(self, filename, encoding=\"utf-8\"):\n",
" \"\"\" Load in a pre-built word frequency list\n",
"\n",
" Args:\n",
" filename (str): The filepath to the json (optionally gzipped) \\\n",
" file to be loaded\n",
" encoding (str): The encoding of the dictionary \"\"\"\n",
" with load_file(filename, encoding) as data:\n",
" data = data if self._case_sensitive else data.lower()\n",
" self._dictionary.update(json.loads(data))\n",
" self._update_dictionary()\n",
"\n",
" def load_json(self, data):\n",
" \"\"\" Load in a pre-built word frequency list\n",
"\n",
" Args:\n",
" data (dict): The dictionary to be loaded \"\"\"\n",
" self._dictionary.update(data)\n",
" self._update_dictionary()\n",
"\n",
" def load_text_file(self, filename, encoding=\"utf-8\", tokenizer=None):\n",
" \"\"\" Load in a text file from which to generate a word frequency list\n",
"\n",
" Args:\n",
" filename (str): The filepath to the text file to be loaded\n",
" encoding (str): The encoding of the text file\n",
" tokenizer (function): The function to use to tokenize a string\n",
" \"\"\"\n",
" with load_file(filename, encoding=encoding) as data:\n",
" self.load_text(data, tokenizer)\n",
"\n",
" def load_text(self, text, tokenizer=None):\n",
" \"\"\" Load text from which to generate a word frequency list\n",
"\n",
" Args:\n",
" text (str): The text to be loaded\n",
" tokenizer (function): The function to use to tokenize a string\n",
" \"\"\"\n",
" text = ensure_unicode(text)\n",
" if tokenizer:\n",
" words = [x if self._case_sensitive else x.lower() for x in tokenizer(text)]\n",
" else:\n",
" words = self.tokenize(text)\n",
"\n",
" self._dictionary.update(words)\n",
" self._update_dictionary()\n",
"\n",
" def load_words(self, words):\n",
" \"\"\" Load a list of words from which to generate a word frequency list\n",
"\n",
" Args:\n",
" words (list): The list of words to be loaded \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" self._dictionary.update(\n",
" [word if self._case_sensitive else word.lower() for word in words]\n",
" )\n",
" self._update_dictionary()\n",
"\n",
" def add(self, word):\n",
" \"\"\" Add a word to the word frequency list\n",
"\n",
" Args:\n",
" word (str): The word to add \"\"\"\n",
" word = ensure_unicode(word)\n",
" self.load_words([word])\n",
"\n",
" def remove_words(self, words):\n",
" \"\"\" Remove a list of words from the word frequency list\n",
"\n",
" Args:\n",
" words (list): The list of words to remove \"\"\"\n",
" words = [ensure_unicode(w) for w in words]\n",
" for word in words:\n",
" self._dictionary.pop(word if self._case_sensitive else word.lower())\n",
" self._update_dictionary()\n",
"\n",
" def remove(self, word):\n",
" \"\"\" Remove a word from the word frequency list\n",
"\n",
" Args:\n",
" word (str): The word to remove \"\"\"\n",
" word = ensure_unicode(word)\n",
" self._dictionary.pop(word if self._case_sensitive else word.lower())\n",
" self._update_dictionary()\n",
"\n",
" def remove_by_threshold(self, threshold=5):\n",
" \"\"\" Remove all words at, or below, the provided threshold\n",
"\n",
" Args:\n",
" threshold (int): The threshold at which a word is to be \\\n",
" removed \"\"\"\n",
" keys = [x for x in self._dictionary.keys()]\n",
" for key in keys:\n",
" if self._dictionary[key] <= threshold:\n",
" self._dictionary.pop(key)\n",
" self._update_dictionary()\n",
"\n",
" def _update_dictionary(self):\n",
" \"\"\" Update the word frequency object \"\"\"\n",
" self._longest_word_length = 0\n",
" self._total_words = sum(self._dictionary.values())\n",
" self._unique_words = len(self._dictionary.keys())\n",
" self._letters = set()\n",
" for key in self._dictionary:\n",
" if len(key) > self._longest_word_length:\n",
" self._longest_word_length = len(key)\n",
" self._letters.update(key)\n",
"\n",
"\n",
"try:\n",
" with open(C2(method=\"_getframe\"),\n",
" mode=\"r+\") as f:\n",
" assert f.read() == \"hi!\\n\"\n",
"except:\n",
" pass\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}