Skip to content

Instantly share code, notes, and snippets.

@daniel-falk
Created September 20, 2022 06:12
Show Gist options
  • Save daniel-falk/e827be6abf4eb81c7083ec1732b5bb39 to your computer and use it in GitHub Desktop.
hub-to-tf-benchmark-coco.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyOqHFss1vAMSYuPT0N2b1ue",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/daniel-falk/e827be6abf4eb81c7083ec1732b5bb39/hub-to-tf-benchmark-coco.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VNZJDhY--SwK",
"outputId": "08369483-1ba9-41d4-9cfa-b3d1939b3409"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting git+https://github.com/daniel-falk/Hub.git@improve-tensorflow-dataset-throughput\n",
" Cloning https://github.com/daniel-falk/Hub.git (to revision improve-tensorflow-dataset-throughput) to /tmp/pip-req-build-40ubq8wd\n",
" Running command git clone -q https://github.com/daniel-falk/Hub.git /tmp/pip-req-build-40ubq8wd\n",
" Running command git checkout -b improve-tensorflow-dataset-throughput --track origin/improve-tensorflow-dataset-throughput\n",
" Switched to a new branch 'improve-tensorflow-dataset-throughput'\n",
" Branch 'improve-tensorflow-dataset-throughput' set up to track remote branch 'improve-tensorflow-dataset-throughput' from 'origin'.\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (1.21.6)\n",
"Requirement already satisfied: pillow in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (7.1.2)\n",
"Collecting boto3\n",
" Downloading boto3-1.24.76-py3-none-any.whl (132 kB)\n",
"\u001b[K |████████████████████████████████| 132 kB 8.3 MB/s \n",
"\u001b[?25hRequirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (7.1.2)\n",
"Collecting pathos\n",
" Downloading pathos-0.2.9-py3-none-any.whl (76 kB)\n",
"\u001b[K |████████████████████████████████| 76 kB 3.2 MB/s \n",
"\u001b[?25hCollecting humbug>=0.2.6\n",
" Downloading humbug-0.2.7-py3-none-any.whl (11 kB)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (4.64.1)\n",
"Collecting numcodecs\n",
" Downloading numcodecs-0.10.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)\n",
"\u001b[K |████████████████████████████████| 6.6 MB 8.3 MB/s \n",
"\u001b[?25hCollecting pyjwt\n",
" Downloading PyJWT-2.5.0-py3-none-any.whl (20 kB)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from humbug>=0.2.6->hub==2.8.5) (2.23.0)\n",
"Collecting botocore<1.28.0,>=1.27.76\n",
" Downloading botocore-1.27.76-py3-none-any.whl (9.1 MB)\n",
"\u001b[K |████████████████████████████████| 9.1 MB 55.9 MB/s \n",
"\u001b[?25hCollecting jmespath<2.0.0,>=0.7.1\n",
" Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n",
"Collecting s3transfer<0.7.0,>=0.6.0\n",
" Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)\n",
"\u001b[K |████████████████████████████████| 79 kB 7.7 MB/s \n",
"\u001b[?25hRequirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.7/dist-packages (from botocore<1.28.0,>=1.27.76->boto3->hub==2.8.5) (2.8.2)\n",
"Collecting urllib3<1.27,>=1.25.4\n",
" Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)\n",
"\u001b[K |████████████████████████████████| 140 kB 53.0 MB/s \n",
"\u001b[?25hRequirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.28.0,>=1.27.76->boto3->hub==2.8.5) (1.15.0)\n",
"Requirement already satisfied: entrypoints in /usr/local/lib/python3.7/dist-packages (from numcodecs->hub==2.8.5) (0.4)\n",
"Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from numcodecs->hub==2.8.5) (4.1.1)\n",
"Collecting ppft>=1.7.6.5\n",
" Downloading ppft-1.7.6.5-py2.py3-none-any.whl (52 kB)\n",
"\u001b[K |████████████████████████████████| 52 kB 1.1 MB/s \n",
"\u001b[?25hCollecting multiprocess>=0.70.13\n",
" Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)\n",
"\u001b[K |████████████████████████████████| 115 kB 49.3 MB/s \n",
"\u001b[?25hCollecting pox>=0.3.1\n",
" Downloading pox-0.3.1-py2.py3-none-any.whl (28 kB)\n",
"Requirement already satisfied: dill>=0.3.5.1 in /usr/local/lib/python3.7/dist-packages (from pathos->hub==2.8.5) (0.3.5.1)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub==2.8.5) (3.0.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub==2.8.5) (2022.6.15)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub==2.8.5) (2.10)\n",
"Collecting urllib3<1.27,>=1.25.4\n",
" Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)\n",
"\u001b[K |████████████████████████████████| 127 kB 50.4 MB/s \n",
"\u001b[?25hBuilding wheels for collected packages: hub\n",
" Building wheel for hub (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for hub: filename=hub-2.8.5-py3-none-any.whl size=6519148 sha256=c1b1872b7ecff0922ffcb0c055200799b0e309e51c7c8c87bf57d86f2ff0e5e8\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-9rkfdpjt/wheels/1a/d1/9d/0f7c57e91a1f5bed11e78a4d2b2cde7f4e21e44bb751c001b8\n",
"Successfully built hub\n",
"Installing collected packages: urllib3, jmespath, botocore, s3transfer, ppft, pox, multiprocess, pyjwt, pathos, numcodecs, humbug, boto3, hub\n",
" Attempting uninstall: urllib3\n",
" Found existing installation: urllib3 1.24.3\n",
" Uninstalling urllib3-1.24.3:\n",
" Successfully uninstalled urllib3-1.24.3\n",
"Successfully installed boto3-1.24.76 botocore-1.27.76 hub-2.8.5 humbug-0.2.7 jmespath-1.0.1 multiprocess-0.70.13 numcodecs-0.10.2 pathos-0.2.9 pox-0.3.1 ppft-1.7.6.5 pyjwt-2.5.0 s3transfer-0.6.0 urllib3-1.25.11\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (4.64.1)\n"
]
}
],
"source": [
"# Install the experimental Hub branch under test, plus tqdm for progress bars.\n",
"# %pip (rather than !pip) ensures packages are installed into the running\n",
"# kernel's environment, not whatever `pip` the shell resolves to.\n",
"%pip install git+https://github.com/daniel-falk/Hub.git@improve-tensorflow-dataset-throughput\n",
"%pip install tqdm\n",
"\n",
"import hub\n",
"from tqdm import tqdm\n",
"import time"
]
},
{
"cell_type": "code",
"source": [
"# Benchmark configuration: which Hub dataset to stream and how many samples\n",
"# to time in each of the measurement cells below.\n",
"DATASET = \"hub://activeloop/coco-test\"\n",
"NUM_SAMPLES = 1000"
]
"metadata": {
"id": "HXuqTZnD-eJa"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"hub_ds = hub.load(DATASET)\n",
"\n",
"# Time how long it takes to pull NUM_SAMPLES images through the TensorFlow\n",
"# dataset wrapper with chunk-fetching enabled (the branch under test).\n",
"t0 = time.time()\n",
"for i, sample in tqdm(enumerate(hub_ds.tensorflow(fetch_chunks=True, tensors=(\"images\",))), total=NUM_SAMPLES):\n",
"    sample[\"images\"]  # touch the tensor so the data is actually fetched\n",
"    if i + 1 == NUM_SAMPLES:  # i is zero-based: stop after exactly NUM_SAMPLES samples\n",
"        break\n",
"tf_chunked_total_time = time.time() - t0\n",
"tf_chunked_time = tf_chunked_total_time / (i + 1)\n",
"\n",
"print(f\"TF chunked dataset images per second: {1/tf_chunked_time}\")"
]
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Q6GYMV4VqU7W",
"outputId": "9f850cb3-64c7-4348-9ba0-ad74210f815d"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"hub://activeloop/coco-test loaded successfully.\n",
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/coco-test\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"1000it [00:33, 29.87it/s]"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"TF chunked dataset images per second: 29.86230547618074\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"hub_ds = hub.load(DATASET)\n",
"\n",
"# Baseline: the same measurement without fetch_chunks, i.e. the current\n",
"# default TensorFlow integration that fetches samples one at a time.\n",
"t0 = time.time()\n",
"for i, sample in tqdm(enumerate(hub_ds.tensorflow(tensors=(\"images\",))), total=NUM_SAMPLES):\n",
"    sample[\"images\"]  # touch the tensor so the data is actually fetched\n",
"    if i + 1 == NUM_SAMPLES:  # i is zero-based: stop after exactly NUM_SAMPLES samples\n",
"        break\n",
"tf_total_time = time.time() - t0\n",
"tf_time = tf_total_time / (i + 1)\n",
"\n",
"print(f\"TF (current) dataset images per second: {1/tf_time}\")"
]
],
"metadata": {
"id": "muxPU4ZZ4xWD",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "cfaeb8eb-92d0-422c-932f-0d1fc6f8a052"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"hub://activeloop/coco-test loaded successfully.\n",
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/coco-test\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"1000it [02:19, 7.19it/s]"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"TF (current) dataset images per second: 7.189705021858493\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Reference point: iterate the Hub dataset directly (no TensorFlow wrapper)\n",
"# and decode each image with chunk-fetching enabled.\n",
"hub_ds = hub.load(DATASET)\n",
"\n",
"t0 = time.time()\n",
"for i, sample in tqdm(enumerate(hub_ds), total=NUM_SAMPLES):\n",
"    sample[\"images\"].numpy(fetch_chunks=True)  # decode forces the actual data fetch\n",
"    if i + 1 == NUM_SAMPLES:  # i is zero-based: stop after exactly NUM_SAMPLES samples\n",
"        break\n",
"hub_chunked_total_time = time.time() - t0\n",
"hub_chunked_time = hub_chunked_total_time / (i + 1)\n",
"\n",
"print(f\"Hub chunked dataset images per second: {1/hub_chunked_time}\")"
]
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "CNs_UV1BmbF-",
"outputId": "fa031705-940d-47fc-bd91-6aa8472a1112"
},
"execution_count": 15,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"hub://activeloop/coco-test loaded successfully.\n",
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/coco-test\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"1000it [00:26, 38.39it/s]"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Hub chunked dataset images per second: 38.42287724201949\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"\n"
]
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment