Skip to content

Instantly share code, notes, and snippets.

@daniel-falk
Created September 19, 2022 18:23
Show Gist options
  • Save daniel-falk/188b96013e9f0cedcf555a0a30fa177d to your computer and use it in GitHub Desktop.
Save daniel-falk/188b96013e9f0cedcf555a0a30fa177d to your computer and use it in GitHub Desktop.
hub-to-tf-benchmark.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyNOYUjviNyuGHFpIVpH5/uK",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/daniel-falk/188b96013e9f0cedcf555a0a30fa177d/hub-to-tf-benchmark.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VNZJDhY--SwK",
"outputId": "d439739c-15a3-450e-91e5-3d34ca595687"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting git+https://github.com/daniel-falk/Hub.git@improve-tensorflow-dataset-throughput\n",
" Cloning https://github.com/daniel-falk/Hub.git (to revision improve-tensorflow-dataset-throughput) to /tmp/pip-req-build-gbleo__j\n",
" Running command git clone -q https://github.com/daniel-falk/Hub.git /tmp/pip-req-build-gbleo__j\n",
" Running command git checkout -b improve-tensorflow-dataset-throughput --track origin/improve-tensorflow-dataset-throughput\n",
" Switched to a new branch 'improve-tensorflow-dataset-throughput'\n",
" Branch 'improve-tensorflow-dataset-throughput' set up to track remote branch 'improve-tensorflow-dataset-throughput' from 'origin'.\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (1.21.6)\n",
"Requirement already satisfied: pillow in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (7.1.2)\n",
"Requirement already satisfied: boto3 in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (1.24.75)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (7.1.2)\n",
"Requirement already satisfied: pathos in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (0.2.9)\n",
"Requirement already satisfied: humbug>=0.2.6 in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (0.2.7)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (4.64.1)\n",
"Requirement already satisfied: numcodecs in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (0.10.2)\n",
"Requirement already satisfied: pyjwt in /usr/local/lib/python3.7/dist-packages (from hub==2.8.5) (2.5.0)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from humbug>=0.2.6->hub==2.8.5) (2.23.0)\n",
"Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /usr/local/lib/python3.7/dist-packages (from boto3->hub==2.8.5) (1.0.1)\n",
"Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from boto3->hub==2.8.5) (0.6.0)\n",
"Requirement already satisfied: botocore<1.28.0,>=1.27.75 in /usr/local/lib/python3.7/dist-packages (from boto3->hub==2.8.5) (1.27.75)\n",
"Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.7/dist-packages (from botocore<1.28.0,>=1.27.75->boto3->hub==2.8.5) (2.8.2)\n",
"Requirement already satisfied: urllib3<1.27,>=1.25.4 in /usr/local/lib/python3.7/dist-packages (from botocore<1.28.0,>=1.27.75->boto3->hub==2.8.5) (1.25.11)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.28.0,>=1.27.75->boto3->hub==2.8.5) (1.15.0)\n",
"Requirement already satisfied: entrypoints in /usr/local/lib/python3.7/dist-packages (from numcodecs->hub==2.8.5) (0.4)\n",
"Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from numcodecs->hub==2.8.5) (4.1.1)\n",
"Requirement already satisfied: ppft>=1.7.6.5 in /usr/local/lib/python3.7/dist-packages (from pathos->hub==2.8.5) (1.7.6.5)\n",
"Requirement already satisfied: pox>=0.3.1 in /usr/local/lib/python3.7/dist-packages (from pathos->hub==2.8.5) (0.3.1)\n",
"Requirement already satisfied: multiprocess>=0.70.13 in /usr/local/lib/python3.7/dist-packages (from pathos->hub==2.8.5) (0.70.13)\n",
"Requirement already satisfied: dill>=0.3.5.1 in /usr/local/lib/python3.7/dist-packages (from pathos->hub==2.8.5) (0.3.5.1)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub==2.8.5) (2.10)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub==2.8.5) (3.0.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub==2.8.5) (2022.6.15)\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (4.64.1)\n"
]
}
],
"source": [
"!pip install git+https://github.com/daniel-falk/Hub.git@improve-tensorflow-dataset-throughput\n",
"!pip install tqdm\n",
"\n",
"import hub\n",
"from tqdm import tqdm\n",
"import time"
]
},
{
"cell_type": "code",
"source": [
"DATASET = \"hub://activeloop/mnist-test\""
],
"metadata": {
"id": "HXuqTZnD-eJa"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"hub_ds = hub.load(DATASET)\n",
"\n",
"t0 = time.time()\n",
"for i, sample in tqdm(enumerate(hub_ds.tensorflow(fetch_chunks=True))):\n",
" sample[\"images\"]\n",
"tf_chunked_total_time = time.time() - t0\n",
"tf_chunked_time = tf_chunked_total_time / (i + 1)\n",
"\n",
"print(f\"TF chunked dataset images per second: {1/tf_chunked_time}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Q6GYMV4VqU7W",
"outputId": "b3621231-6d8d-4b6c-cb36-b967b4f7255e"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"hub://activeloop/mnist-test loaded successfully.\n",
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/mnist-test\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"10000it [00:15, 659.45it/s]"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"TF chunked dataset images per second: 657.2564827721415\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"hub_ds = hub.load(DATASET)\n",
"\n",
"t0 = time.time()\n",
"for i, sample in tqdm(enumerate(hub_ds.tensorflow())):\n",
" sample[\"images\"]\n",
"tf_total_time = time.time() - t0\n",
"tf_time = tf_total_time / (i + 1)\n",
"\n",
"print(f\"TF (current) dataset images per second: {1/tf_time}\")"
],
"metadata": {
"id": "muxPU4ZZ4xWD",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "89f36855-dd0b-44d6-ee44-c8f694b2175c"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"hub://activeloop/mnist-test loaded successfully.\n",
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/mnist-test\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"10000it [06:28, 25.74it/s]"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"TF (current) dataset images per second: 25.734239888534077\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import time\n",
"\n",
"hub_ds = hub.load(DATASET)\n",
"\n",
"t0 = time.time()\n",
"for i, sample in tqdm(enumerate(hub_ds)):\n",
" sample[\"images\"].numpy(fetch_chunks=True)\n",
"hub_chunked_total_time = time.time() - t0\n",
"hub_chunked_time = hub_chunked_total_time / (i + 1)\n",
"\n",
"\n",
"print(f\"Hub chunked dataset images per second: {1/hub_chunked_time}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "CNs_UV1BmbF-",
"outputId": "cf80b637-8355-4f67-a907-f86590f13e68"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"hub://activeloop/mnist-test loaded successfully.\n",
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/mnist-test\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"10000it [00:07, 1296.13it/s]"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Hub chunked dataset images per second: 1294.3886179085591\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import time\n",
"\n",
"hub_ds = hub.load(DATASET)\n",
"\n",
"t0 = time.time()\n",
"for i, sample in tqdm(enumerate(hub_ds)):\n",
" sample[\"images\"].numpy()\n",
"hub_total_time = time.time() - t0\n",
"hub_time = hub_total_time / (i + 1)\n",
"\n",
"\n",
"print(f\"Hub dataset images per second: {1/hub_time}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QYg_qnXC6DrF",
"outputId": "9103f08a-c414-4c27-fd33-c7b6443c7c73"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"hub://activeloop/mnist-test loaded successfully.\n",
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/mnist-test\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"10000it [06:00, 27.76it/s]"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Hub dataset images per second: 27.76194003511129\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "k_Py5L-79_2U"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment