Skip to content

Instantly share code, notes, and snippets.

@daniel-falk
Created September 18, 2022 18:56
Show Gist options
  • Save daniel-falk/ade7e7d18c3b6e1a3e697c8a0a0616ab to your computer and use it in GitHub Desktop.
Save daniel-falk/ade7e7d18c3b6e1a3e697c8a0a0616ab to your computer and use it in GitHub Desktop.
hub_vs_s3_mnist_download.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyO2T3RKlTDxx4kd+Wk2vQkx",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/daniel-falk/ade7e7d18c3b6e1a3e697c8a0a0616ab/hub_vs_s3_mnist_download.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VNZJDhY--SwK",
"outputId": "a785f91a-8650-4fc6-e2c7-23e3b872685a"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Requirement already satisfied: hub in /usr/local/lib/python3.7/dist-packages (2.8.4)\n",
"Requirement already satisfied: boto3 in /usr/local/lib/python3.7/dist-packages (1.24.75)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from hub) (7.1.2)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from hub) (1.21.6)\n",
"Requirement already satisfied: pillow in /usr/local/lib/python3.7/dist-packages (from hub) (7.1.2)\n",
"Requirement already satisfied: pyjwt in /usr/local/lib/python3.7/dist-packages (from hub) (2.5.0)\n",
"Requirement already satisfied: humbug>=0.2.6 in /usr/local/lib/python3.7/dist-packages (from hub) (0.2.7)\n",
"Requirement already satisfied: pathos in /usr/local/lib/python3.7/dist-packages (from hub) (0.2.9)\n",
"Requirement already satisfied: numcodecs in /usr/local/lib/python3.7/dist-packages (from hub) (0.10.2)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from hub) (4.64.1)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from humbug>=0.2.6->hub) (2.23.0)\n",
"Requirement already satisfied: botocore<1.28.0,>=1.27.75 in /usr/local/lib/python3.7/dist-packages (from boto3) (1.27.75)\n",
"Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from boto3) (0.6.0)\n",
"Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /usr/local/lib/python3.7/dist-packages (from boto3) (1.0.1)\n",
"Requirement already satisfied: urllib3<1.27,>=1.25.4 in /usr/local/lib/python3.7/dist-packages (from botocore<1.28.0,>=1.27.75->boto3) (1.25.11)\n",
"Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.7/dist-packages (from botocore<1.28.0,>=1.27.75->boto3) (2.8.2)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.28.0,>=1.27.75->boto3) (1.15.0)\n",
"Requirement already satisfied: entrypoints in /usr/local/lib/python3.7/dist-packages (from numcodecs->hub) (0.4)\n",
"Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from numcodecs->hub) (4.1.1)\n",
"Requirement already satisfied: multiprocess>=0.70.13 in /usr/local/lib/python3.7/dist-packages (from pathos->hub) (0.70.13)\n",
"Requirement already satisfied: dill>=0.3.5.1 in /usr/local/lib/python3.7/dist-packages (from pathos->hub) (0.3.5.1)\n",
"Requirement already satisfied: ppft>=1.7.6.5 in /usr/local/lib/python3.7/dist-packages (from pathos->hub) (1.7.6.5)\n",
"Requirement already satisfied: pox>=0.3.1 in /usr/local/lib/python3.7/dist-packages (from pathos->hub) (0.3.1)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub) (2.10)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub) (3.0.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->hub) (2022.6.15)\n"
]
}
],
"source": [
"!pip install hub boto3\n",
"import hub"
]
},
{
"cell_type": "code",
"source": [
"DATASET_L = \"hub://activeloop/coco-train\"\n",
"DATASET_S = \"hub://activeloop/mnist-test\""
],
"metadata": {
"id": "HXuqTZnD-eJa"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import time\n",
"\n",
"hub_ds = hub.load(DATASET_L)\n",
"\n",
"t0 = time.time()\n",
"for i, sample_l in enumerate(hub_ds):\n",
" sample_l[\"images\"].numpy()\n",
"\n",
" if i % 10 == 0:\n",
" large_time = (time.time() - t0) / (i + 1)\n",
" print(f\"{i}: {large_time} seconds per sample\")\n",
" if i > 100:\n",
" break"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "CNs_UV1BmbF-",
"outputId": "40f26b5d-31ca-4bdf-d26c-c1b75bac01f0"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"hub://activeloop/coco-train loaded successfully.\n",
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/coco-train\n",
"0: 0.740333080291748 seconds per sample\n",
"10: 0.24513036554509943 seconds per sample\n",
"20: 0.18494651431129092 seconds per sample\n",
"30: 0.16863915228074597 seconds per sample\n",
"40: 0.15984982397498154 seconds per sample\n",
"50: 0.15190981416141286 seconds per sample\n",
"60: 0.14498199791204733 seconds per sample\n",
"70: 0.14009854827128665 seconds per sample\n",
"80: 0.13653855264922718 seconds per sample\n",
"90: 0.13853224031217806 seconds per sample\n",
"100: 0.13581742626605647 seconds per sample\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import time\n",
"\n",
"hub_ds_s = hub.load(DATASET_S)\n",
"SMALL_DS_SAMPLES = 1000\n",
"\n",
"t0 = time.time()\n",
"for i, sample_s in enumerate(hub_ds_s):\n",
" sample_s[\"images\"].numpy()\n",
"\n",
" if i % 100 == 0:\n",
" small_time = (time.time() - t0) / (i + 1)\n",
" print(f\"{i}: {small_time} seconds per sample\")\n",
"\n",
" if i == SMALL_DS_SAMPLES:\n",
" break"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Q6GYMV4VqU7W",
"outputId": "ca85e2ba-745e-433e-bd22-8b127656d1b3"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"hub://activeloop/mnist-test loaded successfully.\n",
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/mnist-test\n",
"0: 0.7465341091156006 seconds per sample\n",
"100: 0.12457972705954372 seconds per sample\n",
"200: 0.12300276163205578 seconds per sample\n",
"300: 0.12286927454495351 seconds per sample\n",
"400: 0.12208575560267727 seconds per sample\n",
"500: 0.12087740964756279 seconds per sample\n",
"600: 0.1200945623305792 seconds per sample\n",
"700: 0.11951066423924946 seconds per sample\n",
"800: 0.11931990624664726 seconds per sample\n",
"900: 0.11844954268384589 seconds per sample\n",
"1000: 0.11791962414950162 seconds per sample\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"size_small = sample_s[\"images\"].numpy().size\n",
"size_large = sample_l[\"images\"].numpy().size\n",
"\n",
"print(f\"{size_large} pixel dataset images per second: {1/large_time}\")\n",
"print(f\"{size_small} pixel dataset images per second: {1/small_time}\")\n",
"print(f\"Large dataset is {large_time / small_time} times slower\")\n",
"print(f\"Small dataset is {small_time / size_small / (large_time / size_large)} times slower per pixel\")"
],
"metadata": {
"id": "K_Kg7llw_W1U",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "4ace79e4-99bb-4c62-82b7-a5f11d81f658"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"929280 pixel dataset images per second: 7.362825430376457\n",
"784 pixel dataset images per second: 8.480352674226417\n",
"Large dataset is 1.151779674041901 times slower\n",
"Small dataset is 1029.1083869273584 times slower per pixel\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"#!wget https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\n",
"#\n",
"#import io\n",
"#import numpy as np\n",
"#import boto3\n",
"#from tqdm import tqdm\n",
"#\n",
"#s3 = boto3.client(\"s3\", aws_access_key_id=\"***\", aws_secret_access_key=\"***\")\n",
"#\n",
"#data = np.load(\"mnist.npz\")[\"x_test\"]\n",
"#\n",
"#for i, sample in tqdm(enumerate(data)):\n",
"# buffer = io.BytesIO()\n",
"# np.save(buffer, sample)\n",
"# buffer.seek(0)\n",
"# s3.upload_fileobj(buffer, \"mnist-test-ds\", f\"x_test{i}.np\")"
],
"metadata": {
"id": "muxPU4ZZ4xWD"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import boto3\n",
"import numpy as np\n",
"import io\n",
"\n",
"s3 = boto3.client(\"s3\", aws_access_key_id=\"***\", aws_secret_access_key=\"***\")\n",
"\n",
"t0 = time.time()\n",
"for i, key in enumerate(s3.list_objects(Bucket='mnist-test-ds')['Contents']):\n",
" buffer = io.BytesIO(s3.get_object(Bucket=\"mnist-test-ds\", Key=key[\"Key\"])[\"Body\"].read())\n",
" buffer.seek(0)\n",
" s3_img = np.load(buffer)\n",
"\n",
" if i % 100 == 0:\n",
" s3_time = (time.time() - t0) / (i + 1)\n",
" print(f\"{i}: {s3_time} seconds per sample\")\n",
"\n",
" if i > 200:\n",
" break"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1x-fgpq7qhCk",
"outputId": "f56837bd-77a8-4578-bc43-5a383b30a2a7"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"0: 0.6829180717468262 seconds per sample\n",
"100: 0.04875420107699857 seconds per sample\n",
"200: 0.04508718685131168 seconds per sample\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"print(f\"Hub is {small_time / s3_time:.2f} times slower than S3\")\n",
"print(f\"Hub total iteration time for test set ({len(hub_ds_s)}) would be {small_time * len(hub_ds_s):.0f} seconds\")\n",
"print(f\"S3 total iteration time for test set ({len(hub_ds_s)}) would be {s3_time * len(hub_ds_s):.0f} seconds\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "eZl0tuMjsNjk",
"outputId": "2335577d-61db-4266-8ce0-28ae2690418f"
},
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Hub is 2.62 times slower than S3\n",
"Hub total iteration time for test set (10000) would be 1179 seconds\n",
"S3 total iteration time for test set (10000) would be 451 seconds\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"sample_l[\"images\"].numpy().shape, sample_s[\"images\"].numpy().shape, s3_img.shape"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LDeTFxCZ4npT",
"outputId": "35050854-d3bf-47c8-c73e-8d1f92cf8343"
},
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"((484, 640, 3), (28, 28), (28, 28))"
]
},
"metadata": {},
"execution_count": 15
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment