Skip to content

Instantly share code, notes, and snippets.

@daniel-falk
Created November 21, 2022 07:14
Show Gist options
  • Save daniel-falk/37d1ae23c2e4e458ecf4f63f35bcbbad to your computer and use it in GitHub Desktop.
Save daniel-falk/37d1ae23c2e4e458ecf4f63f35bcbbad to your computer and use it in GitHub Desktop.
deeplake-corruption-issues.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyNwc1N/XTlDsb267szwgT5D",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/daniel-falk/37d1ae23c2e4e458ecf4f63f35bcbbad/deeplake-corruption-issues.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "dXAaOeG6Fl-O"
},
"outputs": [],
"source": [
"!pip install deeplake==3.1.0"
]
},
{
"cell_type": "code",
"source": [
"import deeplake\n",
"import numpy as np"
],
"metadata": {
"id": "4EcGwZ_eFv-r"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"ds = deeplake.empty(\"/tmp/ds1\", overwrite=True)\n",
"\n",
"ds.create_tensor(\"images\", htype=\"image\", sample_compression=\"jpeg\")\n",
"ds.create_tensor(\"labels\", htype=\"class_label\")\n",
"ds.create_tensor(\"description\", htype=\"text\")\n",
"\n",
"ds.commit(\"Empty\")\n",
"\n",
"s = np.ones(shape=(3,3,1))\n",
"\n",
"NUM_SAMPLES = 255\n",
"ds.extend({\n",
" \"images\": [(s.copy() * n).astype(np.uint8) for n in range(NUM_SAMPLES)],\n",
" \"labels\": [n for n in range(NUM_SAMPLES)],\n",
" \"description\": [f\"img{n}\" for n in range(NUM_SAMPLES)],\n",
"})\n",
"ds.commit(f\"Add img 1-{NUM_SAMPLES - 1} to main\")"
],
"metadata": {
"id": "DRgIUeqxF9ne"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"@deeplake.compute\n",
"def append_sum_value(sample_in, sample_out):\n",
" img = sample_in.images.numpy()\n",
" sample_out.img_sums.append(img.sum())\n",
"\n",
" sample_out.images.append(sample_in.images)\n",
" sample_out.labels.append(sample_in.labels)\n",
" sample_out.description.append(sample_in.description.text())\n",
"\n",
"ds2 = deeplake.load(\"/tmp/ds1\")\n",
"ds2.checkout(ds2.commits[-1][\"commit\"]) # Checkout the empty root\n",
"ds2.checkout(\"branch2\", create=True)\n",
"ds2.create_tensor(\"img_sums\", htype=\"generic\")\n",
"append_sum_value().eval(ds, ds2, num_workers=8)\n",
"ds2.commit(\"Added samples with image sums\")"
],
"metadata": {
"id": "HLOMBLExHsCn"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"\n",
"idx_to_remove = np.where(ds2.img_sums.numpy() < ds2.img_sums.numpy().mean())[0]"
],
"metadata": {
"id": "RKYYO5ySICNA"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"for idx in sorted(map(int, idx_to_remove), reverse=True):\n",
" ds.pop(idx)\n",
"\n",
"ds.commit(\"Removed dark samples\")"
],
"metadata": {
"id": "gthtJA4DAwxR"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# In my script that looks like this but are a bit more complex,\n",
"# this line would fail when accessing the label\n",
"for sample in ds:\n",
" print(sample.images.numpy()[0,0,0], sample.labels.numpy()[0]) # Would sometimes get an exception"
],
"metadata": {
"id": "8BArGquQKBh0"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment