Created
November 20, 2022 09:39
-
-
Save daniel-falk/623253501c4ef7dadd38a853ca532a57 to your computer and use it in GitHub Desktop.
deeplake-merge-issues.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"authorship_tag": "ABX9TyPSh8WsdwjAe0GoAoOUFjuJ", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/daniel-falk/623253501c4ef7dadd38a853ca532a57/deeplake-merge-issues.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "dXAaOeG6Fl-O", | |
"outputId": "8084f4d6-20c8-4b62-d506-5da89aa04082" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", | |
"Collecting deeplake==3.1.0\n", | |
" Downloading deeplake-3.1.0.tar.gz (372 kB)\n", | |
"\u001b[K |████████████████████████████████| 372 kB 5.4 MB/s \n", | |
"\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from deeplake==3.1.0) (1.21.6)\n", | |
"Requirement already satisfied: pillow in /usr/local/lib/python3.7/dist-packages (from deeplake==3.1.0) (7.1.2)\n", | |
"Collecting boto3\n", | |
" Downloading boto3-1.26.13-py3-none-any.whl (132 kB)\n", | |
"\u001b[K |████████████████████████████████| 132 kB 39.0 MB/s \n", | |
"\u001b[?25hRequirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from deeplake==3.1.0) (7.1.2)\n", | |
"Collecting pathos\n", | |
" Downloading pathos-0.3.0-py3-none-any.whl (79 kB)\n", | |
"\u001b[K |████████████████████████████████| 79 kB 8.2 MB/s \n", | |
"\u001b[?25hCollecting humbug>=0.2.6\n", | |
" Downloading humbug-0.2.7-py3-none-any.whl (11 kB)\n", | |
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from deeplake==3.1.0) (4.64.1)\n", | |
"Collecting numcodecs\n", | |
" Downloading numcodecs-0.10.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)\n", | |
"\u001b[K |████████████████████████████████| 6.6 MB 55.2 MB/s \n", | |
"\u001b[?25hCollecting pyjwt\n", | |
" Downloading PyJWT-2.6.0-py3-none-any.whl (20 kB)\n", | |
"Collecting hub>=2.8.7\n", | |
" Downloading hub-3.0.1-py3-none-any.whl (1.4 kB)\n", | |
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from humbug>=0.2.6->deeplake==3.1.0) (2.23.0)\n", | |
"Collecting s3transfer<0.7.0,>=0.6.0\n", | |
" Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)\n", | |
"\u001b[K |████████████████████████████████| 79 kB 6.9 MB/s \n", | |
"\u001b[?25hCollecting botocore<1.30.0,>=1.29.13\n", | |
" Downloading botocore-1.29.13-py3-none-any.whl (9.9 MB)\n", | |
"\u001b[K |████████████████████████████████| 9.9 MB 42.0 MB/s \n", | |
"\u001b[?25hCollecting jmespath<2.0.0,>=0.7.1\n", | |
" Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n", | |
"Collecting urllib3<1.27,>=1.25.4\n", | |
" Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)\n", | |
"\u001b[K |████████████████████████████████| 140 kB 50.3 MB/s \n", | |
"\u001b[?25hRequirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.7/dist-packages (from botocore<1.30.0,>=1.29.13->boto3->deeplake==3.1.0) (2.8.2)\n", | |
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.30.0,>=1.29.13->boto3->deeplake==3.1.0) (1.15.0)\n", | |
"Requirement already satisfied: entrypoints in /usr/local/lib/python3.7/dist-packages (from numcodecs->deeplake==3.1.0) (0.4)\n", | |
"Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from numcodecs->deeplake==3.1.0) (4.1.1)\n", | |
"Collecting pox>=0.3.2\n", | |
" Downloading pox-0.3.2-py3-none-any.whl (29 kB)\n", | |
"Collecting multiprocess>=0.70.14\n", | |
" Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)\n", | |
"\u001b[K |████████████████████████████████| 115 kB 11.6 MB/s \n", | |
"\u001b[?25hCollecting ppft>=1.7.6.6\n", | |
" Downloading ppft-1.7.6.6-py3-none-any.whl (52 kB)\n", | |
"\u001b[K |████████████████████████████████| 52 kB 1.1 MB/s \n", | |
"\u001b[?25hRequirement already satisfied: dill>=0.3.6 in /usr/local/lib/python3.7/dist-packages (from pathos->deeplake==3.1.0) (0.3.6)\n", | |
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->deeplake==3.1.0) (2.10)\n", | |
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->deeplake==3.1.0) (3.0.4)\n", | |
"Collecting urllib3<1.27,>=1.25.4\n", | |
" Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)\n", | |
"\u001b[K |████████████████████████████████| 127 kB 57.1 MB/s \n", | |
"\u001b[?25hRequirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->deeplake==3.1.0) (2022.9.24)\n", | |
"Building wheels for collected packages: deeplake\n", | |
" Building wheel for deeplake (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
" Created wheel for deeplake: filename=deeplake-3.1.0-py3-none-any.whl size=450912 sha256=054ca85337188db1b46a7cc3d26886b90e66c79b50ee3e9a7394a23e6754efd1\n", | |
" Stored in directory: /root/.cache/pip/wheels/6d/56/88/c401a766a03674c70c4d8a865b37fa3a8d36a6da8ec1b2844a\n", | |
"Successfully built deeplake\n", | |
"Installing collected packages: urllib3, jmespath, botocore, s3transfer, ppft, pox, multiprocess, pyjwt, pathos, numcodecs, humbug, hub, boto3, deeplake\n", | |
" Attempting uninstall: urllib3\n", | |
" Found existing installation: urllib3 1.24.3\n", | |
" Uninstalling urllib3-1.24.3:\n", | |
" Successfully uninstalled urllib3-1.24.3\n", | |
"Successfully installed boto3-1.26.13 botocore-1.29.13 deeplake-3.1.0 hub-3.0.1 humbug-0.2.7 jmespath-1.0.1 multiprocess-0.70.14 numcodecs-0.10.2 pathos-0.3.0 pox-0.3.2 ppft-1.7.6.6 pyjwt-2.6.0 s3transfer-0.6.0 urllib3-1.25.11\n" | |
] | |
} | |
], | |
"source": [ | |
"!pip install deeplake==3.1.0" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import deeplake\n", | |
"import numpy as np" | |
], | |
"metadata": { | |
"id": "4EcGwZ_eFv-r" | |
}, | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
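{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Text tensors are not merged\n", | |
"When two branches that both contain the same tensors are merged, some tensors are empty after the merge. In the example below, the `description` text tensor comes back empty for the samples added on `branch2`, while `images` and `labels` are merged correctly." | |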
], | |
"metadata": { | |
"id": "iLUyluhNIs0V" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"ds = deeplake.empty(\"/tmp/ds1\", overwrite=True)\n", | |
"\n", | |
"ds.create_tensor(\"images\", htype=\"image\", sample_compression=\"jpeg\")\n", | |
"ds.create_tensor(\"labels\", htype=\"class_label\")\n", | |
"ds.create_tensor(\"description\", htype=\"text\")\n", | |
"\n", | |
"root = ds.commit(\"Empty\")\n", | |
"\n", | |
"s = np.ones(shape=(3,3,1), dtype=np.uint8)\n", | |
"\n", | |
"ds.extend({\n", | |
" \"images\": [s.copy(), s.copy() * 2, s.copy() * 3],\n", | |
" \"labels\": [1,2,3],\n", | |
" \"description\": [\"img1\", \"img2\", \"img3\"],\n", | |
"})\n", | |
"ds.commit(\"Add img 1-3 to main\")\n", | |
"\n", | |
"ds.checkout(root)\n", | |
"ds.checkout(\"branch2\", create=True)\n", | |
"ds.extend({\n", | |
" \"images\": [s.copy() * 4, s.copy() * 5],\n", | |
" \"labels\": [4,5],\n", | |
" \"description\": [\"img4\", \"img5\"],\n", | |
"})\n", | |
"ds.commit(\"Add images 4-5 to branch2\")\n", | |
"\n", | |
"ds.checkout(\"main\")\n", | |
"ds.merge(\"branch2\")" | |
], | |
"metadata": { | |
"id": "DRgIUeqxF9ne" | |
}, | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"for sample in ds:\n", | |
" print(f\"Image value: {sample.images.numpy().mean()}\")\n", | |
" print(f\"Label value: {sample.labels.numpy()}\")\n", | |
" print(f\"Description: {sample.description.numpy()}\")\n", | |
" print(\"\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "HLOMBLExHsCn", | |
"outputId": "2ab11a56-dc42-4f67-f064-d95ec1a55704" | |
}, | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Image value: 1.0\n", | |
"Label value: [1]\n", | |
"Description: ['img1']\n", | |
"\n", | |
"Image value: 2.0\n", | |
"Label value: [2]\n", | |
"Description: ['img2']\n", | |
"\n", | |
"Image value: 3.0\n", | |
"Label value: [3]\n", | |
"Description: ['img3']\n", | |
"\n", | |
"Image value: 4.0\n", | |
"Label value: [4]\n", | |
"Description: ['']\n", | |
"\n", | |
"Image value: 5.0\n", | |
"Label value: [5]\n", | |
"Description: ['']\n", | |
"\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"ds.checkout(\"branch2\")\n", | |
"for sample in ds:\n", | |
" print(f\"Image value: {sample.images.numpy().mean()}\")\n", | |
" print(f\"Label value: {sample.labels.numpy()}\")\n", | |
" print(f\"Description: {sample.description.numpy()}\")\n", | |
" print(\"\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "RKYYO5ySICNA", | |
"outputId": "1f51d970-781a-4b5d-cb5c-2972c2d8d2f4" | |
}, | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Image value: 4.0\n", | |
"Label value: [4]\n", | |
"Description: ['img4']\n", | |
"\n", | |
"Image value: 5.0\n", | |
"Label value: [5]\n", | |
"Description: ['img5']\n", | |
"\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Removed samples are not removed after a merge\n", | |
"Data samples that are removed on a branch are not removed when that branch is merged.\n", | |
"\n", | |
"This might be a design decision: when merging, it is the current state of the source branch that is merged, not the changes/deltas made on that branch. This \"union\" behavior feels very unnatural for a \"merge\" and is not how it works in, e.g., git." | |
], | |
"metadata": { | |
"id": "L78vAjQQJPDr" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"ds = deeplake.empty(\"/tmp/ds1\", overwrite=True)\n", | |
"\n", | |
"ds.create_tensor(\"labels\", htype=\"class_label\")\n", | |
"\n", | |
"s = np.ones(shape=(3,3,1), dtype=np.uint8)\n", | |
"\n", | |
"ds.extend({\n", | |
" \"labels\": [1,2,3],\n", | |
"})\n", | |
"ds.commit(\"Add values 1-3 to main\")\n", | |
"\n", | |
"ds.checkout(\"branch2\", create=True)\n", | |
"ds.pop(0)\n", | |
"ds.commit(\"Removed value '1'\")\n", | |
"\n", | |
"ds.checkout(\"main\")\n", | |
"ds.merge(\"branch2\")" | |
], | |
"metadata": { | |
"id": "ed2I0vArImTL" | |
}, | |
"execution_count": 6, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"for sample in ds:\n", | |
" print(f\"Label value: {sample.labels.numpy()}\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "sc8mpO38JpkP", | |
"outputId": "8e7d6610-e99e-4589-9839-bd3da7d79470" | |
}, | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Label value: [1]\n", | |
"Label value: [2]\n", | |
"Label value: [3]\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"ds.commits" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "S9oOZ15VJ8BP", | |
"outputId": "5a3401c9-b990-4595-b46a-fde72504fd99" | |
}, | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"[{'commit': 'c521b58d5f1a011c31cd823bf72571facc84a1c0',\n", | |
" 'author': 'public',\n", | |
" 'time': '2022-11-20 09:30:02',\n", | |
" 'message': 'Merge 6a351c7388635a1bf1e2a26ee50c568704c80116 into main'},\n", | |
" {'commit': 'firstdbf9474d461a19e9333c2fd19b46115348f',\n", | |
" 'author': 'public',\n", | |
" 'time': '2022-11-20 09:30:01',\n", | |
" 'message': 'Add values 1-3 to main'}]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 8 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"ds.checkout(\"branch2\")\n", | |
"for sample in ds:\n", | |
" print(f\"Label value: {sample.labels.numpy()}\")" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "u4Z0UFVUJudy", | |
"outputId": "2c4f0a25-3638-45f1-9822-bbe5140bc461" | |
}, | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Label value: [2]\n", | |
"Label value: [3]\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"ds.commits" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "I0XAso2FJy8b", | |
"outputId": "4a098426-5645-4347-d436-ec75b3aebba8" | |
}, | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"[{'commit': '6a351c7388635a1bf1e2a26ee50c568704c80116',\n", | |
" 'author': 'public',\n", | |
" 'time': '2022-11-20 09:30:01',\n", | |
" 'message': \"Removed value '1'\"},\n", | |
" {'commit': 'firstdbf9474d461a19e9333c2fd19b46115348f',\n", | |
" 'author': 'public',\n", | |
" 'time': '2022-11-20 09:30:01',\n", | |
" 'message': 'Add values 1-3 to main'}]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 10 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "8BArGquQKBh0" | |
}, | |
"execution_count": 10, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment