Skip to content

Instantly share code, notes, and snippets.

@daniel-falk
Created November 20, 2022 09:39
Show Gist options
  • Save daniel-falk/623253501c4ef7dadd38a853ca532a57 to your computer and use it in GitHub Desktop.
Save daniel-falk/623253501c4ef7dadd38a853ca532a57 to your computer and use it in GitHub Desktop.
deeplake-merge-issues.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyPSh8WsdwjAe0GoAoOUFjuJ",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/daniel-falk/623253501c4ef7dadd38a853ca532a57/deeplake-merge-issues.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dXAaOeG6Fl-O",
"outputId": "8084f4d6-20c8-4b62-d506-5da89aa04082"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting deeplake==3.1.0\n",
" Downloading deeplake-3.1.0.tar.gz (372 kB)\n",
"\u001b[K |████████████████████████████████| 372 kB 5.4 MB/s \n",
"\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from deeplake==3.1.0) (1.21.6)\n",
"Requirement already satisfied: pillow in /usr/local/lib/python3.7/dist-packages (from deeplake==3.1.0) (7.1.2)\n",
"Collecting boto3\n",
" Downloading boto3-1.26.13-py3-none-any.whl (132 kB)\n",
"\u001b[K |████████████████████████████████| 132 kB 39.0 MB/s \n",
"\u001b[?25hRequirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from deeplake==3.1.0) (7.1.2)\n",
"Collecting pathos\n",
" Downloading pathos-0.3.0-py3-none-any.whl (79 kB)\n",
"\u001b[K |████████████████████████████████| 79 kB 8.2 MB/s \n",
"\u001b[?25hCollecting humbug>=0.2.6\n",
" Downloading humbug-0.2.7-py3-none-any.whl (11 kB)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from deeplake==3.1.0) (4.64.1)\n",
"Collecting numcodecs\n",
" Downloading numcodecs-0.10.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)\n",
"\u001b[K |████████████████████████████████| 6.6 MB 55.2 MB/s \n",
"\u001b[?25hCollecting pyjwt\n",
" Downloading PyJWT-2.6.0-py3-none-any.whl (20 kB)\n",
"Collecting hub>=2.8.7\n",
" Downloading hub-3.0.1-py3-none-any.whl (1.4 kB)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from humbug>=0.2.6->deeplake==3.1.0) (2.23.0)\n",
"Collecting s3transfer<0.7.0,>=0.6.0\n",
" Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)\n",
"\u001b[K |████████████████████████████████| 79 kB 6.9 MB/s \n",
"\u001b[?25hCollecting botocore<1.30.0,>=1.29.13\n",
" Downloading botocore-1.29.13-py3-none-any.whl (9.9 MB)\n",
"\u001b[K |████████████████████████████████| 9.9 MB 42.0 MB/s \n",
"\u001b[?25hCollecting jmespath<2.0.0,>=0.7.1\n",
" Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n",
"Collecting urllib3<1.27,>=1.25.4\n",
" Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)\n",
"\u001b[K |████████████████████████████████| 140 kB 50.3 MB/s \n",
"\u001b[?25hRequirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.7/dist-packages (from botocore<1.30.0,>=1.29.13->boto3->deeplake==3.1.0) (2.8.2)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.30.0,>=1.29.13->boto3->deeplake==3.1.0) (1.15.0)\n",
"Requirement already satisfied: entrypoints in /usr/local/lib/python3.7/dist-packages (from numcodecs->deeplake==3.1.0) (0.4)\n",
"Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from numcodecs->deeplake==3.1.0) (4.1.1)\n",
"Collecting pox>=0.3.2\n",
" Downloading pox-0.3.2-py3-none-any.whl (29 kB)\n",
"Collecting multiprocess>=0.70.14\n",
" Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)\n",
"\u001b[K |████████████████████████████████| 115 kB 11.6 MB/s \n",
"\u001b[?25hCollecting ppft>=1.7.6.6\n",
" Downloading ppft-1.7.6.6-py3-none-any.whl (52 kB)\n",
"\u001b[K |████████████████████████████████| 52 kB 1.1 MB/s \n",
"\u001b[?25hRequirement already satisfied: dill>=0.3.6 in /usr/local/lib/python3.7/dist-packages (from pathos->deeplake==3.1.0) (0.3.6)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->deeplake==3.1.0) (2.10)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->deeplake==3.1.0) (3.0.4)\n",
"Collecting urllib3<1.27,>=1.25.4\n",
" Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)\n",
"\u001b[K |████████████████████████████████| 127 kB 57.1 MB/s \n",
"\u001b[?25hRequirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->humbug>=0.2.6->deeplake==3.1.0) (2022.9.24)\n",
"Building wheels for collected packages: deeplake\n",
" Building wheel for deeplake (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for deeplake: filename=deeplake-3.1.0-py3-none-any.whl size=450912 sha256=054ca85337188db1b46a7cc3d26886b90e66c79b50ee3e9a7394a23e6754efd1\n",
" Stored in directory: /root/.cache/pip/wheels/6d/56/88/c401a766a03674c70c4d8a865b37fa3a8d36a6da8ec1b2844a\n",
"Successfully built deeplake\n",
"Installing collected packages: urllib3, jmespath, botocore, s3transfer, ppft, pox, multiprocess, pyjwt, pathos, numcodecs, humbug, hub, boto3, deeplake\n",
" Attempting uninstall: urllib3\n",
" Found existing installation: urllib3 1.24.3\n",
" Uninstalling urllib3-1.24.3:\n",
" Successfully uninstalled urllib3-1.24.3\n",
"Successfully installed boto3-1.26.13 botocore-1.29.13 deeplake-3.1.0 hub-3.0.1 humbug-0.2.7 jmespath-1.0.1 multiprocess-0.70.14 numcodecs-0.10.2 pathos-0.3.0 pox-0.3.2 ppft-1.7.6.6 pyjwt-2.6.0 s3transfer-0.6.0 urllib3-1.25.11\n"
]
}
],
"source": [
"!pip install deeplake==3.1.0"
]
},
{
"cell_type": "code",
"source": [
"import deeplake\n",
"import numpy as np"
],
"metadata": {
"id": "4EcGwZ_eFv-r"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Text tensors are not merged\n",
"When two branches that both contain the same tensors are merged, some tensors come out empty after the merge: below, the `description` text tensor is empty for the samples merged in from `branch2`."
],
"metadata": {
"id": "iLUyluhNIs0V"
}
},
{
"cell_type": "code",
"source": [
"ds = deeplake.empty(\"/tmp/ds1\", overwrite=True)\n",
"\n",
"ds.create_tensor(\"images\", htype=\"image\", sample_compression=\"jpeg\")\n",
"ds.create_tensor(\"labels\", htype=\"class_label\")\n",
"ds.create_tensor(\"description\", htype=\"text\")\n",
"\n",
"root = ds.commit(\"Empty\")\n",
"\n",
"s = np.ones(shape=(3,3,1), dtype=np.uint8)\n",
"\n",
"ds.extend({\n",
" \"images\": [s.copy(), s.copy() * 2, s.copy() * 3],\n",
" \"labels\": [1,2,3],\n",
" \"description\": [\"img1\", \"img2\", \"img3\"],\n",
"})\n",
"ds.commit(\"Add img 1-3 to main\")\n",
"\n",
"ds.checkout(root)\n",
"ds.checkout(\"branch2\", create=True)\n",
"ds.extend({\n",
" \"images\": [s.copy() * 4, s.copy() * 5],\n",
" \"labels\": [4,5],\n",
" \"description\": [\"img4\", \"img5\"],\n",
"})\n",
"ds.commit(\"Add images 4-5 to branch2\")\n",
"\n",
"ds.checkout(\"main\")\n",
"ds.merge(\"branch2\")"
],
"metadata": {
"id": "DRgIUeqxF9ne"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"for sample in ds:\n",
" print(f\"Image value: {sample.images.numpy().mean()}\")\n",
" print(f\"Label value: {sample.labels.numpy()}\")\n",
" print(f\"Description: {sample.description.numpy()}\")\n",
" print(\"\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "HLOMBLExHsCn",
"outputId": "2ab11a56-dc42-4f67-f064-d95ec1a55704"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Image value: 1.0\n",
"Label value: [1]\n",
"Description: ['img1']\n",
"\n",
"Image value: 2.0\n",
"Label value: [2]\n",
"Description: ['img2']\n",
"\n",
"Image value: 3.0\n",
"Label value: [3]\n",
"Description: ['img3']\n",
"\n",
"Image value: 4.0\n",
"Label value: [4]\n",
"Description: ['']\n",
"\n",
"Image value: 5.0\n",
"Label value: [5]\n",
"Description: ['']\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"ds.checkout(\"branch2\")\n",
"for sample in ds:\n",
" print(f\"Image value: {sample.images.numpy().mean()}\")\n",
" print(f\"Label value: {sample.labels.numpy()}\")\n",
" print(f\"Description: {sample.description.numpy()}\")\n",
" print(\"\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RKYYO5ySICNA",
"outputId": "1f51d970-781a-4b5d-cb5c-2972c2d8d2f4"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Image value: 4.0\n",
"Label value: [4]\n",
"Description: ['img4']\n",
"\n",
"Image value: 5.0\n",
"Label value: [5]\n",
"Description: ['img5']\n",
"\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Removed samples are not removed after a merge\n",
"Data samples that are removed on a branch do not get removed when that branch is merged.\n",
"\n",
"This might be a design decision: when merging, it is the current state of the source branch that is merged, not the changes/deltas made on that branch. This \"union\" behavior feels very unnatural for a \"merge\" and is not how it works in, e.g., git."
],
"metadata": {
"id": "L78vAjQQJPDr"
}
},
{
"cell_type": "code",
"source": [
"ds = deeplake.empty(\"/tmp/ds1\", overwrite=True)\n",
"\n",
"ds.create_tensor(\"labels\", htype=\"class_label\")\n",
"\n",
"s = np.ones(shape=(3,3,1), dtype=np.uint8)\n",
"\n",
"ds.extend({\n",
" \"labels\": [1,2,3],\n",
"})\n",
"ds.commit(\"Add values 1-3 to main\")\n",
"\n",
"ds.checkout(\"branch2\", create=True)\n",
"ds.pop(0)\n",
"ds.commit(\"Removed value '1'\")\n",
"\n",
"ds.checkout(\"main\")\n",
"ds.merge(\"branch2\")"
],
"metadata": {
"id": "ed2I0vArImTL"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"for sample in ds:\n",
" print(f\"Label value: {sample.labels.numpy()}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "sc8mpO38JpkP",
"outputId": "8e7d6610-e99e-4589-9839-bd3da7d79470"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Label value: [1]\n",
"Label value: [2]\n",
"Label value: [3]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"ds.commits"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "S9oOZ15VJ8BP",
"outputId": "5a3401c9-b990-4595-b46a-fde72504fd99"
},
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'commit': 'c521b58d5f1a011c31cd823bf72571facc84a1c0',\n",
" 'author': 'public',\n",
" 'time': '2022-11-20 09:30:02',\n",
" 'message': 'Merge 6a351c7388635a1bf1e2a26ee50c568704c80116 into main'},\n",
" {'commit': 'firstdbf9474d461a19e9333c2fd19b46115348f',\n",
" 'author': 'public',\n",
" 'time': '2022-11-20 09:30:01',\n",
" 'message': 'Add values 1-3 to main'}]"
]
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"source": [
"ds.checkout(\"branch2\")\n",
"for sample in ds:\n",
" print(f\"Label value: {sample.labels.numpy()}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "u4Z0UFVUJudy",
"outputId": "2c4f0a25-3638-45f1-9822-bbe5140bc461"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Label value: [2]\n",
"Label value: [3]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"ds.commits"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "I0XAso2FJy8b",
"outputId": "4a098426-5645-4347-d436-ec75b3aebba8"
},
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'commit': '6a351c7388635a1bf1e2a26ee50c568704c80116',\n",
" 'author': 'public',\n",
" 'time': '2022-11-20 09:30:01',\n",
" 'message': \"Removed value '1'\"},\n",
" {'commit': 'firstdbf9474d461a19e9333c2fd19b46115348f',\n",
" 'author': 'public',\n",
" 'time': '2022-11-20 09:30:01',\n",
" 'message': 'Add values 1-3 to main'}]"
]
},
"metadata": {},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "8BArGquQKBh0"
},
"execution_count": 10,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment