zredlined/synthetic-data-uber-differential-privacy.ipynb Secret

## synthetic-data-uber-differential-privacy.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "name": "synthetic-data-uber-differential-privacy.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "toc_visible": true,
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "7036165ddd6e406289ddfa6610f1f19b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_9be48008d0754ac5ae4d33132c6cfa5e",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_367912fab4e0483a91c84d737d1a5803",
              "IPY_MODEL_9dabe6f558d74b3892d4da8b54a5e2b1"
            ]
          }
        },
        "9be48008d0754ac5ae4d33132c6cfa5e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "367912fab4e0483a91c84d737d1a5803": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_894fbbdae4fc487c884827dec7156adf",
            "_dom_classes": [],
            "description": "Valid record count : 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 27386,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 27386,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_5da2eda343bb49c0a5c3b84de38c0744"
          }
        },
        "9dabe6f558d74b3892d4da8b54a5e2b1": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_3c936bd172944555be41674808b7b1b7",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 27386/27386 [1:02:48&lt;00:00,  7.27it/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_5f0ac9107d52498fb3b6fa41ae98b363"
          }
        },
        "894fbbdae4fc487c884827dec7156adf": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "5da2eda343bb49c0a5c3b84de38c0744": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "3c936bd172944555be41674808b7b1b7": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "5f0ac9107d52498fb3b6fa41ae98b363": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "5142e259909d4b71bd49574b3d900d7d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_a3d407c6a9274a40b4ded83977f95740",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_9c497124023a4b28bcbe81f79f30cc06",
              "IPY_MODEL_a5fcfb4576f541919f5aea42c9c7ad41"
            ]
          }
        },
        "a3d407c6a9274a40b4ded83977f95740": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "9c497124023a4b28bcbe81f79f30cc06": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_111ff0d9947946c2a81899e410d99c1f",
            "_dom_classes": [],
            "description": "Invalid record count :   3%",
            "_model_name": "FloatProgressModel",
            "bar_style": "danger",
            "max": 100000,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 3111,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_93a45a83fd5f4099aaa91f2285482ab5"
          }
        },
        "a5fcfb4576f541919f5aea42c9c7ad41": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_fd1680c8bd28460d9acac29c6dbabcec",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 3111/100000.0 [1:02:48&lt;32:35:58,  1.21s/it]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_d76e9dbe30cb49c1ab947da70337d14d"
          }
        },
        "111ff0d9947946c2a81899e410d99c1f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "93a45a83fd5f4099aaa91f2285482ab5": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "fd1680c8bd28460d9acac29c6dbabcec": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "d76e9dbe30cb49c1ab947da70337d14d": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        }
      }
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/zredlined/7692fe2bb8999f933f64e62cf912245e/synthetic-data-uber-differential-privacy.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UTRxpSlaczHY"
      },
      "source": [
        "# A differentially private, synthetic ride-share dataset\n",
        "\n",
        "This blueprint utilizes Gretel's premium SDKs to create a synthetic version of your own data. Our SDKs create automatic data validators to help ensure the data generated has the same semantics as the source data. Additionally, the SDKs do autmoatic header clustering to help maintain statistical relations between columns."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VEM6kjRsczHd"
      },
      "source": [
        "%%capture\n",
        "\n",
        "!pip install -U gretel-client gretel-synthetics pandas"
      ],
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ZQ-TmAdwczHd",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "d7394061-8949-4565-9acc-c1641f0f0c1b"
      },
      "source": [
        "# Load your Gretel API key. You can acquire this from the Gretel Console \n",
        "# @ https://console.gretel.cloud\n",
        "\n",
        "import pandas as pd\n",
        "from gretel_client import get_cloud_client\n",
        "\n",
        "\n",
        "pd.set_option('max_colwidth', None)\n",
        "\n",
        "client = get_cloud_client(prefix=\"api\", api_key=\"prompt\")\n",
        "client.install_packages()"
      ],
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Enter Gretel API key: ··········\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "stream",
          "text": [
            "INFO pkg_installers.py: Authenticating with package manager\n",
            "INFO pkg_installers.py: Installing packages (this might take a while)\n",
            "ERROR pkg_installers.py: /usr/bin/python3 -m pip --disable-pip-version-check install https://gretel-opt-prod-usw2.s3.amazonaws.com/priv/pip/gretel-helpers/0.8.2/gretel_helpers-0.8.2-py3-none-any.whl?AWSAccessKeyId=ASIARC2BUADHWQNNPWPH&Signature=fgkcseuoHN8WT%2BRUHaDAPFiEJSQ%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKf%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJHMEUCIQD5i4JXTHxf76VePnP08YG6Do%2BoBXjAZ1%2BjKKESM1xlGgIgIgCUURqBPwxV8ObphN9BYd7ygpoXx0mTzJNvRyWqgTsq3gEIIBACGgwwNzQ3NjI2ODI1NzUiDJO7ocKPryKX3%2FZrGyq7AemytcJJVrrpUFgKRCtpEiqEHoH5IyGDSVicc5viQFcwfZu0%2FG6lcPnXjuSzNM6AjVgP1hJEklIOQ3NIFmSisbsID0Bw69p%2BbjHJD3AnQzVYjutX7HBU7Zt94mATAOP6TXXgrrCA8h2sh81YF2KrHjuAFWrdRCwu8VBtOmBJl2pOHI6otCVlXOB1%2Fd8ni2i513ZusjdzspFtzOgm5D%2FuDz2VBYOUsMuM5he2qUWTqmRNbvonGfaqyVNq%2BpcwreCXhAY64AEjU5Z7sXRgh3X7Ipe4FCpkxrNNwL5py8PgwN9tkxrqha1oTlpFZ5Fi%2FoY11GqNuCXMAriYcP6%2FgElrF61pEH9dDAPzeLk9j6wK6HI94%2Fp2%2FPJKg7V6YCcVmzfzrnc0fZ5iHMO2bJQ3tzfz2fqi0D5S%2FyGdK02PW%2BpxnpxR3d%2F7h8GM3RfMdpQhucRxUXAa0f%2FzNzPlO%2Bhm0tVMX9IvYSlTYShMu3zEpK%2FXXSO5BIsefakfgjvvbAFMfcx1fOjGP5W%2B34eRdLIbQPNZDIvQTcioWb7zdSZ07G5SkwXgz4sLNw%3D%3D&Expires=1619394274\n",
            "ERROR pkg_installers.py: ERROR: google-colab 1.0.0 has requirement requests~=2.23.0, but you'll have requests 2.25.1 which is incompatible.\n",
            "ERROR pkg_installers.py: \n",
            "INFO pkg_installers.py: Finished installing Gretel packages\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "YMg9nX6SczHe",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 419
        },
        "outputId": "5c9ac289-0493-4759-cb2f-8a6bda6656aa"
      },
      "source": [
        "# Load and preview dataset\n",
        "\n",
        "import logging\n",
        "import pandas as pd\n",
        "\n",
        "\n",
        "logging.basicConfig(level=logging.DEBUG)\n",
        "\n",
        "dataset_path = 'https://gretel-public-website.s3.amazonaws.com/datasets/uber_dataset_with_canaries.csv'\n",
        "training_df = pd.read_csv(dataset_path).round(5)\n",
        "training_df"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>hour</th>\n",
              "      <th>bike_id</th>\n",
              "      <th>src_lat</th>\n",
              "      <th>src_lon</th>\n",
              "      <th>dst_lat</th>\n",
              "      <th>dst_lon</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>23</td>\n",
              "      <td>27018</td>\n",
              "      <td>34.01698</td>\n",
              "      <td>-118.50102</td>\n",
              "      <td>34.0265</td>\n",
              "      <td>-118.49686</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>11</td>\n",
              "      <td>55026</td>\n",
              "      <td>47.55661</td>\n",
              "      <td>-122.2713</td>\n",
              "      <td>47.57012</td>\n",
              "      <td>-122.29086</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>21</td>\n",
              "      <td>50241</td>\n",
              "      <td>38.93048</td>\n",
              "      <td>-77.03244</td>\n",
              "      <td>38.94392</td>\n",
              "      <td>-77.03337</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>2</td>\n",
              "      <td>31898</td>\n",
              "      <td>37.79193</td>\n",
              "      <td>-122.40047</td>\n",
              "      <td>37.79389</td>\n",
              "      <td>-122.42464</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>16</td>\n",
              "      <td>XEY338</td>\n",
              "      <td>33.99552</td>\n",
              "      <td>-118.44952</td>\n",
              "      <td>34.00123</td>\n",
              "      <td>-118.43805</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>27381</th>\n",
              "      <td>20</td>\n",
              "      <td>XIT762</td>\n",
              "      <td>30.29091</td>\n",
              "      <td>-97.74907</td>\n",
              "      <td>30.29081</td>\n",
              "      <td>-97.74548</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>27382</th>\n",
              "      <td>7</td>\n",
              "      <td>BQN803</td>\n",
              "      <td>38.57168</td>\n",
              "      <td>-121.46315</td>\n",
              "      <td>38.56798</td>\n",
              "      <td>-121.46044</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>27383</th>\n",
              "      <td>0</td>\n",
              "      <td>GUF685</td>\n",
              "      <td>38.91714</td>\n",
              "      <td>-77.04085</td>\n",
              "      <td>38.92376</td>\n",
              "      <td>-77.04086</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>27384</th>\n",
              "      <td>21</td>\n",
              "      <td>SYR196</td>\n",
              "      <td>38.55243</td>\n",
              "      <td>-121.4696</td>\n",
              "      <td>38.57831</td>\n",
              "      <td>-121.48649</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>27385</th>\n",
              "      <td>4</td>\n",
              "      <td>PUD261</td>\n",
              "      <td>27.9742</td>\n",
              "      <td>-82.44629</td>\n",
              "      <td>27.97691</td>\n",
              "      <td>-82.44676</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>27386 rows × 6 columns</p>\n",
              "</div>"
            ],
            "text/plain": [
              "      hour  bike_id   src_lat     src_lon   dst_lat     dst_lon\n",
              "0       23    27018  34.01698  -118.50102   34.0265  -118.49686\n",
              "1       11    55026  47.55661   -122.2713  47.57012  -122.29086\n",
              "2       21    50241  38.93048   -77.03244  38.94392   -77.03337\n",
              "3        2    31898  37.79193  -122.40047  37.79389  -122.42464\n",
              "4       16   XEY338  33.99552  -118.44952  34.00123  -118.43805\n",
              "...    ...      ...       ...         ...       ...         ...\n",
              "27381   20   XIT762  30.29091   -97.74907  30.29081   -97.74548\n",
              "27382    7   BQN803  38.57168  -121.46315  38.56798  -121.46044\n",
              "27383    0   GUF685  38.91714   -77.04085  38.92376   -77.04086\n",
              "27384   21   SYR196  38.55243   -121.4696  38.57831  -121.48649\n",
              "27385    4   PUD261   27.9742   -82.44629  27.97691   -82.44676\n",
              "\n",
              "[27386 rows x 6 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 3
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "O4-E_F0qczHe"
      },
      "source": [
        "# Create the Gretel Synthtetics Training / Model Configuration\n",
        "\n",
        "from pathlib import Path\n",
        "\n",
        "checkpoint_dir = str(Path.cwd() / \"checkpoints-dp\")\n",
        "\n",
        "config_template = {\n",
        "    \"checkpoint_dir\": checkpoint_dir,\n",
        "    \"vocab_size\": 0,\n",
        "    \"epochs\": 50,\n",
        "    \"early_stopping\": True,\n",
        "    \"learning_rate\": 0.001,\n",
        "    \"rnn_units\": 256,\n",
        "    \"batch_size\": 4,\n",
        "    \"predict_batch_size\": 1,\n",
        "    \"dp\": True,\n",
        "    \"dp_noise_multiplier\": 0.001, # set low to demonstrate gradient clipping\n",
        "    \"dp_l2_norm_clip\": 1.5,\n",
        "    \"dp_microbatches\": 1,    \n",
        "    \"overwrite\": True\n",
        "}"
      ],
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CCW-JaiNczHf",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000,
          "referenced_widgets": [
            "7036165ddd6e406289ddfa6610f1f19b",
            "9be48008d0754ac5ae4d33132c6cfa5e",
            "367912fab4e0483a91c84d737d1a5803",
            "9dabe6f558d74b3892d4da8b54a5e2b1",
            "894fbbdae4fc487c884827dec7156adf",
            "5da2eda343bb49c0a5c3b84de38c0744",
            "3c936bd172944555be41674808b7b1b7",
            "5f0ac9107d52498fb3b6fa41ae98b363",
            "5142e259909d4b71bd49574b3d900d7d",
            "a3d407c6a9274a40b4ded83977f95740",
            "9c497124023a4b28bcbe81f79f30cc06",
            "a5fcfb4576f541919f5aea42c9c7ad41",
            "111ff0d9947946c2a81899e410d99c1f",
            "93a45a83fd5f4099aaa91f2285482ab5",
            "fd1680c8bd28460d9acac29c6dbabcec",
            "d76e9dbe30cb49c1ab947da70337d14d"
          ]
        },
        "outputId": "cec66ac5-4896-4e8d-cb87-83c245f2c5d1"
      },
      "source": [
        "# Create a Gretel Synthetic Data Bundle\n",
        "\n",
        "try:\n",
        "    # Capture transient import errors in Google Colab\n",
        "    from gretel_helpers.series_models import SeriesModel\n",
        "except FileNotFoundError:\n",
        "    from gretel_helpers.series_models import SeriesModel\n",
        "    \n",
        "\n",
        "# Use these values as a prompt to seed each record versus random generation\n",
        "seed_columns = [\"hour\", \"bike_id\"]\n",
        "\n",
        "model = SeriesModel(\n",
        "    training_df=training_df,\n",
        "    seed_columns=seed_columns,\n",
        "    synthetic_config=config_template\n",
        ")\n",
        "\n",
        "model.train()\n",
        "model.generate(max_invalid=1e5)"
      ],
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "INFO model.py: Detecting record field delimiter...\n",
            "INFO model.py: Analyzing DataFrame for optimal column batches and ordering...\n",
            "INFO model.py: Creating model and data storage directories...\n",
            "INFO batch.py: Creating directory structure for batch jobs...\n",
            "INFO model.py: Generating training data from source dataset...\n",
            "INFO batch.py: Generating training DF and CSV for batch 0\n",
            "INFO model.py: Creating data validators...\n",
            "INFO model.py: Creating validator for synthetic batch 0\n",
            "\n",
            "\n",
            "  0%|          | 0/27386 [00:00<?, ?it/s]\u001b[A\u001b[A\n",
            "\n",
            " 43%|████▎     | 11657/27386 [00:00<00:00, 116567.33it/s]\u001b[A\u001b[A\n",
            "\n",
            "100%|██████████| 27386/27386 [00:00<00:00, 108339.90it/s]\n",
            "WARNING dp_model.py: Experimental: Differentially private training enabled\n",
            "WARNING dp_model.py: ******* Patching TensorFlow to utilize new Keras code paths, see: https://github.com/tensorflow/tensorflow/issues/44917 *******\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "stream",
          "text": [
            "Model: \"sequential_2\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding_2 (Embedding)      (4, None, 256)            16896     \n",
            "_________________________________________________________________\n",
            "dropout_6 (Dropout)          (4, None, 256)            0         \n",
            "_________________________________________________________________\n",
            "lstm_4 (LSTM)                (4, None, 256)            525312    \n",
            "_________________________________________________________________\n",
            "dropout_7 (Dropout)          (4, None, 256)            0         \n",
            "_________________________________________________________________\n",
            "lstm_5 (LSTM)                (4, None, 256)            525312    \n",
            "_________________________________________________________________\n",
            "dropout_8 (Dropout)          (4, None, 256)            0         \n",
            "_________________________________________________________________\n",
            "dense_2 (Dense)              (4, None, 66)             16962     \n",
            "=================================================================\n",
            "Total params: 1,084,482\n",
            "Trainable params: 1,084,482\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "None\n",
            "Epoch 1/50\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "stream",
          "text": [
            "WARNING control_flow_ops.py: Converting IndexedSlices(indices=Tensor(\"gradient_tape/sequential_2/embedding_2/embedding_lookup/Reshape_1:0\", shape=(400,), dtype=int32), values=Tensor(\"gradient_tape/sequential_2/embedding_2/embedding_lookup/Reshape:0\", shape=(400, 256), dtype=float32), dense_shape=Tensor(\"gradient_tape/sequential_2/embedding_2/embedding_lookup/VariableShape:0\", shape=(2,), dtype=int32)) to a dense representation may make it slow. Alternatively, output the indices and values of the IndexedSlices separately, and handle the vectorized outputs directly.\n",
            "WARNING pfor.py: Using a while_loop for converting StatelessCase\n",
            "WARNING pfor.py: Using a while_loop for converting StatelessCase\n",
            "WARNING control_flow_ops.py: Converting IndexedSlices(indices=Tensor(\"gradient_tape/sequential_2/embedding_2/embedding_lookup/Reshape_1:0\", shape=(400,), dtype=int32), values=Tensor(\"gradient_tape/sequential_2/embedding_2/embedding_lookup/Reshape:0\", shape=(400, 256), dtype=float32), dense_shape=Tensor(\"gradient_tape/sequential_2/embedding_2/embedding_lookup/VariableShape:0\", shape=(2,), dtype=int32)) to a dense representation may make it slow. Alternatively, output the indices and values of the IndexedSlices separately, and handle the vectorized outputs directly.\n",
            "WARNING pfor.py: Using a while_loop for converting StatelessCase\n",
            "WARNING pfor.py: Using a while_loop for converting StatelessCase\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "stream",
          "text": [
            "3310/3310 [==============================] - 57s 16ms/step - loss: 2.2035 - accuracy: 0.2866\n",
            "Epoch 2/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.3461 - accuracy: 0.5335\n",
            "Epoch 3/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.2719 - accuracy: 0.5525\n",
            "Epoch 4/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.2480 - accuracy: 0.5586\n",
            "Epoch 5/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.2360 - accuracy: 0.5615\n",
            "Epoch 6/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.2274 - accuracy: 0.5642\n",
            "Epoch 7/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.2224 - accuracy: 0.5651\n",
            "Epoch 8/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.2160 - accuracy: 0.5670\n",
            "Epoch 9/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.2139 - accuracy: 0.5675\n",
            "Epoch 10/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.2124 - accuracy: 0.5679\n",
            "Epoch 11/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.2093 - accuracy: 0.5692\n",
            "Epoch 12/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.2092 - accuracy: 0.5698\n",
            "Epoch 13/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.2051 - accuracy: 0.5704\n",
            "Epoch 14/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.2034 - accuracy: 0.5710\n",
            "Epoch 15/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.2017 - accuracy: 0.5716\n",
            "Epoch 16/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.2004 - accuracy: 0.5726\n",
            "Epoch 17/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1996 - accuracy: 0.5719\n",
            "Epoch 18/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1988 - accuracy: 0.5727\n",
            "Epoch 19/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1983 - accuracy: 0.5723\n",
            "Epoch 20/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1977 - accuracy: 0.5724\n",
            "Epoch 21/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1974 - accuracy: 0.5728\n",
            "Epoch 22/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1941 - accuracy: 0.5737\n",
            "Epoch 23/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1956 - accuracy: 0.5734\n",
            "Epoch 24/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1949 - accuracy: 0.5740\n",
            "Epoch 25/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1934 - accuracy: 0.5741\n",
            "Epoch 26/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1927 - accuracy: 0.5737\n",
            "Epoch 27/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1922 - accuracy: 0.5742\n",
            "Epoch 28/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1912 - accuracy: 0.5745\n",
            "Epoch 29/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1931 - accuracy: 0.5744\n",
            "Epoch 30/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1928 - accuracy: 0.5737\n",
            "Epoch 31/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1923 - accuracy: 0.5751\n",
            "Epoch 32/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1919 - accuracy: 0.5743\n",
            "Epoch 33/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1918 - accuracy: 0.5748\n",
            "Epoch 34/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1914 - accuracy: 0.5746\n",
            "Epoch 35/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1902 - accuracy: 0.5747\n",
            "Epoch 36/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1906 - accuracy: 0.5747\n",
            "Epoch 37/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1899 - accuracy: 0.5747\n",
            "Epoch 38/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1894 - accuracy: 0.5751\n",
            "Epoch 39/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1893 - accuracy: 0.5751\n",
            "Epoch 40/50\n",
            "3310/3310 [==============================] - 54s 16ms/step - loss: 1.1876 - accuracy: 0.5758\n",
            "Epoch 41/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1901 - accuracy: 0.5749\n",
            "Epoch 42/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1890 - accuracy: 0.5750\n",
            "Epoch 43/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1893 - accuracy: 0.5748\n",
            "Epoch 44/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1890 - accuracy: 0.5746\n",
            "Epoch 45/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1888 - accuracy: 0.5756\n",
            "Epoch 46/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1895 - accuracy: 0.5744\n",
            "Epoch 47/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1884 - accuracy: 0.5746\n",
            "Epoch 48/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1883 - accuracy: 0.5746\n",
            "Epoch 49/50\n",
            "3310/3310 [==============================] - 53s 16ms/step - loss: 1.1878 - accuracy: 0.5751\n",
            "Epoch 50/50\n",
            "3310/3310 [==============================] - 54s 16ms/step - loss: 1.1886 - accuracy: 0.5749\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "7036165ddd6e406289ddfa6610f1f19b",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Valid record count ', max=27386.0, style=ProgressStyle(de…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "5142e259909d4b71bd49574b3d900d7d",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Invalid record count ', max=100000.0, style=ProgressStyle…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "WARNING dp_model.py: Experimental: Differentially private training enabled\n",
            "WARNING dp_model.py: ******* Patching TensorFlow to utilize new Keras code paths, see: https://github.com/tensorflow/tensorflow/issues/44917 *******\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "stream",
          "text": [
            "Model: \"sequential_3\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding_3 (Embedding)      (1, None, 256)            16896     \n",
            "_________________________________________________________________\n",
            "dropout_9 (Dropout)          (1, None, 256)            0         \n",
            "_________________________________________________________________\n",
            "lstm_6 (LSTM)                (1, None, 256)            525312    \n",
            "_________________________________________________________________\n",
            "dropout_10 (Dropout)         (1, None, 256)            0         \n",
            "_________________________________________________________________\n",
            "lstm_7 (LSTM)                (1, None, 256)            525312    \n",
            "_________________________________________________________________\n",
            "dropout_11 (Dropout)         (1, None, 256)            0         \n",
            "_________________________________________________________________\n",
            "dense_3 (Dense)              (1, None, 66)             16962     \n",
            "=================================================================\n",
            "Total params: 1,084,482\n",
            "Trainable params: 1,084,482\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "None\n",
            "Model: \"sequential_3\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding_3 (Embedding)      (1, None, 256)            16896     \n",
            "_________________________________________________________________\n",
            "dropout_9 (Dropout)          (1, None, 256)            0         \n",
            "_________________________________________________________________\n",
            "lstm_6 (LSTM)                (1, None, 256)            525312    \n",
            "_________________________________________________________________\n",
            "dropout_10 (Dropout)         (1, None, 256)            0         \n",
            "_________________________________________________________________\n",
            "lstm_7 (LSTM)                (1, None, 256)            525312    \n",
            "_________________________________________________________________\n",
            "dropout_11 (Dropout)         (1, None, 256)            0         \n",
            "_________________________________________________________________\n",
            "dense_3 (Dense)              (1, None, 66)             16962     \n",
            "=================================================================\n",
            "Total params: 1,084,482\n",
            "Trainable params: 1,084,482\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "\n",
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<gretel_helpers.series_models.SeriesModel at 0x7fa1459a00d0>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 7
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "srW1HBA-d3Mp"
      },
      "source": [
        "# Fetch the synthetic DataFrame from the model and save it to CSV\n",
        "\n",
        "df = model.df\n",
        "df.to_csv('synthetic-data.csv', index=False)"
      ],
      "execution_count": 13,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "W5BhlCaoKGhn",
        "outputId": "6d7b7936-3e7b-4ae4-d652-98553453f107"
      },
      "source": [
        "# Canary values to look for in the training and synthetic data\n",
        "secrets = [85.31243, 80.71705, 84.98992, 63.20242]\n",
        "\n",
        "\n",
        "def find_canaries(df, secrets):\n",
        "    \"\"\"Print how many times each secret occurs in the text rendering of ``df``.\n",
        "\n",
        "    Each secret is converted with ``str`` and counted as a substring of\n",
        "    ``df.to_string()``. Nothing is returned; the counts are only printed.\n",
        "    \"\"\"\n",
        "    rendered = df.to_string()\n",
        "    for canary in secrets:\n",
        "        occurrences = rendered.count(str(canary))\n",
        "        print(f\"secret {canary} : found {occurrences} times\")\n",
        "\n",
        "\n",
        "# Compare canary counts in the training data against the synthetic output\n",
        "print(\"searching for canaries in training set...\")\n",
        "find_canaries(training_df, secrets)\n",
        "print(\"searching for canaries in synthetic set...\")\n",
        "find_canaries(df, secrets)\n"
      ],
      "execution_count": 26,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "searching for canaries in training set...\n",
            "secret 85.31243 : found 7 times\n",
            "secret 80.71705 : found 30 times\n",
            "secret 84.98992 : found 93 times\n",
            "secret 63.20242 : found 141 times\n",
            "searching for canaries in synthetic set...\n",
            "secret 85.31243 : found 0 times\n",
            "secret 80.71705 : found 0 times\n",
            "secret 84.98992 : found 0 times\n",
            "secret 63.20242 : found 0 times\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}