Skip to content

Instantly share code, notes, and snippets.

@zredlined
Last active April 26, 2021 05:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zredlined/7692fe2bb8999f933f64e62cf912245e to your computer and use it in GitHub Desktop.
Save zredlined/7692fe2bb8999f933f64e62cf912245e to your computer and use it in GitHub Desktop.
synthetic-data-uber-differential-privacy.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"accelerator": "GPU",
"colab": {
"name": "synthetic-data-uber-differential-privacy.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true,
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"7036165ddd6e406289ddfa6610f1f19b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_9be48008d0754ac5ae4d33132c6cfa5e",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_367912fab4e0483a91c84d737d1a5803",
"IPY_MODEL_9dabe6f558d74b3892d4da8b54a5e2b1"
]
}
},
"9be48008d0754ac5ae4d33132c6cfa5e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"367912fab4e0483a91c84d737d1a5803": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_894fbbdae4fc487c884827dec7156adf",
"_dom_classes": [],
"description": "Valid record count : 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 27386,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 27386,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_5da2eda343bb49c0a5c3b84de38c0744"
}
},
"9dabe6f558d74b3892d4da8b54a5e2b1": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_3c936bd172944555be41674808b7b1b7",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 27386/27386 [1:02:48<00:00, 7.27it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_5f0ac9107d52498fb3b6fa41ae98b363"
}
},
"894fbbdae4fc487c884827dec7156adf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"5da2eda343bb49c0a5c3b84de38c0744": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"3c936bd172944555be41674808b7b1b7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"5f0ac9107d52498fb3b6fa41ae98b363": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"5142e259909d4b71bd49574b3d900d7d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_a3d407c6a9274a40b4ded83977f95740",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_9c497124023a4b28bcbe81f79f30cc06",
"IPY_MODEL_a5fcfb4576f541919f5aea42c9c7ad41"
]
}
},
"a3d407c6a9274a40b4ded83977f95740": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"9c497124023a4b28bcbe81f79f30cc06": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_111ff0d9947946c2a81899e410d99c1f",
"_dom_classes": [],
"description": "Invalid record count : 3%",
"_model_name": "FloatProgressModel",
"bar_style": "danger",
"max": 100000,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 3111,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_93a45a83fd5f4099aaa91f2285482ab5"
}
},
"a5fcfb4576f541919f5aea42c9c7ad41": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_fd1680c8bd28460d9acac29c6dbabcec",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 3111/100000.0 [1:02:48<32:35:58, 1.21s/it]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_d76e9dbe30cb49c1ab947da70337d14d"
}
},
"111ff0d9947946c2a81899e410d99c1f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"93a45a83fd5f4099aaa91f2285482ab5": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"fd1680c8bd28460d9acac29c6dbabcec": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"d76e9dbe30cb49c1ab947da70337d14d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/zredlined/7692fe2bb8999f933f64e62cf912245e/synthetic-data-uber-differential-privacy.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UTRxpSlaczHY"
},
"source": [
"# A differentially private, synthetic ride-share dataset\n",
"\n",
"This blueprint utilizes Gretel's premium SDKs to create a synthetic version of your own data. Our SDKs create automatic data validators to help ensure the data generated has the same semantics as the source data. Additionally, the SDKs perform automatic header clustering to help maintain statistical relations between columns."
]
},
{
"cell_type": "code",
"metadata": {
"id": "VEM6kjRsczHd"
},
"source": [
"%%capture\n",
"\n",
"!pip install -U gretel-client gretel-synthetics pandas"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ZQ-TmAdwczHd",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "d7394061-8949-4565-9acc-c1641f0f0c1b"
},
"source": [
"# Load your Gretel API key. You can acquire this from the Gretel Console \n",
"# @ https://console.gretel.cloud\n",
"\n",
"import pandas as pd\n",
"from gretel_client import get_cloud_client\n",
"\n",
"\n",
"pd.set_option('max_colwidth', None)\n",
"\n",
"client = get_cloud_client(prefix=\"api\", api_key=\"prompt\")\n",
"client.install_packages()"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"Enter Gretel API key: ··········\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"INFO pkg_installers.py: Authenticating with package manager\n",
"INFO pkg_installers.py: Installing packages (this might take a while)\n",
"ERROR pkg_installers.py: /usr/bin/python3 -m pip --disable-pip-version-check install https://gretel-opt-prod-usw2.s3.amazonaws.com/priv/pip/gretel-helpers/0.8.2/gretel_helpers-0.8.2-py3-none-any.whl?AWSAccessKeyId=ASIARC2BUADHWQNNPWPH&Signature=fgkcseuoHN8WT%2BRUHaDAPFiEJSQ%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKf%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJHMEUCIQD5i4JXTHxf76VePnP08YG6Do%2BoBXjAZ1%2BjKKESM1xlGgIgIgCUURqBPwxV8ObphN9BYd7ygpoXx0mTzJNvRyWqgTsq3gEIIBACGgwwNzQ3NjI2ODI1NzUiDJO7ocKPryKX3%2FZrGyq7AemytcJJVrrpUFgKRCtpEiqEHoH5IyGDSVicc5viQFcwfZu0%2FG6lcPnXjuSzNM6AjVgP1hJEklIOQ3NIFmSisbsID0Bw69p%2BbjHJD3AnQzVYjutX7HBU7Zt94mATAOP6TXXgrrCA8h2sh81YF2KrHjuAFWrdRCwu8VBtOmBJl2pOHI6otCVlXOB1%2Fd8ni2i513ZusjdzspFtzOgm5D%2FuDz2VBYOUsMuM5he2qUWTqmRNbvonGfaqyVNq%2BpcwreCXhAY64AEjU5Z7sXRgh3X7Ipe4FCpkxrNNwL5py8PgwN9tkxrqha1oTlpFZ5Fi%2FoY11GqNuCXMAriYcP6%2FgElrF61pEH9dDAPzeLk9j6wK6HI94%2Fp2%2FPJKg7V6YCcVmzfzrnc0fZ5iHMO2bJQ3tzfz2fqi0D5S%2FyGdK02PW%2BpxnpxR3d%2F7h8GM3RfMdpQhucRxUXAa0f%2FzNzPlO%2Bhm0tVMX9IvYSlTYShMu3zEpK%2FXXSO5BIsefakfgjvvbAFMfcx1fOjGP5W%2B34eRdLIbQPNZDIvQTcioWb7zdSZ07G5SkwXgz4sLNw%3D%3D&Expires=1619394274\n",
"ERROR pkg_installers.py: ERROR: google-colab 1.0.0 has requirement requests~=2.23.0, but you'll have requests 2.25.1 which is incompatible.\n",
"ERROR pkg_installers.py: \n",
"INFO pkg_installers.py: Finished installing Gretel packages\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "YMg9nX6SczHe",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 419
},
"outputId": "5c9ac289-0493-4759-cb2f-8a6bda6656aa"
},
"source": [
"# Load and preview dataset\n",
"\n",
"import logging\n",
"import pandas as pd\n",
"\n",
"\n",
"logging.basicConfig(level=logging.DEBUG)\n",
"\n",
"dataset_path = 'https://gretel-public-website.s3.amazonaws.com/datasets/uber_dataset_with_canaries.csv'\n",
"training_df = pd.read_csv(dataset_path).round(5)\n",
"training_df"
],
"execution_count": 3,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>hour</th>\n",
" <th>bike_id</th>\n",
" <th>src_lat</th>\n",
" <th>src_lon</th>\n",
" <th>dst_lat</th>\n",
" <th>dst_lon</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>23</td>\n",
" <td>27018</td>\n",
" <td>34.01698</td>\n",
" <td>-118.50102</td>\n",
" <td>34.0265</td>\n",
" <td>-118.49686</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>11</td>\n",
" <td>55026</td>\n",
" <td>47.55661</td>\n",
" <td>-122.2713</td>\n",
" <td>47.57012</td>\n",
" <td>-122.29086</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>21</td>\n",
" <td>50241</td>\n",
" <td>38.93048</td>\n",
" <td>-77.03244</td>\n",
" <td>38.94392</td>\n",
" <td>-77.03337</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>31898</td>\n",
" <td>37.79193</td>\n",
" <td>-122.40047</td>\n",
" <td>37.79389</td>\n",
" <td>-122.42464</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>16</td>\n",
" <td>XEY338</td>\n",
" <td>33.99552</td>\n",
" <td>-118.44952</td>\n",
" <td>34.00123</td>\n",
" <td>-118.43805</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27381</th>\n",
" <td>20</td>\n",
" <td>XIT762</td>\n",
" <td>30.29091</td>\n",
" <td>-97.74907</td>\n",
" <td>30.29081</td>\n",
" <td>-97.74548</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27382</th>\n",
" <td>7</td>\n",
" <td>BQN803</td>\n",
" <td>38.57168</td>\n",
" <td>-121.46315</td>\n",
" <td>38.56798</td>\n",
" <td>-121.46044</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27383</th>\n",
" <td>0</td>\n",
" <td>GUF685</td>\n",
" <td>38.91714</td>\n",
" <td>-77.04085</td>\n",
" <td>38.92376</td>\n",
" <td>-77.04086</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27384</th>\n",
" <td>21</td>\n",
" <td>SYR196</td>\n",
" <td>38.55243</td>\n",
" <td>-121.4696</td>\n",
" <td>38.57831</td>\n",
" <td>-121.48649</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27385</th>\n",
" <td>4</td>\n",
" <td>PUD261</td>\n",
" <td>27.9742</td>\n",
" <td>-82.44629</td>\n",
" <td>27.97691</td>\n",
" <td>-82.44676</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>27386 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" hour bike_id src_lat src_lon dst_lat dst_lon\n",
"0 23 27018 34.01698 -118.50102 34.0265 -118.49686\n",
"1 11 55026 47.55661 -122.2713 47.57012 -122.29086\n",
"2 21 50241 38.93048 -77.03244 38.94392 -77.03337\n",
"3 2 31898 37.79193 -122.40047 37.79389 -122.42464\n",
"4 16 XEY338 33.99552 -118.44952 34.00123 -118.43805\n",
"... ... ... ... ... ... ...\n",
"27381 20 XIT762 30.29091 -97.74907 30.29081 -97.74548\n",
"27382 7 BQN803 38.57168 -121.46315 38.56798 -121.46044\n",
"27383 0 GUF685 38.91714 -77.04085 38.92376 -77.04086\n",
"27384 21 SYR196 38.55243 -121.4696 38.57831 -121.48649\n",
"27385 4 PUD261 27.9742 -82.44629 27.97691 -82.44676\n",
"\n",
"[27386 rows x 6 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 3
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "O4-E_F0qczHe"
},
"source": [
"# Create the Gretel Synthetics Training / Model Configuration\n",
"\n",
"from pathlib import Path\n",
"\n",
"checkpoint_dir = str(Path.cwd() / \"checkpoints-dp\")\n",
"\n",
"config_template = {\n",
" \"checkpoint_dir\": checkpoint_dir,\n",
" \"vocab_size\": 0,\n",
" \"epochs\": 50,\n",
" \"early_stopping\": True,\n",
" \"learning_rate\": 0.001,\n",
" \"rnn_units\": 256,\n",
" \"batch_size\": 4,\n",
" \"predict_batch_size\": 1,\n",
" \"dp\": True,\n",
" \"dp_noise_multiplier\": 0.001, # set low to demonstrate gradient clipping\n",
" \"dp_l2_norm_clip\": 1.5,\n",
" \"dp_microbatches\": 1, \n",
" \"overwrite\": True\n",
"}"
],
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "CCW-JaiNczHf",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000,
"referenced_widgets": [
"7036165ddd6e406289ddfa6610f1f19b",
"9be48008d0754ac5ae4d33132c6cfa5e",
"367912fab4e0483a91c84d737d1a5803",
"9dabe6f558d74b3892d4da8b54a5e2b1",
"894fbbdae4fc487c884827dec7156adf",
"5da2eda343bb49c0a5c3b84de38c0744",
"3c936bd172944555be41674808b7b1b7",
"5f0ac9107d52498fb3b6fa41ae98b363",
"5142e259909d4b71bd49574b3d900d7d",
"a3d407c6a9274a40b4ded83977f95740",
"9c497124023a4b28bcbe81f79f30cc06",
"a5fcfb4576f541919f5aea42c9c7ad41",
"111ff0d9947946c2a81899e410d99c1f",
"93a45a83fd5f4099aaa91f2285482ab5",
"fd1680c8bd28460d9acac29c6dbabcec",
"d76e9dbe30cb49c1ab947da70337d14d"
]
},
"outputId": "cec66ac5-4896-4e8d-cb87-83c245f2c5d1"
},
"source": [
"# Create a Gretel Synthetic Data Bundle\n",
"\n",
"try:\n",
" # Capture transient import errors in Google Colab\n",
" from gretel_helpers.series_models import SeriesModel\n",
"except FileNotFoundError:\n",
" from gretel_helpers.series_models import SeriesModel\n",
" \n",
"\n",
"# Use these values as a prompt to seed each record versus random generation\n",
"seed_columns = [\"hour\", \"bike_id\"]\n",
"\n",
"model = SeriesModel(\n",
" training_df=training_df,\n",
" seed_columns=seed_columns,\n",
" synthetic_config=config_template\n",
")\n",
"\n",
"model.train()\n",
"model.generate(max_invalid=1e5)"
],
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"text": [
"INFO model.py: Detecting record field delimiter...\n",
"INFO model.py: Analyzing DataFrame for optimal column batches and ordering...\n",
"INFO model.py: Creating model and data storage directories...\n",
"INFO batch.py: Creating directory structure for batch jobs...\n",
"INFO model.py: Generating training data from source dataset...\n",
"INFO batch.py: Generating training DF and CSV for batch 0\n",
"INFO model.py: Creating data validators...\n",
"INFO model.py: Creating validator for synthetic batch 0\n",
"\n",
"\n",
" 0%| | 0/27386 [00:00<?, ?it/s]\u001b[A\u001b[A\n",
"\n",
" 43%|████▎ | 11657/27386 [00:00<00:00, 116567.33it/s]\u001b[A\u001b[A\n",
"\n",
"100%|██████████| 27386/27386 [00:00<00:00, 108339.90it/s]\n",
"WARNING dp_model.py: Experimental: Differentially private training enabled\n",
"WARNING dp_model.py: ******* Patching TensorFlow to utilize new Keras code paths, see: https://github.com/tensorflow/tensorflow/issues/44917 *******\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"Model: \"sequential_2\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"embedding_2 (Embedding) (4, None, 256) 16896 \n",
"_________________________________________________________________\n",
"dropout_6 (Dropout) (4, None, 256) 0 \n",
"_________________________________________________________________\n",
"lstm_4 (LSTM) (4, None, 256) 525312 \n",
"_________________________________________________________________\n",
"dropout_7 (Dropout) (4, None, 256) 0 \n",
"_________________________________________________________________\n",
"lstm_5 (LSTM) (4, None, 256) 525312 \n",
"_________________________________________________________________\n",
"dropout_8 (Dropout) (4, None, 256) 0 \n",
"_________________________________________________________________\n",
"dense_2 (Dense) (4, None, 66) 16962 \n",
"=================================================================\n",
"Total params: 1,084,482\n",
"Trainable params: 1,084,482\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n",
"None\n",
"Epoch 1/50\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"WARNING control_flow_ops.py: Converting IndexedSlices(indices=Tensor(\"gradient_tape/sequential_2/embedding_2/embedding_lookup/Reshape_1:0\", shape=(400,), dtype=int32), values=Tensor(\"gradient_tape/sequential_2/embedding_2/embedding_lookup/Reshape:0\", shape=(400, 256), dtype=float32), dense_shape=Tensor(\"gradient_tape/sequential_2/embedding_2/embedding_lookup/VariableShape:0\", shape=(2,), dtype=int32)) to a dense representation may make it slow. Alternatively, output the indices and values of the IndexedSlices separately, and handle the vectorized outputs directly.\n",
"WARNING pfor.py: Using a while_loop for converting StatelessCase\n",
"WARNING pfor.py: Using a while_loop for converting StatelessCase\n",
"WARNING control_flow_ops.py: Converting IndexedSlices(indices=Tensor(\"gradient_tape/sequential_2/embedding_2/embedding_lookup/Reshape_1:0\", shape=(400,), dtype=int32), values=Tensor(\"gradient_tape/sequential_2/embedding_2/embedding_lookup/Reshape:0\", shape=(400, 256), dtype=float32), dense_shape=Tensor(\"gradient_tape/sequential_2/embedding_2/embedding_lookup/VariableShape:0\", shape=(2,), dtype=int32)) to a dense representation may make it slow. Alternatively, output the indices and values of the IndexedSlices separately, and handle the vectorized outputs directly.\n",
"WARNING pfor.py: Using a while_loop for converting StatelessCase\n",
"WARNING pfor.py: Using a while_loop for converting StatelessCase\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"3310/3310 [==============================] - 57s 16ms/step - loss: 2.2035 - accuracy: 0.2866\n",
"Epoch 2/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.3461 - accuracy: 0.5335\n",
"Epoch 3/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.2719 - accuracy: 0.5525\n",
"Epoch 4/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.2480 - accuracy: 0.5586\n",
"Epoch 5/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.2360 - accuracy: 0.5615\n",
"Epoch 6/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.2274 - accuracy: 0.5642\n",
"Epoch 7/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.2224 - accuracy: 0.5651\n",
"Epoch 8/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.2160 - accuracy: 0.5670\n",
"Epoch 9/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.2139 - accuracy: 0.5675\n",
"Epoch 10/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.2124 - accuracy: 0.5679\n",
"Epoch 11/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.2093 - accuracy: 0.5692\n",
"Epoch 12/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.2092 - accuracy: 0.5698\n",
"Epoch 13/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.2051 - accuracy: 0.5704\n",
"Epoch 14/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.2034 - accuracy: 0.5710\n",
"Epoch 15/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.2017 - accuracy: 0.5716\n",
"Epoch 16/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.2004 - accuracy: 0.5726\n",
"Epoch 17/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1996 - accuracy: 0.5719\n",
"Epoch 18/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1988 - accuracy: 0.5727\n",
"Epoch 19/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1983 - accuracy: 0.5723\n",
"Epoch 20/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1977 - accuracy: 0.5724\n",
"Epoch 21/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1974 - accuracy: 0.5728\n",
"Epoch 22/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1941 - accuracy: 0.5737\n",
"Epoch 23/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1956 - accuracy: 0.5734\n",
"Epoch 24/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1949 - accuracy: 0.5740\n",
"Epoch 25/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1934 - accuracy: 0.5741\n",
"Epoch 26/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1927 - accuracy: 0.5737\n",
"Epoch 27/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1922 - accuracy: 0.5742\n",
"Epoch 28/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1912 - accuracy: 0.5745\n",
"Epoch 29/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1931 - accuracy: 0.5744\n",
"Epoch 30/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1928 - accuracy: 0.5737\n",
"Epoch 31/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1923 - accuracy: 0.5751\n",
"Epoch 32/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1919 - accuracy: 0.5743\n",
"Epoch 33/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1918 - accuracy: 0.5748\n",
"Epoch 34/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1914 - accuracy: 0.5746\n",
"Epoch 35/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1902 - accuracy: 0.5747\n",
"Epoch 36/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1906 - accuracy: 0.5747\n",
"Epoch 37/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1899 - accuracy: 0.5747\n",
"Epoch 38/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1894 - accuracy: 0.5751\n",
"Epoch 39/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1893 - accuracy: 0.5751\n",
"Epoch 40/50\n",
"3310/3310 [==============================] - 54s 16ms/step - loss: 1.1876 - accuracy: 0.5758\n",
"Epoch 41/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1901 - accuracy: 0.5749\n",
"Epoch 42/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1890 - accuracy: 0.5750\n",
"Epoch 43/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1893 - accuracy: 0.5748\n",
"Epoch 44/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1890 - accuracy: 0.5746\n",
"Epoch 45/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1888 - accuracy: 0.5756\n",
"Epoch 46/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1895 - accuracy: 0.5744\n",
"Epoch 47/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1884 - accuracy: 0.5746\n",
"Epoch 48/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1883 - accuracy: 0.5746\n",
"Epoch 49/50\n",
"3310/3310 [==============================] - 53s 16ms/step - loss: 1.1878 - accuracy: 0.5751\n",
"Epoch 50/50\n",
"3310/3310 [==============================] - 54s 16ms/step - loss: 1.1886 - accuracy: 0.5749\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7036165ddd6e406289ddfa6610f1f19b",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Valid record count ', max=27386.0, style=ProgressStyle(de…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5142e259909d4b71bd49574b3d900d7d",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Invalid record count ', max=100000.0, style=ProgressStyle…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"WARNING dp_model.py: Experimental: Differentially private training enabled\n",
"WARNING dp_model.py: ******* Patching TensorFlow to utilize new Keras code paths, see: https://github.com/tensorflow/tensorflow/issues/44917 *******\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"Model: \"sequential_3\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"embedding_3 (Embedding) (1, None, 256) 16896 \n",
"_________________________________________________________________\n",
"dropout_9 (Dropout) (1, None, 256) 0 \n",
"_________________________________________________________________\n",
"lstm_6 (LSTM) (1, None, 256) 525312 \n",
"_________________________________________________________________\n",
"dropout_10 (Dropout) (1, None, 256) 0 \n",
"_________________________________________________________________\n",
"lstm_7 (LSTM) (1, None, 256) 525312 \n",
"_________________________________________________________________\n",
"dropout_11 (Dropout) (1, None, 256) 0 \n",
"_________________________________________________________________\n",
"dense_3 (Dense) (1, None, 66) 16962 \n",
"=================================================================\n",
"Total params: 1,084,482\n",
"Trainable params: 1,084,482\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n",
"None\n",
"Model: \"sequential_3\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"embedding_3 (Embedding) (1, None, 256) 16896 \n",
"_________________________________________________________________\n",
"dropout_9 (Dropout) (1, None, 256) 0 \n",
"_________________________________________________________________\n",
"lstm_6 (LSTM) (1, None, 256) 525312 \n",
"_________________________________________________________________\n",
"dropout_10 (Dropout) (1, None, 256) 0 \n",
"_________________________________________________________________\n",
"lstm_7 (LSTM) (1, None, 256) 525312 \n",
"_________________________________________________________________\n",
"dropout_11 (Dropout) (1, None, 256) 0 \n",
"_________________________________________________________________\n",
"dense_3 (Dense) (1, None, 66) 16962 \n",
"=================================================================\n",
"Total params: 1,084,482\n",
"Trainable params: 1,084,482\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n",
"\n",
"\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<gretel_helpers.series_models.SeriesModel at 0x7fa1459a00d0>"
]
},
"metadata": {
"tags": []
},
"execution_count": 7
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "srW1HBA-d3Mp"
},
"source": [
"# Get the synthetic dataframe from the model and save it to a CSV file\n",
"\n",
"df = model.df\n",
"df.to_csv('synthetic-data.csv', index=False)"
],
"execution_count": 13,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "W5BhlCaoKGhn",
"outputId": "6d7b7936-3e7b-4ae4-d652-98553453f107"
},
"source": [
"secrets = [85.31243, 80.71705, 84.98992, 63.20242]\n",
"\n",
"# Count how many times each canary (secret) value appears in a dataframe's text rendering\n",
"def find_canaries(df, secrets):\n",
" raw = df.to_string()\n",
" for secret in secrets:\n",
" print(f\"secret {secret} : found {raw.count(str(secret))} times\")\n",
"\n",
"print(\"searching for canaries in training set...\") \n",
"find_canaries(training_df, secrets)\n",
"print(\"searching for canaries in synthetic set...\") \n",
"find_canaries(df, secrets)\n"
],
"execution_count": 26,
"outputs": [
{
"output_type": "stream",
"text": [
"searching for canaries in training set...\n",
"secret 85.31243 : found 7 times\n",
"secret 80.71705 : found 30 times\n",
"secret 84.98992 : found 93 times\n",
"secret 63.20242 : found 141 times\n",
"searching for canaries in synthetic set...\n",
"secret 85.31243 : found 0 times\n",
"secret 80.71705 : found 0 times\n",
"secret 84.98992 : found 0 times\n",
"secret 63.20242 : found 0 times\n"
],
"name": "stdout"
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment