Skip to content

Instantly share code, notes, and snippets.

@zredlined
Last active February 22, 2024 14:54
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save zredlined/fe0407c3ebedc75837ec707420c1f013 to your computer and use it in GitHub Desktop.
Save zredlined/fe0407c3ebedc75837ec707420c1f013 to your computer and use it in GitHub Desktop.
synthetic-timeseries-example.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"accelerator": "GPU",
"colab": {
"name": "synthetic-timeseries-example.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true,
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/zredlined/fe0407c3ebedc75837ec707420c1f013/synthetic-timeseries-example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UTRxpSlaczHY"
},
"source": [
"# Create synthetic time series data\n",
"This blueprint utilizes Gretel's premium SDKs to create a synthetic version of your own data. Our SDKs create automatic data validators to help ensure the data generated has the same semantics as the source data. Additionally, the SDKs do automatic header clustering to help maintain statistical relations between columns."
]
},
{
"cell_type": "code",
"metadata": {
"id": "VEM6kjRsczHd"
},
"source": [
"%%capture\n",
"\n",
"# Install or upgrade the Gretel client, the synthetics library and pandas.\n",
"# The %%capture cell magic above hides the installer output.\n",
"!pip install -U gretel-client gretel-synthetics pandas"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ZQ-TmAdwczHd"
},
"source": [
"# Load your Gretel API key. You can acquire this from the Gretel Console \n",
"# @ https://console.gretel.cloud\n",
"\n",
"from gretel_client import get_cloud_client\n",
"\n",
"# NOTE(review): api_key='prompt' presumably asks for the key interactively\n",
"# instead of storing it in the notebook - confirm against the gretel-client docs.\n",
"client = get_cloud_client(prefix=\"api\", api_key=\"prompt\")\n",
"# NOTE(review): presumably installs the gretel_helpers package that later\n",
"# cells import - confirm.\n",
"client.install_packages()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "YMg9nX6SczHe"
},
"source": [
"# Load and preview dataset\n",
"import datetime\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# Render DataFrame.plot() calls with plotly.\n",
"pd.options.plotting.backend = \"plotly\"\n",
"\n",
"# Durations in seconds; a year is taken as 365.2425 days.\n",
"day = 24 * 60 * 60\n",
"year = 365.2425 * day\n",
"\n",
"\n",
"def load_dataframe() -> pd.DataFrame:\n",
"    \"\"\"Build the demo training frame: one row per day with a yearly sine wave.\n",
"\n",
"    Returns:\n",
"        A frame with a 'date' column of YYYY-MM-DD strings covering\n",
"        2018-01-01 through 2021-03-01 and a 'sin' column holding\n",
"        100 * (1 + sin(2 * pi * t / year)) rounded to two decimals, where t\n",
"        is the integer timestamp floor-divided by 1e9.\n",
"    \"\"\"\n",
"    days = pd.Series(pd.date_range(start='2018-01-01', end='2021-03-01', freq='D'))\n",
"    # Integer timestamps floor-divided by 1e9 (seconds when they are nanoseconds).\n",
"    elapsed = days.astype('int64') // 1e9\n",
"    wave = 1 + np.sin(elapsed * (2 * np.pi / year))\n",
"    return pd.DataFrame({\n",
"        'date': days.apply(lambda stamp: stamp.strftime('%Y-%m-%d')),\n",
"        'sin': (wave * 100).round(2),\n",
"    })\n",
"\n",
"# Column that carries the time series values.\n",
"trend_col = 'sin'\n",
"train_df = load_dataframe()\n",
"# Preview the training series with the dates on the x axis.\n",
"train_df.set_index('date').plot()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "TyyeuJBcX4KZ"
},
"source": [
"def trends_only(source_df: pd.DataFrame, trend_col: str) -> (float, pd.DataFrame):\n",
"    \"\"\"Convert a cumulative series into per-row deltas for training.\n",
"\n",
"    Args:\n",
"        source_df: Frame holding the series; it is copied, never mutated.\n",
"            The first row is addressed by the index label 0, so the frame\n",
"            is expected to carry the default 0-based index.\n",
"        trend_col: Name of the column to difference.\n",
"\n",
"    Returns:\n",
"        A tuple (start_val, df): start_val is the first value of trend_col\n",
"        and df is the copy with trend_col replaced by its row-to-row\n",
"        difference (the first row is set to 0.00).\n",
"    \"\"\"\n",
"    df = source_df.copy()\n",
"    start_val = df.at[0, trend_col]\n",
"    # Write the deltas back into trend_col itself so the helper works for\n",
"    # any column name, not only one called 'sin'.\n",
"    df[trend_col] = df[trend_col].diff()\n",
"    # diff() leaves NaN in the first row; the first delta is defined as 0.\n",
"    df.at[0, trend_col] = 0.00\n",
"    return start_val, df\n",
"\n",
"\n",
"def restore_daily(source_df: pd.DataFrame, start_val: float, trend_col: str) -> pd.DataFrame:\n",
" \"\"\" Restore daily cumulative values from trend data.\n",
"\n",
" Works on a copy of source_df: writes start_val into row 0 of trend_col,\n",
" replaces the column with its cumulative sum, coerces it to numeric,\n",
" rounds it to two decimals and drops every row that contains a NaN.\n",
" Returns the resulting frame; source_df itself is left untouched.\n",
" \"\"\"\n",
" df = source_df.copy()\n",
" # Seed the running total with the original first value.\n",
" df.at[0, trend_col] = start_val\n",
" df[trend_col] = df[trend_col].cumsum()\n",
" # NOTE(review): to_numeric runs per element and only after cumsum() - confirm\n",
" # the column is already numeric when it arrives here.\n",
" df[trend_col] = df[trend_col].apply(pd.to_numeric, downcast='float', errors='coerce').round(2)\n",
" # dropna() looks at every column, not only trend_col.\n",
" df.dropna(inplace=True)\n",
" return df\n",
"\n",
"# Extract trends from timeseries column to create training set\n",
"# start_val is kept so restore_daily() can rebuild absolute values later.\n",
"start_val, trends_df = trends_only(train_df, trend_col)\n",
"# Trailing expression: display the training frame.\n",
"trends_df"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "O4-E_F0qczHe"
},
"source": [
"# Create the Gretel Synthetics Training / Model Configuration\n",
"\n",
"from pathlib import Path\n",
"\n",
"# Model checkpoints go to a folder under the current working directory.\n",
"checkpoint_dir = str(Path.cwd() / \"checkpoints-sin\")\n",
"\n",
"# Handed to SyntheticDataBundle as synthetic_config when the model is created.\n",
"config_template = {\n",
" \"epochs\": 100,\n",
" \"early_stopping\": False,\n",
" \"vocab_size\": 20,\n",
" \"reset_states\": True, \n",
" \"checkpoint_dir\": checkpoint_dir,\n",
" \"overwrite\": True,\n",
"}"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Rw77l2Vg8nWl"
},
"source": [
"# Capture transient import errors in Google Colab\n",
"# The first import attempt can fail with FileNotFoundError; when it does,\n",
"# the same import is retried once.\n",
"\n",
"try:\n",
" from gretel_helpers.synthetics import SyntheticDataBundle\n",
"except FileNotFoundError:\n",
" from gretel_helpers.synthetics import SyntheticDataBundle"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "CCW-JaiNczHf"
},
"source": [
"# Create a Gretel Synthetic Data Bundle\n",
"\n",
"from gretel_helpers.synthetics import create_df, SyntheticDataBundle\n",
"\n",
"# Seed the model with the date\n",
"seed_fields = ['date']\n",
"\n",
"# Train on the differenced series (trends_df), not on the raw values.\n",
"model = SyntheticDataBundle(\n",
" training_df=trends_df,\n",
" delimiter=\",\",\n",
" auto_validate=True, \n",
" synthetic_config=config_template,\n",
" header_prefix=seed_fields\n",
" )\n",
"\n",
"model.build()\n",
"model.train()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "F05AIKRcGTVz"
},
"source": [
"import json\n",
"\n",
"# Use the Date as a seed to the model\n",
"# Round-trip through JSON to get a list with one {'date': ...} dict per training row.\n",
"seed_data = json.loads(trends_df[seed_fields].to_json(orient=\"records\"))\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "sPM-gaU6czHf"
},
"source": [
"# Now generate the synthetic dataset\n",
"\n",
"\n",
"# Number of synthetic datasets to generate and save.\n",
"num_datasets = 5\n",
"\n",
"for dataset in range(num_datasets):\n",
" model.generate(max_invalid=1e5,\n",
" num_proc=1, # disable parallelism when using seed data\n",
" seed_fields=seed_data)\n",
" # Turn the generated deltas back into absolute values before saving.\n",
" df = restore_daily(model.get_synthetic_df(), start_val, trend_col)\n",
" df.to_csv(f'synthetic-data-{dataset}.csv', index=False)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "HRNvm3TYGTV0"
},
"source": [
"import glob\n",
"import os\n",
"import pandas as pd\n",
"\n",
"pd.options.plotting.backend = \"plotly\"\n",
"\n",
"\n",
"def plot_trends(filenames):\n",
"    \"\"\"Overlay the series stored in several CSV files on a single chart.\n",
"\n",
"    Args:\n",
"        filenames: Iterable of CSV paths. Every file must contain a 'date'\n",
"            column, which is parsed and used as the index of the plot.\n",
"\n",
"    The figure is shown directly; nothing is returned.\n",
"    \"\"\"\n",
"    frames = []\n",
"    for filename in filenames:\n",
"        df = pd.read_csv(filename)\n",
"        df['date'] = pd.to_datetime(df['date'])\n",
"        # Tag every row with its source file so the plot can color by file.\n",
"        df['label'] = filename\n",
"        frames.append(df)\n",
"\n",
"    # Concatenate once instead of growing a frame inside the loop, which\n",
"    # re-copies all previously read rows on every iteration.\n",
"    combined_df = pd.concat(frames).set_index('date')\n",
"    combined_df.plot(color=\"label\").show()\n",
"\n",
"train_df.to_csv('training-data.csv', index=False)\n",
"\n",
"# glob yields files in arbitrary order; sort so the order of the plotted\n",
"# datasets is the same on every run.\n",
"synthetic_files = sorted(\n",
"    path for path in glob.iglob('synthetic-data*.csv') if os.path.isfile(path)\n",
")\n",
"# Training data first, followed by the synthetic runs.\n",
"filenames = ['training-data.csv'] + synthetic_files\n",
"\n",
"plot_trends(filenames)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "bKFAm8fOGTV5"
},
"source": [
"# "
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment