Skip to content

Instantly share code, notes, and snippets.

@dleybz
Created June 16, 2023 02:02
Show Gist options
  • Save dleybz/545fb9beae61b07b74ee0daaf45dcc75 to your computer and use it in GitHub Desktop.
Save dleybz/545fb9beae61b07b74ee0daaf45dcc75 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Linear Trend"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# Integer index 0..3649; it doubles as a slope-1 trend line.\n",
"linear_trend = np.arange(3650)\n",
"# Baseline series: the trend lifted by a constant offset of 10000.\n",
"f0 = 10000 + linear_trend"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import plotly.express as px\n",
"\n",
"# Line chart of the baseline series against its positional index.\n",
"fig = px.line(f0, title='Linear Trend')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from ydata.sdk.dataset import get_dataset\n",
"from ydata.sdk.synthesizers import TimeSeriesSynthesizer\n",
"import pandas as pd\n",
"\n",
"# Keep a token that is already exported in the environment; only fall back to\n",
"# the placeholder (replace it with a real token) when none is set, so a valid\n",
"# credential is never clobbered by this cell.\n",
"os.environ.setdefault('YDATA_TOKEN', 'your_token_here')\n",
"\n",
"# Expose the positional index as an 'index' column so it can serve as the sort key.\n",
"f0_df = pd.DataFrame(f0, columns=['f0']).reset_index()\n",
"\n",
"# Fit a synthesizer on the single series and draw one synthetic entity.\n",
"synth = TimeSeriesSynthesizer()\n",
"synth.fit(f0_df, sortbykey='index')\n",
"f0_synth = synth.sample(n_entities=1)\n",
"\n",
"# Synthetic and actual values side by side for one overlaid line chart.\n",
"f0_synth_v_real = pd.DataFrame({'Synthetic': f0_synth['f0'], 'Actual': f0})\n",
"\n",
"fig = px.line(\n",
"    f0_synth_v_real,\n",
"    y=f0_synth_v_real.columns,\n",
"    title='Linear Trend, Actual vs Synthetic',\n",
")\n",
"fig.show()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Add Yearly Seasonality"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Position inside each 365-step cycle, re-centred so step 182 maps to zero.\n",
"distance_from_day182 = np.mod(linear_trend, 365) - 182\n",
"# Rescale so that an offset of 182 steps corresponds to pi radians.\n",
"normalized_dfd182 = distance_from_day182 * np.pi / 182\n",
"\n",
"fig = px.line(normalized_dfd182, title='Distance from Day 182, normalized')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sine of the normalised offset: one wave per 365-step cycle.\n",
"annual_seasonality = np.sin(normalized_dfd182)\n",
"\n",
"fig = px.line(annual_seasonality, title='Annual Seasonality')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Additive seasonality with a fixed amplitude of 1000 on top of the trend.\n",
"f1 = f0 + 1000 * annual_seasonality\n",
"\n",
"fig = px.line(f1, title='Linear Trend + Annual Seasonality')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Seasonal amplitude now grows with the trend (trend / 10) instead of being constant.\n",
"f1 = f0 + annual_seasonality * linear_trend / 10\n",
"\n",
"fig = px.line(f1, title='Linear Trend + Annual Seasonality×Linear Trend')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Expose the positional index as an 'index' column so it can serve as the sort key.\n",
"f1_df = pd.DataFrame(f1, columns=['f1']).reset_index()\n",
"\n",
"# Fit a fresh synthesizer on the seasonal series and draw one synthetic entity.\n",
"synth = TimeSeriesSynthesizer()\n",
"synth.fit(f1_df, sortbykey='index')\n",
"f1_synth = synth.sample(n_entities=1)\n",
"\n",
"# Synthetic and actual values side by side for one overlaid line chart.\n",
"f1_synth_v_real = pd.DataFrame({'Synthetic': f1_synth['f1'], 'Actual': f1})\n",
"\n",
"fig = px.line(\n",
"    f1_synth_v_real,\n",
"    y=f1_synth_v_real.columns,\n",
"    title='Linear Trend + Annual Seasonality×Linear Trend, Actual vs Synthetic',\n",
")\n",
"fig.show()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Add Weekly Seasonality"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Position inside each 7-step cycle (0..6).\n",
"day_of_week = linear_trend % 7\n",
"\n",
"# Additive level attached to each position of the 7-step cycle.\n",
"weekly_seasonality_dict = {\n",
"    0: 0,\n",
"    1: 300,\n",
"    2: 600,\n",
"    3: 500,\n",
"    4: 700,\n",
"    5: 400,\n",
"    6: 0,\n",
"}\n",
"\n",
"# Turn the mapping into a 7-entry lookup table and index it with the whole\n",
"# day_of_week array at once, instead of looping over every element in Python\n",
"# and rebinding the same name from a list to an array.\n",
"weekly_levels = np.array([weekly_seasonality_dict[d] for d in range(7)])\n",
"weekly_seasonality = weekly_levels[day_of_week]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Weekly levels are scaled by trend / 10000 before being added to f1.\n",
"f2 = f1 + weekly_seasonality * linear_trend / 10000\n",
"\n",
"fig = px.line(f2, title='Linear Trend + Annual Seasonality + Weekly Seasonality')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Expose the positional index as an 'index' column so it can serve as the sort key.\n",
"f2_df = pd.DataFrame(f2, columns=['f2']).reset_index()\n",
"\n",
"# Fit a fresh synthesizer on the doubly seasonal series and draw one synthetic entity.\n",
"synth = TimeSeriesSynthesizer()\n",
"synth.fit(f2_df, sortbykey='index')\n",
"f2_synth = synth.sample(n_entities=1)\n",
"\n",
"# Synthetic and actual values side by side for one overlaid line chart.\n",
"f2_synth_v_real = pd.DataFrame({'Synthetic': f2_synth['f2'], 'Actual': f2})\n",
"\n",
"fig = px.line(\n",
"    f2_synth_v_real,\n",
"    y=f2_synth_v_real.columns,\n",
"    title='Linear Trend + Annual Seasonality + Weekly Seasonality, Actual vs Synthetic',\n",
")\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Re-draw the existing f2 comparison frame; nothing is refitted or resampled here.\n",
"fig = px.line(\n",
"    f2_synth_v_real,\n",
"    y=f2_synth_v_real.columns,\n",
"    title='Linear Trend + Annual Seasonality + Weekly Seasonality, Actual vs Synthetic',\n",
")\n",
"fig.show()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Add Simple Noise"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Seed NumPy's global generator so the noise draw below is repeatable.\n",
"np.random.seed(1)\n",
"# One Gaussian draw (mean 0, standard deviation 100) per point of the series.\n",
"noise = np.random.normal(loc=0, scale=100, size=3650)\n",
"\n",
"f3 = f2 + noise\n",
"\n",
"fig = px.line(\n",
"    f3,\n",
"    title='Linear Trend + Annual Seasonality + Weekly Seasonality + Noise',\n",
")\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from statsmodels.tsa.seasonal import MSTL\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Decompose the noisy series using the two seasonal periods built into it (7 and 365).\n",
"res = MSTL(f3, periods=(7, 365)).fit()\n",
"\n",
"# Draw the fitted components, tighten the layout, then render.\n",
"res.plot()\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Expose the positional index as an 'index' column so it can serve as the sort key.\n",
"f3_df = pd.DataFrame(f3, columns=['f3']).reset_index()\n",
"\n",
"# Fit a fresh synthesizer on the noisy series and draw one synthetic entity.\n",
"synth = TimeSeriesSynthesizer()\n",
"synth.fit(f3_df, sortbykey='index')\n",
"f3_synth = synth.sample(n_entities=1)\n",
"\n",
"# Synthetic and actual values side by side for one overlaid line chart.\n",
"f3_synth_v_real = pd.DataFrame({'Synthetic': f3_synth['f3'], 'Actual': f3})\n",
"\n",
"fig = px.line(\n",
"    f3_synth_v_real,\n",
"    y=f3_synth_v_real.columns,\n",
"    title='Linear Trend + Annual Seasonality + Weekly Seasonality + Noise, Actual vs Synthetic',\n",
")\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Re-draw the existing f3 comparison frame; nothing is refitted or resampled here.\n",
"fig = px.line(\n",
"    f3_synth_v_real,\n",
"    y=f3_synth_v_real.columns,\n",
"    title='Linear Trend + Annual Seasonality + Weekly Seasonality + Noise, Actual vs Synthetic',\n",
")\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Same decomposition (periods 7 and 365), applied to the synthetic f3 column.\n",
"res = MSTL(f3_synth['f3'], periods=(7, 365)).fit()\n",
"\n",
"# Draw the fitted components, tighten the layout, then render.\n",
"res.plot()\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Add Correlated Noise"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Per-point multipliers drawn from a normal distribution with mean 1 and standard deviation 1.\n",
"annual_noise = np.random.normal(loc=1, scale=1, size=3650)\n",
"weekly_noise = np.random.normal(loc=1, scale=1, size=3650)\n",
"\n",
"# Trend plus both seasonal terms, each scaled point-by-point by its own noise multiplier.\n",
"f4 = (\n",
"    linear_trend\n",
"    + annual_noise*annual_seasonality*linear_trend/10\n",
"    + weekly_noise*weekly_seasonality*linear_trend/10000\n",
"    + 10000\n",
")\n",
"\n",
"fig = px.line(f4, title='Complex, Noisy Time Series')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Decompose f4 with seasonal periods 7 and 365.\n",
"res = MSTL(f4, periods=(7, 365)).fit()\n",
"\n",
"# Draw the fitted components, tighten the layout, then render.\n",
"res.plot()\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Expose the positional index as an 'index' column so it can serve as the sort key.\n",
"f4_df = pd.DataFrame(f4, columns=['f4']).reset_index()\n",
"\n",
"# Fit a fresh synthesizer on f4 and draw one synthetic entity.\n",
"synth = TimeSeriesSynthesizer()\n",
"synth.fit(f4_df, sortbykey='index')\n",
"f4_synth = synth.sample(n_entities=1)\n",
"\n",
"# Synthetic and actual values side by side for one overlaid line chart.\n",
"f4_synth_v_real = pd.DataFrame({'Synthetic': f4_synth['f4'], 'Actual': f4})\n",
"\n",
"fig = px.line(\n",
"    f4_synth_v_real,\n",
"    y=f4_synth_v_real.columns,\n",
"    title='Complex and Noisy, Actual vs Synthetic',\n",
")\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Same decomposition (periods 7 and 365), applied to the synthetic f4 column.\n",
"res = MSTL(f4_synth['f4'], periods=(7, 365)).fit()\n",
"\n",
"# Draw the fitted components, tighten the layout, then render.\n",
"res.plot()\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Add Exogenous Variables"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from scipy.stats import bernoulli\n",
"\n",
"# The magnitude of the offset from step 182, divided by 182, is used as the\n",
"# per-point success probability of a Bernoulli draw.\n",
"abs_dfd182 = np.abs(distance_from_day182)\n",
"sunny = bernoulli(abs_dfd182/182).rvs(size=3650)\n",
"\n",
"fig = px.scatter(sunny, title='Sunny')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Step function: each salary level is held for the matching number of points\n",
"# (365 + 730 + 365 + 1460 + 730 = 3650).\n",
"salary = np.repeat(\n",
"    [60000, 70000, 85000, 90000, 100000],\n",
"    [365, 730, 365, 1460, 730],\n",
")\n",
"\n",
"fig = px.line(salary, title='Salary')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Noise-free series driven by trend, annual seasonality and the two exogenous inputs.\n",
"f5 = (\n",
"    linear_trend*10\n",
"    + annual_seasonality*1000\n",
"    + sunny*500\n",
"    + salary/10\n",
")\n",
"\n",
"fig = px.line(f5, title='Complex Time Series')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Same series, with the sunny flag passed as the colour grouping.\n",
"fig = px.line(f5, color=sunny, title='Complex Time Series, by Sunny Days')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Same series, with the salary level passed as the colour grouping.\n",
"fig = px.line(f5, color=salary, title='Complex Time Series, by Salary')\n",
"fig.show()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Add Noise to the Series with Exogenous Variables"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Like f5, but the seasonal term is scaled by annual_noise and by the trend,\n",
"# and the sunny / salary contributions use different weights (100 and 1/5).\n",
"f6 = (\n",
"    linear_trend*10\n",
"    + annual_seasonality*annual_noise*linear_trend\n",
"    + sunny*100\n",
"    + salary/5\n",
")\n",
"\n",
"fig = px.line(f6, title='Complex, Noisy Time Series')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Noisy series, with the sunny flag passed as the colour grouping.\n",
"fig = px.line(f6, color=sunny, title='Complex Time Series, by Sunny Day')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split f6 by the sunny flag and compare the rounded group means.\n",
"f6_sunny = f6[sunny == 1]\n",
"f6_not_sunny = f6[sunny == 0]\n",
"\n",
"f6_sunny_mean = round(f6_sunny.mean())\n",
"f6_not_sunny_mean = round(f6_not_sunny.mean())\n",
"\n",
"difference = round(f6_sunny_mean - f6_not_sunny_mean)\n",
"\n",
"# The bare f-string is the last expression, so the notebook displays it as the cell output.\n",
"f\"The mean of the complex, noisy function when sunny is {f6_sunny_mean} and the mean of f6 when not sunny is {f6_not_sunny_mean}. The difference is {difference}.\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Noisy series, with the salary level passed as the colour grouping.\n",
"fig = px.line(f6, color=salary, title='Complex Time Series, by Salary')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# View the 3650 points as 10 rows of 365 and take the row-to-row change in mean f6.\n",
"f6_365 = f6.reshape(10, 365)\n",
"yearly_avg = f6_365.mean(axis=1)\n",
"yearly_diff = np.diff(yearly_avg)\n",
"\n",
"# Same layout for salary; note that salary_avg holds each row's maximum, not its mean.\n",
"salary_365 = salary.reshape(10, 365)\n",
"salary_avg = salary_365.max(axis=1)\n",
"salary_diff = np.diff(salary_avg)\n",
"\n",
"# Both change series share one frame so they are drawn on the same scatter plot.\n",
"salary_f6_df = pd.DataFrame({'Salary': salary_diff, 'f6': yearly_diff})\n",
"\n",
"fig = px.scatter(\n",
"    salary_f6_df,\n",
"    y=salary_f6_df.columns,\n",
"    title='Change in Yearly Spend vs Change in Yearly Salary',\n",
")\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Single seasonal period: the parenthesised (365) was just the integer 365, written plainly here.\n",
"res = MSTL(f6, periods=365).fit()\n",
"\n",
"# Draw the fitted components, tighten the layout, then render.\n",
"res.plot()\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Train on the target together with both exogenous columns; the positional\n",
"# index is exposed as an 'index' column so it can serve as the sort key.\n",
"f6_df = pd.DataFrame({'f6': f6, 'sunny': sunny, 'salary': salary}).reset_index()\n",
"\n",
"synth = TimeSeriesSynthesizer()\n",
"synth.fit(f6_df, sortbykey='index')\n",
"f6_synth = synth.sample(n_entities=1)\n",
"\n",
"# Synthetic and actual f6 side by side for one overlaid line chart.\n",
"f6_synth_v_real = pd.DataFrame({'Synthetic': f6_synth['f6'], 'Actual': f6})\n",
"\n",
"fig = px.line(\n",
"    f6_synth_v_real,\n",
"    y=f6_synth_v_real.columns,\n",
"    title='Complex and Noisy, Actual vs Synthetic',\n",
")\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): the colour grouping is the ORIGINAL sunny array, not a sunny\n",
"# column of f6_synth -- confirm this pairing is intended.\n",
"fig = px.line(f6_synth['f6'], color=sunny, title='Synthetic Data, by Sunny Day')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split the synthetic f6 column into two groups and compare the rounded means.\n",
"# NOTE(review): the masks come from the ORIGINAL sunny array, not from a sunny\n",
"# column of f6_synth -- confirm this pairing is intended.\n",
"f6_sunny = f6_synth.loc[sunny == 1, 'f6']\n",
"f6_not_sunny = f6_synth.loc[sunny == 0, 'f6']\n",
"\n",
"f6_sunny_mean = round(np.mean(f6_sunny))\n",
"f6_not_sunny_mean = round(np.mean(f6_not_sunny))\n",
"\n",
"difference = round(f6_sunny_mean - f6_not_sunny_mean)\n",
"\n",
"# The bare f-string is the last expression, so the notebook displays it as the cell output.\n",
"f\"The mean of the synthetic data when sunny is {f6_sunny_mean} and the mean of f6 when not sunny is {f6_not_sunny_mean}. The difference is {difference}.\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): the colour grouping is the ORIGINAL salary array, not a salary\n",
"# column of f6_synth -- confirm this pairing is intended.\n",
"fig = px.line(f6_synth['f6'], color=salary, title='Synthetic Data, by Salary')\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# View the synthetic f6 values as 10 rows of 365 and take the row-to-row change in mean.\n",
"f6_synth_array = np.array(f6_synth['f6'])\n",
"f6_365 = f6_synth_array.reshape(10, 365)\n",
"yearly_avg = f6_365.mean(axis=1)\n",
"yearly_diff = np.diff(yearly_avg)\n",
"\n",
"# Same layout for the original salary; salary_avg holds each row's maximum, not its mean.\n",
"salary_365 = salary.reshape(10, 365)\n",
"salary_avg = salary_365.max(axis=1)\n",
"salary_diff = np.diff(salary_avg)\n",
"\n",
"# Both change series share one frame so they are drawn on the same scatter plot.\n",
"salary_f6_df = pd.DataFrame({'Salary': salary_diff, 'f6': yearly_diff})\n",
"\n",
"fig = px.scatter(\n",
"    salary_f6_df,\n",
"    y=salary_f6_df.columns,\n",
"    title='Change in Synthetic Yearly Spend vs Change in Yearly Salary',\n",
")\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Single seasonal period: the parenthesised (365) was just the integer 365, written plainly here.\n",
"res = MSTL(f6_synth['f6'], periods=365).fit()\n",
"\n",
"# Draw the fitted components, tighten the layout, then render.\n",
"res.plot()\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment