Created
June 16, 2023 02:02
-
-
Save dleybz/545fb9beae61b07b74ee0daaf45dcc75 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Linear Trend" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"\n", | |
"# Day counter 0..3649 (3650 samples); every later series is built from it.\n", | |
"linear_trend = np.arange(3650)\n", | |
"# Baseline series: the unit-slope trend shifted up by a constant 10000.\n", | |
"f0 = 10000 + linear_trend" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import plotly.express as px\n", | |
"\n", | |
"# Show the bare trend before any seasonality or noise is layered on.\n", | |
"fig = px.line(f0, title='Linear Trend')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"\n", | |
"from ydata.sdk.dataset import get_dataset\n", | |
"from ydata.sdk.synthesizers import TimeSeriesSynthesizer\n", | |
"import pandas as pd\n", | |
"\n", | |
"# Use the placeholder only when no token is configured yet, so a real\n", | |
"# YDATA_TOKEN already exported in the environment is never clobbered.\n", | |
"os.environ.setdefault('YDATA_TOKEN', 'your_token_here')\n", | |
"\n", | |
"# Expose the row position as an 'index' column; it is the key handed to sortbykey.\n", | |
"f0_df = pd.DataFrame(f0, columns=['f0']).reset_index()\n", | |
"synth = TimeSeriesSynthesizer()\n", | |
"synth.fit(f0_df, sortbykey='index')\n", | |
"f0_synth = synth.sample(n_entities=1)\n", | |
"\n", | |
"# Put the generated and the original series side by side for plotting.\n", | |
"f0_synth_v_real = pd.DataFrame({\n", | |
"    'Synthetic': f0_synth['f0'],\n", | |
"    'Actual': f0,\n", | |
"})\n", | |
"\n", | |
"fig = px.line(f0_synth_v_real, y=f0_synth_v_real.columns,\n", | |
"              title='Linear Trend, Actual vs Synthetic')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Add Yearly Seasonality" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Signed offset of each day from day 182 of its 365-day cycle (-182..182).\n", | |
"distance_from_day182 = np.mod(linear_trend, 365) - 182\n", | |
"# Scale the offset to [-pi, pi] so it can be fed straight into a sine.\n", | |
"normalized_dfd182 = np.pi * distance_from_day182 / 182\n", | |
"\n", | |
"fig = px.line(normalized_dfd182, title='Distance from Day 182, normalized')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Sine of the normalized offset: one full period per 365-day cycle, values in [-1, 1].\n", | |
"annual_seasonality = np.sin(normalized_dfd182)\n", | |
"\n", | |
"fig = px.line(annual_seasonality, title='Annual Seasonality')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Additive variant: a fixed +/-1000 seasonal swing on top of the trend.\n", | |
"f1 = f0 + 1000 * annual_seasonality\n", | |
"\n", | |
"fig = px.line(f1, title='Linear Trend + Annual Seasonality')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Multiplicative variant (rebinds f1): the seasonal swing is scaled by the\n", | |
"# trend value, so its amplitude grows over time.\n", | |
"f1 = f0 + linear_trend * annual_seasonality / 10\n", | |
"\n", | |
"fig = px.line(f1, title='Linear Trend + Annual Seasonality×Linear Trend')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Expose the row position as an 'index' column; it is the key handed to sortbykey.\n", | |
"f1_df = pd.DataFrame(f1, columns=['f1']).reset_index()\n", | |
"synth = TimeSeriesSynthesizer()\n", | |
"synth.fit(f1_df, sortbykey='index')\n", | |
"f1_synth = synth.sample(n_entities=1)\n", | |
"\n", | |
"# Put the generated and the original series side by side for plotting.\n", | |
"f1_synth_v_real = pd.DataFrame({\n", | |
"    'Synthetic': f1_synth['f1'],\n", | |
"    'Actual': f1,\n", | |
"})\n", | |
"\n", | |
"fig = px.line(f1_synth_v_real, y=f1_synth_v_real.columns,\n", | |
"              title='Linear Trend + Annual Seasonality×Linear Trend, Actual vs Synthetic')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Add Weekly Seasonality" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Position of each day inside its 7-day cycle (0..6).\n", | |
"day_of_week = np.mod(linear_trend, 7)\n", | |
"\n", | |
"# Uplift applied on each position of the 7-day cycle.\n", | |
"weekly_seasonality_dict = {0: 0, 1: 300, 2: 600, 3: 500, 4: 700, 5: 400, 6: 0}\n", | |
"\n", | |
"# Turn the mapping into a 7-entry lookup table and index it with day_of_week,\n", | |
"# which yields one uplift value per day.\n", | |
"weekly_seasonality = np.array(\n", | |
"    [weekly_seasonality_dict[d] for d in range(7)]\n", | |
")[day_of_week]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Weekly uplift, also scaled by the trend so it grows over time.\n", | |
"f2 = f1 + linear_trend * weekly_seasonality / 10000\n", | |
"\n", | |
"fig = px.line(f2, title='Linear Trend + Annual Seasonality + Weekly Seasonality')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Expose the row position as an 'index' column; it is the key handed to sortbykey.\n", | |
"f2_df = pd.DataFrame(f2, columns=['f2']).reset_index()\n", | |
"synth = TimeSeriesSynthesizer()\n", | |
"synth.fit(f2_df, sortbykey='index')\n", | |
"f2_synth = synth.sample(n_entities=1)\n", | |
"\n", | |
"# Put the generated and the original series side by side for plotting.\n", | |
"f2_synth_v_real = pd.DataFrame({\n", | |
"    'Synthetic': f2_synth['f2'],\n", | |
"    'Actual': f2,\n", | |
"})\n", | |
"\n", | |
"fig = px.line(f2_synth_v_real, y=f2_synth_v_real.columns,\n", | |
"              title='Linear Trend + Annual Seasonality + Weekly Seasonality, Actual vs Synthetic')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Re-draw the f2 actual-vs-synthetic comparison without refitting the synthesizer.\n", | |
"fig = px.line(\n", | |
"    f2_synth_v_real,\n", | |
"    y=f2_synth_v_real.columns,\n", | |
"    title='Linear Trend + Annual Seasonality + Weekly Seasonality, Actual vs Synthetic',\n", | |
")\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Add Simple Noise" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Seed NumPy's global RNG so the noise drawn here is reproducible.\n", | |
"np.random.seed(1)\n", | |
"# Independent Gaussian noise, mean 0 and standard deviation 100, one value per day.\n", | |
"noise = np.random.normal(loc=0, scale=100, size=3650)\n", | |
"\n", | |
"f3 = f2 + noise\n", | |
"\n", | |
"fig = px.line(\n", | |
"    f3,\n", | |
"    title='Linear Trend + Annual Seasonality + Weekly Seasonality + Noise',\n", | |
")\n", | |
"fig.show()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from statsmodels.tsa.seasonal import MSTL\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"# Multi-seasonal STL decomposition of f3 with periods of 7 and 365 samples.\n", | |
"mstl = MSTL(f3, periods=(7, 365))\n", | |
"res = mstl.fit()\n", | |
"\n", | |
"res.plot()\n", | |
"plt.tight_layout()\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Expose the row position as an 'index' column; it is the key handed to sortbykey.\n", | |
"f3_df = pd.DataFrame(f3, columns=['f3']).reset_index()\n", | |
"synth = TimeSeriesSynthesizer()\n", | |
"synth.fit(f3_df, sortbykey='index')\n", | |
"f3_synth = synth.sample(n_entities=1)\n", | |
"\n", | |
"# Put the generated and the original series side by side for plotting.\n", | |
"f3_synth_v_real = pd.DataFrame({\n", | |
"    'Synthetic': f3_synth['f3'],\n", | |
"    'Actual': f3,\n", | |
"})\n", | |
"\n", | |
"fig = px.line(f3_synth_v_real, y=f3_synth_v_real.columns,\n", | |
"              title='Linear Trend + Annual Seasonality + Weekly Seasonality + Noise, Actual vs Synthetic')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Re-draw the f3 actual-vs-synthetic comparison without refitting the synthesizer.\n", | |
"fig = px.line(\n", | |
"    f3_synth_v_real,\n", | |
"    y=f3_synth_v_real.columns,\n", | |
"    title='Linear Trend + Annual Seasonality + Weekly Seasonality + Noise, Actual vs Synthetic',\n", | |
")\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Same decomposition as for f3, applied to the synthetic series for comparison.\n", | |
"mstl = MSTL(f3_synth['f3'], periods=(7, 365))\n", | |
"res = mstl.fit()\n", | |
"\n", | |
"res.plot()\n", | |
"plt.tight_layout()\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Add Correlated Noise" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Multiplicative noise factors centred on 1 (standard deviation 1), one per day.\n", | |
"annual_noise = np.random.normal(loc=1, scale=1, size=3650)\n", | |
"weekly_noise = np.random.normal(loc=1, scale=1, size=3650)\n", | |
"\n", | |
"# Same structure as f2, but each seasonal term is multiplied by its own noise factor.\n", | |
"f4 = (\n", | |
"    linear_trend\n", | |
"    + annual_noise * annual_seasonality * linear_trend / 10\n", | |
"    + weekly_noise * weekly_seasonality * linear_trend / 10000\n", | |
"    + 10000\n", | |
")\n", | |
"\n", | |
"fig = px.line(f4, title='Complex, Noisy Time Series')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Multi-seasonal STL decomposition of f4 with periods of 7 and 365 samples.\n", | |
"mstl = MSTL(f4, periods=(7, 365))\n", | |
"res = mstl.fit()\n", | |
"\n", | |
"res.plot()\n", | |
"plt.tight_layout()\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Expose the row position as an 'index' column; it is the key handed to sortbykey.\n", | |
"f4_df = pd.DataFrame(f4, columns=['f4']).reset_index()\n", | |
"synth = TimeSeriesSynthesizer()\n", | |
"synth.fit(f4_df, sortbykey='index')\n", | |
"f4_synth = synth.sample(n_entities=1)\n", | |
"\n", | |
"# Put the generated and the original series side by side for plotting.\n", | |
"f4_synth_v_real = pd.DataFrame({\n", | |
"    'Synthetic': f4_synth['f4'],\n", | |
"    'Actual': f4,\n", | |
"})\n", | |
"\n", | |
"fig = px.line(f4_synth_v_real, y=f4_synth_v_real.columns,\n", | |
"              title='Complex and Noisy, Actual vs Synthetic')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Same decomposition as for f4, applied to the synthetic series for comparison.\n", | |
"mstl = MSTL(f4_synth['f4'], periods=(7, 365))\n", | |
"res = mstl.fit()\n", | |
"\n", | |
"res.plot()\n", | |
"plt.tight_layout()\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Add Exogenous Variables" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from scipy.stats import bernoulli\n", | |
"\n", | |
"# Per-day success probability: 0 at day 182 of the cycle, rising to 1 at\n", | |
"# the cycle boundaries (|offset| / 182).\n", | |
"abs_dfd182 = np.abs(distance_from_day182)\n", | |
"# One 0/1 draw per day from a Bernoulli with that day's probability.\n", | |
"sunny = bernoulli(abs_dfd182 / 182).rvs(size=3650)\n", | |
"\n", | |
"fig = px.scatter(sunny, title='Sunny')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Step-wise salary: each level is held for a whole number of 365-day years\n", | |
"# (365 + 730 + 365 + 1460 + 730 = 3650 days).\n", | |
"salary = np.repeat(\n", | |
"    [60000, 70000, 85000, 90000, 100000],\n", | |
"    [365, 730, 365, 1460, 730],\n", | |
")\n", | |
"\n", | |
"fig = px.line(salary, title='Salary')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Noise-free series driven by the trend, annual seasonality and the two\n", | |
"# exogenous variables (sunny flag and salary).\n", | |
"f5 = (\n", | |
"    linear_trend * 10\n", | |
"    + annual_seasonality * 1000\n", | |
"    + sunny * 500\n", | |
"    + salary / 10\n", | |
")\n", | |
"\n", | |
"fig = px.line(f5, title='Complex Time Series')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Split f5 into one trace per value of the sunny flag.\n", | |
"fig = px.line(f5, color=sunny, title='Complex Time Series, by Sunny Days')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Split f5 into one trace per salary level.\n", | |
"fig = px.line(f5, color=salary, title='Complex Time Series, by Salary')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Add Noise to the Exogenous Model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Noisy counterpart of f5: the seasonal term is multiplied by annual_noise and\n", | |
"# by the trend, and the sunny / salary weights differ from f5.\n", | |
"f6 = (\n", | |
"    linear_trend * 10\n", | |
"    + annual_seasonality * annual_noise * linear_trend\n", | |
"    + sunny * 100\n", | |
"    + salary / 5\n", | |
")\n", | |
"\n", | |
"fig = px.line(f6, title='Complex, Noisy Time Series')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Split f6 into one trace per value of the sunny flag.\n", | |
"fig = px.line(f6, color=sunny, title='Complex Time Series, by Sunny Day')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Partition f6 by the sunny flag, then compare the rounded group means.\n", | |
"f6_sunny = f6[sunny == 1]\n", | |
"f6_not_sunny = f6[sunny == 0]\n", | |
"\n", | |
"f6_sunny_mean = round(f6_sunny.mean())\n", | |
"f6_not_sunny_mean = round(f6_not_sunny.mean())\n", | |
"difference = round(f6_sunny_mean - f6_not_sunny_mean)\n", | |
"\n", | |
"f\"The mean of the complex, noisy function when sunny is {f6_sunny_mean} and the mean of f6 when not sunny is {f6_not_sunny_mean}. The difference is {difference}.\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Split f6 into one trace per salary level.\n", | |
"fig = px.line(f6, color=salary, title='Complex Time Series, by Salary')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# One row per 365-day year, then the year-over-year change of the yearly mean.\n", | |
"f6_365 = f6.reshape(10, 365)\n", | |
"yearly_avg = f6_365.mean(axis=1)\n", | |
"yearly_diff = np.diff(yearly_avg)\n", | |
"\n", | |
"# Same layout for salary; the per-year value is taken with max() on each row.\n", | |
"salary_365 = salary.reshape(10, 365)\n", | |
"salary_avg = salary_365.max(axis=1)\n", | |
"salary_diff = np.diff(salary_avg)\n", | |
"\n", | |
"salary_f6_df = pd.DataFrame({'Salary': salary_diff, 'f6': yearly_diff})\n", | |
"\n", | |
"fig = px.scatter(\n", | |
"    salary_f6_df,\n", | |
"    y=salary_f6_df.columns,\n", | |
"    title='Change in Yearly Spend vs Change in Yearly Salary',\n", | |
")\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Single yearly period. `(365)` is a plain int, not a tuple, so it is written\n", | |
"# here without the parentheses.\n", | |
"res = MSTL(f6, periods=365).fit()\n", | |
"\n", | |
"res.plot()\n", | |
"plt.tight_layout()\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Train on the target together with both exogenous columns; the row position\n", | |
"# becomes the 'index' column handed to sortbykey.\n", | |
"f6_df = pd.DataFrame({'f6': f6, 'sunny': sunny, 'salary': salary}).reset_index()\n", | |
"synth = TimeSeriesSynthesizer()\n", | |
"synth.fit(f6_df, sortbykey='index')\n", | |
"f6_synth = synth.sample(n_entities=1)\n", | |
"\n", | |
"# Put the generated and the original series side by side for plotting.\n", | |
"f6_synth_v_real = pd.DataFrame({\n", | |
"    'Synthetic': f6_synth['f6'],\n", | |
"    'Actual': f6,\n", | |
"})\n", | |
"\n", | |
"fig = px.line(f6_synth_v_real, y=f6_synth_v_real.columns,\n", | |
"              title='Complex and Noisy, Actual vs Synthetic')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Colour by the synthetic sample's own 'sunny' column. The real `sunny` array\n", | |
"# was drawn independently of the generated rows, so pairing it with synthetic\n", | |
"# f6 values would not show what the synthesizer learned.\n", | |
"fig = px.line(f6_synth['f6'], color=f6_synth['sunny'].to_numpy(),\n", | |
"              title='Synthetic Data, by Sunny Day')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Mask with the synthetic sample's own 'sunny' column: the real `sunny` array\n", | |
"# was drawn independently of the generated rows (and need not match their\n", | |
"# length), so it cannot tell which synthetic days are sunny.\n", | |
"f6_sunny = f6_synth.loc[f6_synth['sunny'] == 1, 'f6']\n", | |
"f6_sunny_mean = round(np.mean(f6_sunny))\n", | |
"\n", | |
"f6_not_sunny = f6_synth.loc[f6_synth['sunny'] == 0, 'f6']\n", | |
"f6_not_sunny_mean = round(np.mean(f6_not_sunny))\n", | |
"\n", | |
"difference = round(f6_sunny_mean - f6_not_sunny_mean)\n", | |
"\n", | |
"f\"The mean of the synthetic data when sunny is {f6_sunny_mean} and the mean of f6 when not sunny is {f6_not_sunny_mean}. The difference is {difference}.\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Split the synthetic f6 into one trace per (real) salary level.\n", | |
"fig = px.line(f6_synth['f6'], color=salary, title='Synthetic Data, by Salary')\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Synthetic f6 as a plain array, one row per 365-day year, then the\n", | |
"# year-over-year change of the yearly mean.\n", | |
"f6_synth_array = np.array(f6_synth['f6'])\n", | |
"f6_365 = f6_synth_array.reshape(10, 365)\n", | |
"yearly_avg = f6_365.mean(axis=1)\n", | |
"yearly_diff = np.diff(yearly_avg)\n", | |
"\n", | |
"# Same layout for the real salary; the per-year value is taken with max() on each row.\n", | |
"salary_365 = salary.reshape(10, 365)\n", | |
"salary_avg = salary_365.max(axis=1)\n", | |
"salary_diff = np.diff(salary_avg)\n", | |
"\n", | |
"salary_f6_df = pd.DataFrame({'Salary': salary_diff, 'f6': yearly_diff})\n", | |
"\n", | |
"fig = px.scatter(\n", | |
"    salary_f6_df,\n", | |
"    y=salary_f6_df.columns,\n", | |
"    title='Change in Synthetic Yearly Spend vs Change in Yearly Salary',\n", | |
")\n", | |
"fig.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Single yearly period on the synthetic series. `(365)` is a plain int, not a\n", | |
"# tuple, so it is written here without the parentheses.\n", | |
"res = MSTL(f6_synth['f6'], periods=365).fit()\n", | |
"\n", | |
"res.plot()\n", | |
"plt.tight_layout()\n", | |
"plt.show()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": ".venv", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.16" | |
}, | |
"orig_nbformat": 4 | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment