@the-moliver
Last active January 14, 2023 05:43
{
"cells": [
{
"cell_type": "code",
"execution_count": 168,
"id": "798d5d25",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from scipy.stats import rankdata\n",
"from sklearn.linear_model import Ridge\n",
"from sklearn.mixture import BayesianGaussianMixture\n",
"from xgboost import XGBRegressor"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f8637ccc",
"metadata": {},
"outputs": [],
"source": [
"training_data = pd.read_parquet(\"numerai_training_data.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 189,
"id": "c919eb30",
"metadata": {},
"outputs": [],
"source": [
"val_data = pd.read_parquet(\"numerai_validation_data.parquet\")"
]
},
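{
"cell_type": "markdown",
"id": "added-target-dist-md",
"metadata": {},
"source": [
"Optional check (not in the original run): the later cells center features by subtracting 0.5 and bin the fake target into five levels spaced 0.25 apart, matching the real target's values. The cell below looks at the real target's distribution, which the bin edges [0, 0.05, 0.25, 0.75, 0.95, 1] used further down appear intended to mirror."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-target-dist-code",
"metadata": {},
"outputs": [],
"source": [
"# distribution of the binned real target; its proportions motivate the\n",
"# bin edges used for the fake target below\n",
"training_data['target'].value_counts(normalize=True).sort_index()"
]
},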
{
"cell_type": "code",
"execution_count": 4,
"id": "c57156b7",
"metadata": {},
"outputs": [],
"source": [
"# ridge regression used to extract a linear signal from each era\n",
"rlm = Ridge(fit_intercept=False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "fadbc9ea",
"metadata": {},
"outputs": [],
"source": [
"era_list = training_data.era.unique()\n",
"coefs = []\n",
"for ii, era in enumerate(era_list):\n",
"    # get features and target from data and center\n",
"    features = training_data[training_data.era == era].filter(like='feature').values - .5\n",
"    target = training_data[training_data.era == era]['target'] - .5\n",
"    # fit ridge regression model for each era\n",
"    rlm.fit(features, target)\n",
"    coefs.append(rlm.coef_)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "c87696da",
"metadata": {},
"outputs": [],
"source": [
"# stack per-era coefficients into an (n_eras, n_features) array\n",
"coefs = np.vstack(coefs)"
]
},
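{
"cell_type": "markdown",
"id": "added-coef-corr-md",
"metadata": {},
"source": [
"Optional visualization (not in the original gist): each row of `coefs` is one era's linear signal, so the era-by-era correlation of these rows gives a quick picture of the regime structure that the mixture model in the next cell is meant to capture. This sketch uses the `matplotlib` import from the first cell; the figure styling is arbitrary."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-coef-corr-code",
"metadata": {},
"outputs": [],
"source": [
"# how similar is the linear signal across eras?\n",
"# blocks of highly correlated eras suggest recurring regimes\n",
"coef_corr = np.corrcoef(coefs)\n",
"plt.figure(figsize=(6, 5))\n",
"plt.imshow(coef_corr, cmap='RdBu_r', vmin=-1, vmax=1)\n",
"plt.colorbar(label='correlation between era coefficient vectors')\n",
"plt.xlabel('era index')\n",
"plt.ylabel('era index')\n",
"plt.show()"
]
},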
{
"cell_type": "code",
"execution_count": 177,
"id": "04b505f4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initialization 0\n",
"Initialization converged: True\n"
]
}
],
"source": [
"# For generating lots of fake data, you could just run all of below in a loop (a sketch of such a loop appears after the fake-target cell below)\n",
"\n",
"# choose a random number of components for a Gaussian Mixture Model\n",
"n_components = np.random.choice(range(2,10))\n",
"\n",
"# fit a Gaussian Mixture Model of linear regression weights\n",
"gm = BayesianGaussianMixture(n_components=n_components, verbose=True)\n",
"gm.fit(coefs)\n",
"\n",
"# make probability of sampling each component equal to better balance rare regimes\n",
"gm.weights_[:] = 1/n_components"
]
},
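{
"cell_type": "markdown",
"id": "added-regime-inspect-md",
"metadata": {},
"source": [
"Optional inspection (not in the original gist): `BayesianGaussianMixture.predict` assigns each era's coefficient vector to its most likely component, so counting assignments shows how many eras fall into each inferred regime and why re-balancing `gm.weights_` above helps rare regimes get sampled."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-regime-inspect-code",
"metadata": {},
"outputs": [],
"source": [
"# which mixture component (regime) does each era's linear signal belong to?\n",
"era_regime = gm.predict(coefs)\n",
"print('eras per regime:', np.bincount(era_regime, minlength=n_components))"
]
},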
{
"cell_type": "code",
"execution_count": 194,
"id": "239b279b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0573\r"
]
}
],
"source": [
"fake_target = []\n",
"real_target = []\n",
"all_features = []\n",
"bins = [0, 0.05, 0.25, 0.75, 0.95, 1]\n",
"\n",
"# use every 4th era, starting from a random offset\n",
"for era in era_list[np.random.choice(4)::4]:\n",
"    print(era, end='\\r')\n",
"    features = training_data[training_data.era == era].filter(like='feature').values - .5\n",
"    real_target.append(training_data[training_data.era == era]['target'])\n",
"    # sample a set of weights from GMM\n",
"    beta, _ = gm.sample(1)\n",
"\n",
"    # create fake continuous target\n",
"    fake_targ = features @ beta[0]\n",
"\n",
"    # bin fake target like real target\n",
"    fake_targ = (rankdata(fake_targ) - .5)/len(fake_targ)\n",
"    fake_targ = (np.digitize(fake_targ, bins) - 1)/4\n",
"\n",
"    fake_target.append(fake_targ)\n",
"    all_features.append(features)\n",
"\n",
"all_features = np.concatenate(all_features)\n",
"fake_target = np.concatenate(fake_target)\n",
"real_target = np.concatenate(real_target)\n"
]
},
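{
"cell_type": "markdown",
"id": "added-fake-loop-md",
"metadata": {},
"source": [
"The comment a few cells above notes that lots of fake data can be generated by running the steps below in a loop. The sketch below is one way to do that (untested here): it fixes one era subset, then repeatedly samples regime weights from the fitted mixture to build several fake targets. `n_fake` is an arbitrary choice, and each resulting target could be used to train its own model."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-fake-loop-code",
"metadata": {},
"outputs": [],
"source": [
"# sketch: draw several fake targets from the GMM over one fixed era subset\n",
"n_fake = 10  # assumption: number of fake targets to generate\n",
"era_subset = era_list[np.random.choice(4)::4]\n",
"era_features = [training_data[training_data.era == era].filter(like='feature').values - .5\n",
"                for era in era_subset]\n",
"\n",
"fake_targets = []\n",
"for _ in range(n_fake):\n",
"    targs = []\n",
"    for features in era_features:\n",
"        beta, _ = gm.sample(1)                    # one regime draw per era\n",
"        targ = features @ beta[0]\n",
"        targ = (rankdata(targ) - .5)/len(targ)    # rank-transform to (0, 1)\n",
"        targ = (np.digitize(targ, bins) - 1)/4    # bin like the real target\n",
"        targs.append(targ)\n",
"    fake_targets.append(np.concatenate(targs))"
]
},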
{
"cell_type": "code",
"execution_count": 195,
"id": "573e329c",
"metadata": {},
"outputs": [],
"source": [
"# model to be trained on the fake target\n",
"xgbr = XGBRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5, n_estimators=100)"
]
},
{
"cell_type": "code",
"execution_count": 196,
"id": "659e0652",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
" colsample_bynode=1, colsample_bytree=0.1, gamma=0, gpu_id=-1,\n",
" importance_type='gain', interaction_constraints='',\n",
" learning_rate=0.01, max_delta_step=0, max_depth=5,\n",
" min_child_weight=1, missing=nan, monotone_constraints='()',\n",
" n_estimators=100, n_jobs=32, num_parallel_tree=1, random_state=0,\n",
" reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n",
" tree_method='exact', validate_parameters=1, verbosity=None)"
]
},
"execution_count": 196,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"xgbr.fit(all_features, fake_target)"
]
},
{
"cell_type": "code",
"execution_count": 197,
"id": "f35a1f3d",
"metadata": {},
"outputs": [],
"source": [
"pred = xgbr.predict(val_data.filter(like='feature').values)"
]
},
{
"cell_type": "code",
"execution_count": 216,
"id": "2a247340",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"val corr trained on fake target: 0.013349771549412833\n"
]
}
],
"source": [
"c1 = np.corrcoef(pred, val_data['target'])[0,1]\n",
"print(f'val corr trained on fake target: {c1}')"
]
},
{
"cell_type": "code",
"execution_count": 199,
"id": "8803f5e6",
"metadata": {},
"outputs": [],
"source": [
"# identical model to be trained on the real target, for comparison\n",
"xgbr2 = XGBRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5, n_estimators=100)"
]
},
{
"cell_type": "code",
"execution_count": 200,
"id": "7845e9e7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
" colsample_bynode=1, colsample_bytree=0.1, gamma=0, gpu_id=-1,\n",
" importance_type='gain', interaction_constraints='',\n",
" learning_rate=0.01, max_delta_step=0, max_depth=5,\n",
" min_child_weight=1, missing=nan, monotone_constraints='()',\n",
" n_estimators=100, n_jobs=32, num_parallel_tree=1, random_state=0,\n",
" reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n",
" tree_method='exact', validate_parameters=1, verbosity=None)"
]
},
"execution_count": 200,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"xgbr2.fit(all_features, real_target)"
]
},
{
"cell_type": "code",
"execution_count": 201,
"id": "afc525ce",
"metadata": {},
"outputs": [],
"source": [
"pred2 = xgbr2.predict(val_data.filter(like='feature').values)"
]
},
{
"cell_type": "code",
"execution_count": 215,
"id": "147cbf25",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"val corr trained on real target: 0.010239539249306307\n"
]
}
],
"source": [
"c2 = np.corrcoef(pred2, val_data['target'])[0,1]\n",
"print(f'val corr trained on real target: {c2}')"
]
},
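{
"cell_type": "markdown",
"id": "added-era-corr-md",
"metadata": {},
"source": [
"Optional extra check (not in the original gist): the correlations above pool all validation rows together. The sketch below computes the mean of per-era correlations instead, which is closer to how Numerai scores predictions. It assumes the validation parquet has an `era` column, as the training data does."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-era-corr-code",
"metadata": {},
"outputs": [],
"source": [
"# mean per-era validation correlation for both models (sketch)\n",
"val_df = pd.DataFrame({\n",
"    'era': val_data['era'].values,\n",
"    'target': val_data['target'].values,\n",
"    'pred_fake': pred,\n",
"    'pred_real': pred2,\n",
"})\n",
"per_era = val_df.groupby('era').apply(\n",
"    lambda d: pd.Series({\n",
"        'fake': np.corrcoef(d['pred_fake'], d['target'])[0, 1],\n",
"        'real': np.corrcoef(d['pred_real'], d['target'])[0, 1],\n",
"    }))\n",
"print('mean era corr, fake-target model:', per_era['fake'].mean())\n",
"print('mean era corr, real-target model:', per_era['real'].mean())"
]
},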
{
"cell_type": "code",
"execution_count": null,
"id": "fdb6f431",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}