Skip to content

Instantly share code, notes, and snippets.

@amirziai
Created February 27, 2015 21:40
Show Gist options
  • Save amirziai/31cc3277c3812faffed0 to your computer and use it in GitHub Desktop.
Save amirziai/31cc3277c3812faffed0 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": "",
"signature": "sha256:dbe7091a498275c9754ee330eaf46671194336228630999420591891e5b780f9"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Customizable ticker dataset generator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This script will generate a dataset based on the tickers you are interested in\n",
"<br>\n",
"The data is pulled directly from Yahoo! Finance API and variables are generated\n",
"<br>"
]
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Enter ticker, industry and broad index"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Example is HPQ, MTK (Morgan Stanley Tech SPDRS), and NASDAQ (^IXIC)\n",
"<br>\n",
"\"train_size\" is percentage of data you want to be used for training, the rest will be used for testing"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"start_date = '2009/1/1'\n",
"ticker_name = 'CSCO'\n",
"industry_name = 'MTK'\n",
"index_name = '^IXIC'\n",
"output_filename = ticker_name + '_' + industry_name + '_' + index_name\n",
"\n",
"train_size = 9./10"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 219
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"ticker = pandas.io.data.DataReader(ticker_name,'yahoo', start = start_date)\n",
"industry = mtk = pandas.io.data.DataReader(industry_name,'yahoo', start = start_date)\n",
"index = pandas.io.data.DataReader(index_name,'yahoo', start = start_date)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 220
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Paramters for creating variables"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"days = [1, 5, 10, 15, 20, 30]\n",
"forward_day = 11\n",
"short_term_ma = [5, 12, 26]\n",
"long_term_ma = [12, 26, 50]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 221
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"tickerx = generate_df(ticker, ticker_name)\n",
"industryx = generate_df(mtk, industry_name, False)\n",
"indexx = generate_df(nas, index_name, False)\n",
"final = tickerx.join(industryx.join(indexx))\n",
"final.dropna(how='any', inplace=True)\n",
"final.to_csv(output_filename + '.csv')\n",
"create_train_test(final, train_size, output_filename)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 222
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"final.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CSCO_DSA</th>\n",
" <th>CSCO_PPO_1</th>\n",
" <th>CSCO_PVO_1</th>\n",
" <th>CSCO_PPO_2</th>\n",
" <th>CSCO_PVO_2</th>\n",
" <th>CSCO_PPO_3</th>\n",
" <th>CSCO_PVO_3</th>\n",
" <th>CSCO_VolPct_1d</th>\n",
" <th>CSCO_AdjClosePct_1d</th>\n",
" <th>CSCO_VolPct_5d</th>\n",
" <th>...</th>\n",
" <th>^IXIC_VolPct_5d</th>\n",
" <th>^IXIC_AdjClosePct_5d</th>\n",
" <th>^IXIC_VolPct_10d</th>\n",
" <th>^IXIC_AdjClosePct_10d</th>\n",
" <th>^IXIC_VolPct_15d</th>\n",
" <th>^IXIC_AdjClosePct_15d</th>\n",
" <th>^IXIC_VolPct_20d</th>\n",
" <th>^IXIC_AdjClosePct_20d</th>\n",
" <th>^IXIC_VolPct_30d</th>\n",
" <th>^IXIC_AdjClosePct_30d</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Date</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2009-03-16</th>\n",
" <td> 0.033566</td>\n",
" <td> 0.032162</td>\n",
" <td>-0.188036</td>\n",
" <td>-0.028587</td>\n",
" <td> 0.136516</td>\n",
" <td>-0.031996</td>\n",
" <td>-0.012073</td>\n",
" <td> 0.165732</td>\n",
" <td>-0.004267</td>\n",
" <td>-0.357839</td>\n",
" <td>...</td>\n",
" <td> 0.030597</td>\n",
" <td> 0.106713</td>\n",
" <td> 0.032635</td>\n",
" <td> 0.061360</td>\n",
" <td> 0.061545</td>\n",
" <td> 0.011746</td>\n",
" <td> 0.054904</td>\n",
" <td>-0.084947</td>\n",
" <td> 0.021839</td>\n",
" <td>-0.049038</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-03-17</th>\n",
" <td> 0.036896</td>\n",
" <td> 0.043272</td>\n",
" <td>-0.223184</td>\n",
" <td>-0.017766</td>\n",
" <td> 0.109283</td>\n",
" <td>-0.033181</td>\n",
" <td>-0.022775</td>\n",
" <td> 0.036311</td>\n",
" <td> 0.045000</td>\n",
" <td>-0.273935</td>\n",
" <td>...</td>\n",
" <td>-0.121412</td>\n",
" <td> 0.076442</td>\n",
" <td>-0.113580</td>\n",
" <td> 0.106812</td>\n",
" <td>-0.113876</td>\n",
" <td> 0.014065</td>\n",
" <td>-0.112248</td>\n",
" <td>-0.005814</td>\n",
" <td> 0.043355</td>\n",
" <td>-0.021627</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-03-18</th>\n",
" <td> 0.057392</td>\n",
" <td> 0.050439</td>\n",
" <td>-0.157317</td>\n",
" <td>-0.004887</td>\n",
" <td> 0.110437</td>\n",
" <td>-0.033272</td>\n",
" <td>-0.005328</td>\n",
" <td> 0.632135</td>\n",
" <td> 0.022556</td>\n",
" <td> 0.545722</td>\n",
" <td>...</td>\n",
" <td> 0.274729</td>\n",
" <td> 0.087180</td>\n",
" <td> 0.199451</td>\n",
" <td> 0.101556</td>\n",
" <td> 0.178892</td>\n",
" <td> 0.046154</td>\n",
" <td> 0.362239</td>\n",
" <td> 0.015838</td>\n",
" <td> 0.348861</td>\n",
" <td>-0.016540</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-03-19</th>\n",
" <td> 0.037170</td>\n",
" <td> 0.049266</td>\n",
" <td>-0.129913</td>\n",
" <td> 0.004805</td>\n",
" <td> 0.111483</td>\n",
" <td>-0.030913</td>\n",
" <td>-0.012782</td>\n",
" <td>-0.333437</td>\n",
" <td>-0.016711</td>\n",
" <td> 0.142328</td>\n",
" <td>...</td>\n",
" <td>-0.028946</td>\n",
" <td> 0.040236</td>\n",
" <td>-0.001002</td>\n",
" <td> 0.141498</td>\n",
" <td> 0.009348</td>\n",
" <td> 0.066124</td>\n",
" <td> 0.167007</td>\n",
" <td> 0.028181</td>\n",
" <td> 0.057559</td>\n",
" <td>-0.020838</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2009-03-20</th>\n",
" <td> 0.050092</td>\n",
" <td> 0.050677</td>\n",
" <td>-0.056182</td>\n",
" <td> 0.009127</td>\n",
" <td> 0.092569</td>\n",
" <td>-0.029811</td>\n",
" <td>-0.005359</td>\n",
" <td> 0.166015</td>\n",
" <td>-0.019714</td>\n",
" <td> 0.532465</td>\n",
" <td>...</td>\n",
" <td> 0.184109</td>\n",
" <td> 0.018002</td>\n",
" <td>-0.020194</td>\n",
" <td> 0.126305</td>\n",
" <td> 0.000497</td>\n",
" <td> 0.057648</td>\n",
" <td>-0.052085</td>\n",
" <td> 0.011129</td>\n",
" <td>-0.046693</td>\n",
" <td>-0.057540</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows \u00d7 58 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 223,
"text": [
" CSCO_DSA CSCO_PPO_1 CSCO_PVO_1 CSCO_PPO_2 CSCO_PVO_2 \\\n",
"Date \n",
"2009-03-16 0.033566 0.032162 -0.188036 -0.028587 0.136516 \n",
"2009-03-17 0.036896 0.043272 -0.223184 -0.017766 0.109283 \n",
"2009-03-18 0.057392 0.050439 -0.157317 -0.004887 0.110437 \n",
"2009-03-19 0.037170 0.049266 -0.129913 0.004805 0.111483 \n",
"2009-03-20 0.050092 0.050677 -0.056182 0.009127 0.092569 \n",
"\n",
" CSCO_PPO_3 CSCO_PVO_3 CSCO_VolPct_1d CSCO_AdjClosePct_1d \\\n",
"Date \n",
"2009-03-16 -0.031996 -0.012073 0.165732 -0.004267 \n",
"2009-03-17 -0.033181 -0.022775 0.036311 0.045000 \n",
"2009-03-18 -0.033272 -0.005328 0.632135 0.022556 \n",
"2009-03-19 -0.030913 -0.012782 -0.333437 -0.016711 \n",
"2009-03-20 -0.029811 -0.005359 0.166015 -0.019714 \n",
"\n",
" CSCO_VolPct_5d ... ^IXIC_VolPct_5d \\\n",
"Date ... \n",
"2009-03-16 -0.357839 ... 0.030597 \n",
"2009-03-17 -0.273935 ... -0.121412 \n",
"2009-03-18 0.545722 ... 0.274729 \n",
"2009-03-19 0.142328 ... -0.028946 \n",
"2009-03-20 0.532465 ... 0.184109 \n",
"\n",
" ^IXIC_AdjClosePct_5d ^IXIC_VolPct_10d ^IXIC_AdjClosePct_10d \\\n",
"Date \n",
"2009-03-16 0.106713 0.032635 0.061360 \n",
"2009-03-17 0.076442 -0.113580 0.106812 \n",
"2009-03-18 0.087180 0.199451 0.101556 \n",
"2009-03-19 0.040236 -0.001002 0.141498 \n",
"2009-03-20 0.018002 -0.020194 0.126305 \n",
"\n",
" ^IXIC_VolPct_15d ^IXIC_AdjClosePct_15d ^IXIC_VolPct_20d \\\n",
"Date \n",
"2009-03-16 0.061545 0.011746 0.054904 \n",
"2009-03-17 -0.113876 0.014065 -0.112248 \n",
"2009-03-18 0.178892 0.046154 0.362239 \n",
"2009-03-19 0.009348 0.066124 0.167007 \n",
"2009-03-20 0.000497 0.057648 -0.052085 \n",
"\n",
" ^IXIC_AdjClosePct_20d ^IXIC_VolPct_30d ^IXIC_AdjClosePct_30d \n",
"Date \n",
"2009-03-16 -0.084947 0.021839 -0.049038 \n",
"2009-03-17 -0.005814 0.043355 -0.021627 \n",
"2009-03-18 0.015838 0.348861 -0.016540 \n",
"2009-03-19 0.028181 0.057559 -0.020838 \n",
"2009-03-20 0.011129 -0.046693 -0.057540 \n",
"\n",
"[5 rows x 58 columns]"
]
}
],
"prompt_number": 223
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Python libraries and functions (load first)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd\n",
"import pandas.io.data\n",
"import math\n",
"\n",
"def PXO(df, col, window_short = 12, window_long = 26):\n",
" return (pd.stats.moments.rolling_mean(df[col], window = window_short) - pd.stats.moments.rolling_mean(df[col], window = window_long)) / pd.stats.moments.rolling_mean(df[col], window = window_long)\n",
"\n",
"def DSA(df):\n",
" return (df.High -df.Low)/df.Open\n",
"\n",
"def pct_change(df, col, periods = 1):\n",
" return df[col].pct_change(periods=periods)\n",
"\n",
"def create_train_test(df, train_size, name):\n",
" df.ix[0:int(math.floor(train_size * len(df)))-1, :].to_csv(name + '_train.csv')\n",
" test = df.ix[int(math.floor(train_size * len(df))):len(df)-1, :]\n",
" test['UID'] = range(len(test))\n",
" test.to_csv(name + '_test.csv')\n",
" \n",
"def generate_df(df, ext, forward = True):\n",
" dfx = df.copy()\n",
" df_new = pd.DataFrame()\n",
" df_new['DSA'] = DSA(dfx)\n",
"\n",
" # PPO and PVO\n",
" for i in range(len(short_term_ma)):\n",
" df_new['PPO_' + str(i+1)] = PXO(dfx, 'Adj Close', short_term_ma[i], long_term_ma[i])\n",
" df_new['PVO_' + str(i+1)] = PXO(dfx, 'Volume', short_term_ma[i], long_term_ma[i])\n",
"\n",
" # Volume and adjClose % changes for different periods\n",
" for i in range(len(days)):\n",
" df_new['VolPct_' + str(days[i]) + 'd'] = pct_change(dfx, 'Volume', periods = days[i])\n",
" df_new['AdjClosePct_' + str(days[i]) + 'd'] = pct_change(dfx, 'Adj Close', periods = days[i])\n",
"\n",
" # Forward price % change\n",
" if forward:\n",
" #for i in range(-1,-(forward_days + 1),-1):\n",
" df_new['AdjClosePct_' + str(abs(forward_day)) + 'd_Fwd'] = pct_change(dfx, 'Volume', periods = forward_day)\n",
" \n",
" return df_new.rename(columns=lambda x: ext + '_' + x)"
],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment