Created
February 27, 2015 21:40
-
-
Save amirziai/31cc3277c3812faffed0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:dbe7091a498275c9754ee330eaf46671194336228630999420591891e5b780f9" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Customizable ticker dataset generator" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"This script will generate a dataset based on the tickers you are interested in\n", | |
"<br>\n", | |
"The data is pulled directly from Yahoo! Finance API and variables are generated\n", | |
"<br>" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Enter ticker, industry and broad index" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Example is HPQ, MTK (Morgan Stanley Tech SPDRS), and NASDAQ (^IXIC)\n", | |
"<br>\n", | |
"\"train_size\" is percentage of data you want to be used for training, the rest will be used for testing" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"start_date = '2009/1/1'\n", | |
"ticker_name = 'CSCO'\n", | |
"industry_name = 'MTK'\n", | |
"index_name = '^IXIC'\n", | |
"output_filename = ticker_name + '_' + industry_name + '_' + index_name\n", | |
"\n", | |
"train_size = 9./10" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 219 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"ticker = pandas.io.data.DataReader(ticker_name,'yahoo', start = start_date)\n", | |
"industry = mtk = pandas.io.data.DataReader(industry_name,'yahoo', start = start_date)\n", | |
"index = pandas.io.data.DataReader(index_name,'yahoo', start = start_date)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 220 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Paramters for creating variables" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"days = [1, 5, 10, 15, 20, 30]\n", | |
"forward_day = 11\n", | |
"short_term_ma = [5, 12, 26]\n", | |
"long_term_ma = [12, 26, 50]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 221 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"tickerx = generate_df(ticker, ticker_name)\n", | |
"industryx = generate_df(mtk, industry_name, False)\n", | |
"indexx = generate_df(nas, index_name, False)\n", | |
"final = tickerx.join(industryx.join(indexx))\n", | |
"final.dropna(how='any', inplace=True)\n", | |
"final.to_csv(output_filename + '.csv')\n", | |
"create_train_test(final, train_size, output_filename)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 222 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"final.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>CSCO_DSA</th>\n", | |
" <th>CSCO_PPO_1</th>\n", | |
" <th>CSCO_PVO_1</th>\n", | |
" <th>CSCO_PPO_2</th>\n", | |
" <th>CSCO_PVO_2</th>\n", | |
" <th>CSCO_PPO_3</th>\n", | |
" <th>CSCO_PVO_3</th>\n", | |
" <th>CSCO_VolPct_1d</th>\n", | |
" <th>CSCO_AdjClosePct_1d</th>\n", | |
" <th>CSCO_VolPct_5d</th>\n", | |
" <th>...</th>\n", | |
" <th>^IXIC_VolPct_5d</th>\n", | |
" <th>^IXIC_AdjClosePct_5d</th>\n", | |
" <th>^IXIC_VolPct_10d</th>\n", | |
" <th>^IXIC_AdjClosePct_10d</th>\n", | |
" <th>^IXIC_VolPct_15d</th>\n", | |
" <th>^IXIC_AdjClosePct_15d</th>\n", | |
" <th>^IXIC_VolPct_20d</th>\n", | |
" <th>^IXIC_AdjClosePct_20d</th>\n", | |
" <th>^IXIC_VolPct_30d</th>\n", | |
" <th>^IXIC_AdjClosePct_30d</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Date</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2009-03-16</th>\n", | |
" <td> 0.033566</td>\n", | |
" <td> 0.032162</td>\n", | |
" <td>-0.188036</td>\n", | |
" <td>-0.028587</td>\n", | |
" <td> 0.136516</td>\n", | |
" <td>-0.031996</td>\n", | |
" <td>-0.012073</td>\n", | |
" <td> 0.165732</td>\n", | |
" <td>-0.004267</td>\n", | |
" <td>-0.357839</td>\n", | |
" <td>...</td>\n", | |
" <td> 0.030597</td>\n", | |
" <td> 0.106713</td>\n", | |
" <td> 0.032635</td>\n", | |
" <td> 0.061360</td>\n", | |
" <td> 0.061545</td>\n", | |
" <td> 0.011746</td>\n", | |
" <td> 0.054904</td>\n", | |
" <td>-0.084947</td>\n", | |
" <td> 0.021839</td>\n", | |
" <td>-0.049038</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-03-17</th>\n", | |
" <td> 0.036896</td>\n", | |
" <td> 0.043272</td>\n", | |
" <td>-0.223184</td>\n", | |
" <td>-0.017766</td>\n", | |
" <td> 0.109283</td>\n", | |
" <td>-0.033181</td>\n", | |
" <td>-0.022775</td>\n", | |
" <td> 0.036311</td>\n", | |
" <td> 0.045000</td>\n", | |
" <td>-0.273935</td>\n", | |
" <td>...</td>\n", | |
" <td>-0.121412</td>\n", | |
" <td> 0.076442</td>\n", | |
" <td>-0.113580</td>\n", | |
" <td> 0.106812</td>\n", | |
" <td>-0.113876</td>\n", | |
" <td> 0.014065</td>\n", | |
" <td>-0.112248</td>\n", | |
" <td>-0.005814</td>\n", | |
" <td> 0.043355</td>\n", | |
" <td>-0.021627</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-03-18</th>\n", | |
" <td> 0.057392</td>\n", | |
" <td> 0.050439</td>\n", | |
" <td>-0.157317</td>\n", | |
" <td>-0.004887</td>\n", | |
" <td> 0.110437</td>\n", | |
" <td>-0.033272</td>\n", | |
" <td>-0.005328</td>\n", | |
" <td> 0.632135</td>\n", | |
" <td> 0.022556</td>\n", | |
" <td> 0.545722</td>\n", | |
" <td>...</td>\n", | |
" <td> 0.274729</td>\n", | |
" <td> 0.087180</td>\n", | |
" <td> 0.199451</td>\n", | |
" <td> 0.101556</td>\n", | |
" <td> 0.178892</td>\n", | |
" <td> 0.046154</td>\n", | |
" <td> 0.362239</td>\n", | |
" <td> 0.015838</td>\n", | |
" <td> 0.348861</td>\n", | |
" <td>-0.016540</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-03-19</th>\n", | |
" <td> 0.037170</td>\n", | |
" <td> 0.049266</td>\n", | |
" <td>-0.129913</td>\n", | |
" <td> 0.004805</td>\n", | |
" <td> 0.111483</td>\n", | |
" <td>-0.030913</td>\n", | |
" <td>-0.012782</td>\n", | |
" <td>-0.333437</td>\n", | |
" <td>-0.016711</td>\n", | |
" <td> 0.142328</td>\n", | |
" <td>...</td>\n", | |
" <td>-0.028946</td>\n", | |
" <td> 0.040236</td>\n", | |
" <td>-0.001002</td>\n", | |
" <td> 0.141498</td>\n", | |
" <td> 0.009348</td>\n", | |
" <td> 0.066124</td>\n", | |
" <td> 0.167007</td>\n", | |
" <td> 0.028181</td>\n", | |
" <td> 0.057559</td>\n", | |
" <td>-0.020838</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2009-03-20</th>\n", | |
" <td> 0.050092</td>\n", | |
" <td> 0.050677</td>\n", | |
" <td>-0.056182</td>\n", | |
" <td> 0.009127</td>\n", | |
" <td> 0.092569</td>\n", | |
" <td>-0.029811</td>\n", | |
" <td>-0.005359</td>\n", | |
" <td> 0.166015</td>\n", | |
" <td>-0.019714</td>\n", | |
" <td> 0.532465</td>\n", | |
" <td>...</td>\n", | |
" <td> 0.184109</td>\n", | |
" <td> 0.018002</td>\n", | |
" <td>-0.020194</td>\n", | |
" <td> 0.126305</td>\n", | |
" <td> 0.000497</td>\n", | |
" <td> 0.057648</td>\n", | |
" <td>-0.052085</td>\n", | |
" <td> 0.011129</td>\n", | |
" <td>-0.046693</td>\n", | |
" <td>-0.057540</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows \u00d7 58 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 223, | |
"text": [ | |
" CSCO_DSA CSCO_PPO_1 CSCO_PVO_1 CSCO_PPO_2 CSCO_PVO_2 \\\n", | |
"Date \n", | |
"2009-03-16 0.033566 0.032162 -0.188036 -0.028587 0.136516 \n", | |
"2009-03-17 0.036896 0.043272 -0.223184 -0.017766 0.109283 \n", | |
"2009-03-18 0.057392 0.050439 -0.157317 -0.004887 0.110437 \n", | |
"2009-03-19 0.037170 0.049266 -0.129913 0.004805 0.111483 \n", | |
"2009-03-20 0.050092 0.050677 -0.056182 0.009127 0.092569 \n", | |
"\n", | |
" CSCO_PPO_3 CSCO_PVO_3 CSCO_VolPct_1d CSCO_AdjClosePct_1d \\\n", | |
"Date \n", | |
"2009-03-16 -0.031996 -0.012073 0.165732 -0.004267 \n", | |
"2009-03-17 -0.033181 -0.022775 0.036311 0.045000 \n", | |
"2009-03-18 -0.033272 -0.005328 0.632135 0.022556 \n", | |
"2009-03-19 -0.030913 -0.012782 -0.333437 -0.016711 \n", | |
"2009-03-20 -0.029811 -0.005359 0.166015 -0.019714 \n", | |
"\n", | |
" CSCO_VolPct_5d ... ^IXIC_VolPct_5d \\\n", | |
"Date ... \n", | |
"2009-03-16 -0.357839 ... 0.030597 \n", | |
"2009-03-17 -0.273935 ... -0.121412 \n", | |
"2009-03-18 0.545722 ... 0.274729 \n", | |
"2009-03-19 0.142328 ... -0.028946 \n", | |
"2009-03-20 0.532465 ... 0.184109 \n", | |
"\n", | |
" ^IXIC_AdjClosePct_5d ^IXIC_VolPct_10d ^IXIC_AdjClosePct_10d \\\n", | |
"Date \n", | |
"2009-03-16 0.106713 0.032635 0.061360 \n", | |
"2009-03-17 0.076442 -0.113580 0.106812 \n", | |
"2009-03-18 0.087180 0.199451 0.101556 \n", | |
"2009-03-19 0.040236 -0.001002 0.141498 \n", | |
"2009-03-20 0.018002 -0.020194 0.126305 \n", | |
"\n", | |
" ^IXIC_VolPct_15d ^IXIC_AdjClosePct_15d ^IXIC_VolPct_20d \\\n", | |
"Date \n", | |
"2009-03-16 0.061545 0.011746 0.054904 \n", | |
"2009-03-17 -0.113876 0.014065 -0.112248 \n", | |
"2009-03-18 0.178892 0.046154 0.362239 \n", | |
"2009-03-19 0.009348 0.066124 0.167007 \n", | |
"2009-03-20 0.000497 0.057648 -0.052085 \n", | |
"\n", | |
" ^IXIC_AdjClosePct_20d ^IXIC_VolPct_30d ^IXIC_AdjClosePct_30d \n", | |
"Date \n", | |
"2009-03-16 -0.084947 0.021839 -0.049038 \n", | |
"2009-03-17 -0.005814 0.043355 -0.021627 \n", | |
"2009-03-18 0.015838 0.348861 -0.016540 \n", | |
"2009-03-19 0.028181 0.057559 -0.020838 \n", | |
"2009-03-20 0.011129 -0.046693 -0.057540 \n", | |
"\n", | |
"[5 rows x 58 columns]" | |
] | |
} | |
], | |
"prompt_number": 223 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Python libraries and functions (load first)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import pandas as pd\n", | |
"import pandas.io.data\n", | |
"import math\n", | |
"\n", | |
"def PXO(df, col, window_short = 12, window_long = 26):\n", | |
" return (pd.stats.moments.rolling_mean(df[col], window = window_short) - pd.stats.moments.rolling_mean(df[col], window = window_long)) / pd.stats.moments.rolling_mean(df[col], window = window_long)\n", | |
"\n", | |
"def DSA(df):\n", | |
" return (df.High -df.Low)/df.Open\n", | |
"\n", | |
"def pct_change(df, col, periods = 1):\n", | |
" return df[col].pct_change(periods=periods)\n", | |
"\n", | |
"def create_train_test(df, train_size, name):\n", | |
" df.ix[0:int(math.floor(train_size * len(df)))-1, :].to_csv(name + '_train.csv')\n", | |
" test = df.ix[int(math.floor(train_size * len(df))):len(df)-1, :]\n", | |
" test['UID'] = range(len(test))\n", | |
" test.to_csv(name + '_test.csv')\n", | |
" \n", | |
"def generate_df(df, ext, forward = True):\n", | |
" dfx = df.copy()\n", | |
" df_new = pd.DataFrame()\n", | |
" df_new['DSA'] = DSA(dfx)\n", | |
"\n", | |
" # PPO and PVO\n", | |
" for i in range(len(short_term_ma)):\n", | |
" df_new['PPO_' + str(i+1)] = PXO(dfx, 'Adj Close', short_term_ma[i], long_term_ma[i])\n", | |
" df_new['PVO_' + str(i+1)] = PXO(dfx, 'Volume', short_term_ma[i], long_term_ma[i])\n", | |
"\n", | |
" # Volume and adjClose % changes for different periods\n", | |
" for i in range(len(days)):\n", | |
" df_new['VolPct_' + str(days[i]) + 'd'] = pct_change(dfx, 'Volume', periods = days[i])\n", | |
" df_new['AdjClosePct_' + str(days[i]) + 'd'] = pct_change(dfx, 'Adj Close', periods = days[i])\n", | |
"\n", | |
" # Forward price % change\n", | |
" if forward:\n", | |
" #for i in range(-1,-(forward_days + 1),-1):\n", | |
" df_new['AdjClosePct_' + str(abs(forward_day)) + 'd_Fwd'] = pct_change(dfx, 'Volume', periods = forward_day)\n", | |
" \n", | |
" return df_new.rename(columns=lambda x: ext + '_' + x)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment