Skip to content

Instantly share code, notes, and snippets.

@va2577
Created May 15, 2018 08:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save va2577/f8b20698656d679b948b76660868972e to your computer and use it in GitHub Desktop.
Save va2577/f8b20698656d679b948b76660868972e to your computer and use it in GitHub Desktop.
ヒストリカルデータリサンプリング
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# データ\n",
"\n",
"```shell\n",
"$ head USDJPY_1\\ Min_Bid_2000.01.01_2018.01.01.csv\n",
"Time (UTC),Open,High,Low,Close,Volume\n",
"2003.05.04 21:00:00,118.94,118.952,118.94,118.952,253\n",
"2003.05.04 21:01:00,118.961,118.967,118.958,118.967,154.6\n",
"2003.05.04 21:02:00,118.972,118.972,118.955,118.955,219.7\n",
"2003.05.04 21:03:00,118.953,118.961,118.949,118.949,309.9\n",
"2003.05.04 21:04:00,118.953,118.953,118.946,118.946,229.4\n",
"2003.05.04 21:05:00,118.952,118.954,118.944,118.944,112.2\n",
"2003.05.04 21:06:00,118.95,118.952,118.945,118.945,170.2\n",
"2003.05.04 21:07:00,118.947,118.956,118.947,118.947,124.5\n",
"2003.05.04 21:08:00,118.946,118.954,118.934,118.934,355\n",
"$ wc -l USDJPY_1\\ Min_Bid_2000.01.01_2018.01.01.csv\n",
"5509561 USDJPY_1 Min_Bid_2000.01.01_2018.01.01.csv\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# データを読み込む"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 13.2 s, sys: 3.8 s, total: 17 s\n",
"Wall time: 20.2 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>open</th>\n",
" <th>high</th>\n",
" <th>low</th>\n",
" <th>close</th>\n",
" <th>volume</th>\n",
" </tr>\n",
" <tr>\n",
" <th>time</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2003-05-04 21:00:00</th>\n",
" <td>118.940</td>\n",
" <td>118.952</td>\n",
" <td>118.940</td>\n",
" <td>118.952</td>\n",
" <td>253.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2003-05-04 21:01:00</th>\n",
" <td>118.961</td>\n",
" <td>118.967</td>\n",
" <td>118.958</td>\n",
" <td>118.967</td>\n",
" <td>154.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2003-05-04 21:02:00</th>\n",
" <td>118.972</td>\n",
" <td>118.972</td>\n",
" <td>118.955</td>\n",
" <td>118.955</td>\n",
" <td>219.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2003-05-04 21:03:00</th>\n",
" <td>118.953</td>\n",
" <td>118.961</td>\n",
" <td>118.949</td>\n",
" <td>118.949</td>\n",
" <td>309.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2003-05-04 21:04:00</th>\n",
" <td>118.953</td>\n",
" <td>118.953</td>\n",
" <td>118.946</td>\n",
" <td>118.946</td>\n",
" <td>229.4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" open high low close volume\n",
"time \n",
"2003-05-04 21:00:00 118.940 118.952 118.940 118.952 253.0\n",
"2003-05-04 21:01:00 118.961 118.967 118.958 118.967 154.6\n",
"2003-05-04 21:02:00 118.972 118.972 118.955 118.955 219.7\n",
"2003-05-04 21:03:00 118.953 118.961 118.949 118.949 309.9\n",
"2003-05-04 21:04:00 118.953 118.953 118.946 118.946 229.4"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the 1-minute bid bars into df, indexed by the parsed timestamp column.\n",
"# Column names and dtypes are declared explicitly; this replaced an earlier plain\n",
"# read_csv(..., index_col='Time (UTC)') call because specifying the types was\n",
"# said to be the better practice.\n",
"# header=0 makes pandas discard the file's own header row in favour of names1.\n",
"csv_path = 'USDJPY_1 Min_Bid_2000.01.01_2018.01.01.csv'\n",
"value_columns = ['open', 'high', 'low', 'close', 'volume']\n",
"names1 = ['time'] + value_columns\n",
"dtype1 = { name: (str if name == 'time' else float) for name in names1 }\n",
"%time df = pd.read_csv(filepath_or_buffer=csv_path, dtype=dtype1, header=0, index_col='time', names=names1, parse_dates=['time'])\n",
"\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 日足のローソク足を 5 本にするため時間の計算"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 93.8 ms, sys: 312 ms, total: 406 ms\n",
"Wall time: 531 ms\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>open</th>\n",
" <th>high</th>\n",
" <th>low</th>\n",
" <th>close</th>\n",
" <th>volume</th>\n",
" </tr>\n",
" <tr>\n",
" <th>time</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2003-05-05 00:00:00</th>\n",
" <td>118.940</td>\n",
" <td>118.952</td>\n",
" <td>118.940</td>\n",
" <td>118.952</td>\n",
" <td>253.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2003-05-05 00:01:00</th>\n",
" <td>118.961</td>\n",
" <td>118.967</td>\n",
" <td>118.958</td>\n",
" <td>118.967</td>\n",
" <td>154.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2003-05-05 00:02:00</th>\n",
" <td>118.972</td>\n",
" <td>118.972</td>\n",
" <td>118.955</td>\n",
" <td>118.955</td>\n",
" <td>219.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2003-05-05 00:03:00</th>\n",
" <td>118.953</td>\n",
" <td>118.961</td>\n",
" <td>118.949</td>\n",
" <td>118.949</td>\n",
" <td>309.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2003-05-05 00:04:00</th>\n",
" <td>118.953</td>\n",
" <td>118.953</td>\n",
" <td>118.946</td>\n",
" <td>118.946</td>\n",
" <td>229.4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" open high low close volume\n",
"time \n",
"2003-05-05 00:00:00 118.940 118.952 118.940 118.952 253.0\n",
"2003-05-05 00:01:00 118.961 118.967 118.958 118.967 154.6\n",
"2003-05-05 00:02:00 118.972 118.972 118.955 118.955 219.7\n",
"2003-05-05 00:03:00 118.953 118.961 118.949 118.949 309.9\n",
"2003-05-05 00:04:00 118.953 118.953 118.946 118.946 229.4"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Re-index the UTC bars onto a trading-day clock (New York wall time + 7 hours)\n",
"# so that the week starts at 00:00 on Monday and resamples into 5 daily candles\n",
"# instead of 6.\n",
"#\n",
"# A constant +3 hours (EEST, Eastern European Summer Time) is only right while\n",
"# daylight saving time is in effect: 2003-05-04 21:00:00 (Sun) becomes\n",
"# 2003-05-05 00:00:00 (Mon). The data covers whole years, and outside summer time\n",
"# the matching offset is +2 hours, so a fixed +3 would shift winter bars by one\n",
"# hour too many.\n",
"# NOTE(review): assumes the feed's week opens at 17:00 New York time all year\n",
"# round - confirm against the winter rows of the CSV.\n",
"#\n",
"# tz_localize / tz_convert return a new index instead of changing df, so their\n",
"# result must be captured for resample to see it. tz_localize(None) then drops the\n",
"# zone but keeps the local wall-clock values.\n",
"utc_index = df.index.tz_localize('UTC')\n",
"new_york_wall_clock = utc_index.tz_convert('America/New_York').tz_localize(None)\n",
"index1 = new_york_wall_clock + pd.Timedelta(hours=7)\n",
"\n",
"# set_index returns a new DataFrame, so df itself stays unmodified.\n",
"%time df2 = df.set_index(index1)\n",
"\n",
"df2.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* [pandas.DataFrame.resample — pandas 0.22.0 documentation](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.resample.html)\n",
"* [pandas.core.resample.Resampler.ohlc — pandas 0.22.0 documentation](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.resample.Resampler.ohlc.html)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# リサンプリング"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 250 ms, sys: 78.1 ms, total: 328 ms\n",
"Wall time: 456 ms\n",
"CPU times: user 172 ms, sys: 93.8 ms, total: 266 ms\n",
"Wall time: 274 ms\n",
"CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n",
"Wall time: 3.03 ms\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>open</th>\n",
" <th>high</th>\n",
" <th>low</th>\n",
" <th>close</th>\n",
" <th>volume</th>\n",
" </tr>\n",
" <tr>\n",
" <th>time</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2003-05-05</th>\n",
" <td>118.940</td>\n",
" <td>119.046</td>\n",
" <td>118.461</td>\n",
" <td>118.603</td>\n",
" <td>592866.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2003-05-06</th>\n",
" <td>118.591</td>\n",
" <td>118.751</td>\n",
" <td>117.290</td>\n",
" <td>117.500</td>\n",
" <td>581707.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2003-05-07</th>\n",
" <td>117.456</td>\n",
" <td>117.830</td>\n",
" <td>116.052</td>\n",
" <td>116.303</td>\n",
" <td>584496.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2003-05-08</th>\n",
" <td>116.311</td>\n",
" <td>116.969</td>\n",
" <td>115.940</td>\n",
" <td>116.823</td>\n",
" <td>588236.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2003-05-09</th>\n",
" <td>116.835</td>\n",
" <td>117.612</td>\n",
" <td>116.794</td>\n",
" <td>117.151</td>\n",
" <td>583132.9</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" open high low close volume\n",
"time \n",
"2003-05-05 118.940 119.046 118.461 118.603 592866.9\n",
"2003-05-06 118.591 118.751 117.290 117.500 581707.0\n",
"2003-05-07 117.456 117.830 116.052 116.303 584496.2\n",
"2003-05-08 116.311 116.969 115.940 116.823 588236.7\n",
"2003-05-09 116.835 117.612 116.794 117.151 583132.9"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build the daily bars in one resample pass, with one aggregation rule per column.\n",
"# Calling .ohlc() on the whole frame computes open/high/low/close for every\n",
"# column (the daily open then has to be read as df3['open']['open']), and a\n",
"# separate .sum() also totals the price columns; most of that work is discarded.\n",
"# resample(rule='D', how='ohlc') is not used either: the how parameter is\n",
"# deprecated in pandas 0.22.0 in favour of .resample(...).ohlc().\n",
"columns2 = ['open', 'high', 'low', 'close', 'volume']\n",
"how2 = { 'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last', 'volume': 'sum' }\n",
"\n",
"# Saturdays and Sundays show up in the result as empty bins with NaN prices, so\n",
"# dropna removes them. Selecting columns2 pins the column order independently of\n",
"# dict ordering.\n",
"%time df5 = df2.resample(rule='D').agg(how2)[columns2].dropna()\n",
"\n",
"df5.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 78.1 ms, sys: 0 ns, total: 78.1 ms\n",
"Wall time: 184 ms\n"
]
},
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7fbb306e9518>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%time df5.plot(y='close')\n",
"# CPU times: user 125 ms, sys: 46.9 ms, total: 172 ms\n",
"# Wall time: 198 ms"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"df5.to_csv(path_or_buf='USDJPY_daily.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment