Created
February 17, 2023 00:46
-
-
Save nagishin/05ca907559dc758e17c4bd0ff5d71d12 to your computer and use it in GitHub Desktop.
polars_csv_io.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"authorship_tag": "ABX9TyPxuAlO+/TUfvbThZCXU7jJ", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/nagishin/05ca907559dc758e17c4bd0ff5d71d12/polars_csv_io.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "xzUVxly7XSep" | |
}, | |
"outputs": [], | |
"source": [ | |
"!pip install polars\n", | |
"!pip install line_profiler\n", | |
"!pip install memory_profiler" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%load_ext line_profiler\n", | |
"%load_ext memory_profiler" | |
], | |
"metadata": { | |
"id": "amGbcBfJbhRG" | |
}, | |
"execution_count": 28, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## 【初期設定】" | |
], | |
"metadata": { | |
"id": "dDWBU_d3x9wm" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import os\n", | |
"import time\n", | |
"from datetime import datetime\n", | |
"import pandas as pd\n", | |
"import polars as pl\n", | |
"\n", | |
"SYMBOL = 'BTCUSDT' # 通貨ペア\n", | |
"DATE = '2023-01-18' # 取得日\n", | |
"\n", | |
"DL_PATH = f'https://public.bybit.com/trading/{SYMBOL}/{SYMBOL}{DATE}.csv.gz' # ダウンロードパス\n", | |
"FILE_PATH = f'./trades_{SYMBOL}_{DATE}.csv' # CSVパス\n", | |
"\n", | |
"# ファイル存在チェック\n", | |
"if os.path.isfile(FILE_PATH) == False:\n", | |
" df_csv = pd.read_csv(DL_PATH, compression='gzip') # bybit約定履歴 1日分取得\n", | |
" df_csv.to_csv(FILE_PATH, header=True, index=False) # ローカルに保存\n", | |
"else:\n", | |
" df_csv = pd.read_csv(FILE_PATH) # ローカルCSV読み込み\n", | |
"\n", | |
"csv_records = len(df_csv)\n", | |
"csv_size = round(os.path.getsize(FILE_PATH)/1024/1024, 3)\n", | |
"\n", | |
"print(f'[約定履歴CSV]: {FILE_PATH}')\n", | |
"print(f'[レコード数 ]: {csv_records:,} rows')\n", | |
"print(f'[CSVサイズ ]: {csv_size:,} MB')\n", | |
"display(df_csv.head())" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 261 | |
}, | |
"id": "e750Ad3nXUgD", | |
"outputId": "2d6a66fa-15c9-4c1a-8976-24ffe1768689" | |
}, | |
"execution_count": 65, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"[約定履歴CSV]: ./trades_BTCUSDT_2023-01-18.csv\n", | |
"[レコード数 ]: 1,386,138 rows\n", | |
"[CSVサイズ ]: 160.152 MB\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
" timestamp symbol side size price tickDirection \\\n", | |
"0 1.674000e+09 BTCUSDT Sell 0.010 21118.0 ZeroMinusTick \n", | |
"1 1.674000e+09 BTCUSDT Sell 0.024 21118.0 ZeroMinusTick \n", | |
"2 1.674000e+09 BTCUSDT Sell 0.063 21118.0 ZeroMinusTick \n", | |
"3 1.674000e+09 BTCUSDT Buy 0.001 21118.5 PlusTick \n", | |
"4 1.674000e+09 BTCUSDT Sell 0.337 21118.0 MinusTick \n", | |
"\n", | |
" trdMatchID grossValue homeNotional \\\n", | |
"0 9165901d-8b04-5bdb-bba5-7365be082394 2.111800e+10 0.010 \n", | |
"1 85505b5d-6386-5054-a61d-f332e83ba3e6 5.068320e+10 0.024 \n", | |
"2 749209ec-92a7-5b8d-81d8-1dd0fdc9a6f4 1.330434e+11 0.063 \n", | |
"3 98f7e7ec-a266-5909-868c-1677554a79f9 2.111850e+09 0.001 \n", | |
"4 ba378ff8-3c09-5294-a2d4-a4c9defad9aa 7.116766e+11 0.337 \n", | |
"\n", | |
" foreignNotional \n", | |
"0 211.1800 \n", | |
"1 506.8320 \n", | |
"2 1330.4340 \n", | |
"3 21.1185 \n", | |
"4 7116.7660 " | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-0bbdf472-f08c-4559-96d9-6f60ca130263\">\n", | |
" <div class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>timestamp</th>\n", | |
" <th>symbol</th>\n", | |
" <th>side</th>\n", | |
" <th>size</th>\n", | |
" <th>price</th>\n", | |
" <th>tickDirection</th>\n", | |
" <th>trdMatchID</th>\n", | |
" <th>grossValue</th>\n", | |
" <th>homeNotional</th>\n", | |
" <th>foreignNotional</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1.674000e+09</td>\n", | |
" <td>BTCUSDT</td>\n", | |
" <td>Sell</td>\n", | |
" <td>0.010</td>\n", | |
" <td>21118.0</td>\n", | |
" <td>ZeroMinusTick</td>\n", | |
" <td>9165901d-8b04-5bdb-bba5-7365be082394</td>\n", | |
" <td>2.111800e+10</td>\n", | |
" <td>0.010</td>\n", | |
" <td>211.1800</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1.674000e+09</td>\n", | |
" <td>BTCUSDT</td>\n", | |
" <td>Sell</td>\n", | |
" <td>0.024</td>\n", | |
" <td>21118.0</td>\n", | |
" <td>ZeroMinusTick</td>\n", | |
" <td>85505b5d-6386-5054-a61d-f332e83ba3e6</td>\n", | |
" <td>5.068320e+10</td>\n", | |
" <td>0.024</td>\n", | |
" <td>506.8320</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1.674000e+09</td>\n", | |
" <td>BTCUSDT</td>\n", | |
" <td>Sell</td>\n", | |
" <td>0.063</td>\n", | |
" <td>21118.0</td>\n", | |
" <td>ZeroMinusTick</td>\n", | |
" <td>749209ec-92a7-5b8d-81d8-1dd0fdc9a6f4</td>\n", | |
" <td>1.330434e+11</td>\n", | |
" <td>0.063</td>\n", | |
" <td>1330.4340</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1.674000e+09</td>\n", | |
" <td>BTCUSDT</td>\n", | |
" <td>Buy</td>\n", | |
" <td>0.001</td>\n", | |
" <td>21118.5</td>\n", | |
" <td>PlusTick</td>\n", | |
" <td>98f7e7ec-a266-5909-868c-1677554a79f9</td>\n", | |
" <td>2.111850e+09</td>\n", | |
" <td>0.001</td>\n", | |
" <td>21.1185</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1.674000e+09</td>\n", | |
" <td>BTCUSDT</td>\n", | |
" <td>Sell</td>\n", | |
" <td>0.337</td>\n", | |
" <td>21118.0</td>\n", | |
" <td>MinusTick</td>\n", | |
" <td>ba378ff8-3c09-5294-a2d4-a4c9defad9aa</td>\n", | |
" <td>7.116766e+11</td>\n", | |
" <td>0.337</td>\n", | |
" <td>7116.7660</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-0bbdf472-f08c-4559-96d9-6f60ca130263')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
" \n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", | |
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" \n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" flex-wrap:wrap;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-0bbdf472-f08c-4559-96d9-6f60ca130263 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-0bbdf472-f08c-4559-96d9-6f60ca130263');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
" </div>\n", | |
" " | |
] | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## 【CSV読み込み/書き出し比較】" | |
], | |
"metadata": { | |
"id": "b2f8eshlyAMW" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# ① pandasのみ\n", | |
"def profile_pandas(csv_path):\n", | |
" df_pd = pd.read_csv(csv_path) # csv読み込み 3001.1ms\n", | |
" df_pd.to_csv('pd.csv') # csv書き出し 13132.4ms\n", | |
"\n", | |
"# ② polarsでIO+pandas変換\n", | |
"def profile_polars(csv_path):\n", | |
" df_pl = pl.read_csv(csv_path) # csv読み込み 1016.6ms\n", | |
" df_pd = df_pl.to_pandas() # DataFrame(polars->pandas)変換 1009.2ms\n", | |
" df_pl2 = pl.from_pandas(df_pd) # DataFrame(pandas->polars)変換 266.1ms\n", | |
" df_pl2.write_csv('pl.csv') # csv書き出し 2387.2ms\n", | |
"\n", | |
"# ①と②比較\n", | |
"def profile_rw_csv(csv_path):\n", | |
" # csv読み込み\n", | |
" df1 = pd.read_csv(csv_path) # pandas 3204.8ms\n", | |
" df2 = pl.read_csv(csv_path).to_pandas() # polars to pandas 1935.4ms\n", | |
" # csv書き出し\n", | |
" df1.to_csv('pd.csv') # pandas 13208.7ms\n", | |
" pl.from_pandas(df2).write_csv('pl.csv') # pandas to polars 2186.8ms\n" | |
], | |
"metadata": { | |
"id": "VzAz6nRPXzJr" | |
}, | |
"execution_count": 75, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%lprun -u 1e-3 -f profile_pandas profile_pandas(FILE_PATH)" | |
], | |
"metadata": { | |
"id": "MpRk1_UZdK7D" | |
}, | |
"execution_count": 67, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%lprun -u 1e-3 -f profile_polars profile_polars(FILE_PATH)" | |
], | |
"metadata": { | |
"id": "g8zkFP_qe_PL" | |
}, | |
"execution_count": 68, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%lprun -u 1e-3 -f profile_rw_csv profile_rw_csv(FILE_PATH)" | |
], | |
"metadata": { | |
"id": "P-jd-bYgjZU8" | |
}, | |
"execution_count": 69, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%memit profile_pandas(FILE_PATH)" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "QLnGVbQAstCl", | |
"outputId": "58fca234-2f34-4972-e9df-60e47e525aff" | |
}, | |
"execution_count": 70, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"peak memory: 2249.28 MiB, increment: 51.43 MiB\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%memit profile_polars(FILE_PATH)" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "mx1LFHCZsx0t", | |
"outputId": "c381a863-cce9-4db3-c0df-f99aff2d668d" | |
}, | |
"execution_count": 71, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"peak memory: 2816.91 MiB, increment: 900.19 MiB\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## 【約定履歴CSV読み込み⇒OHLCVリサンプリング 比較】" | |
], | |
"metadata": { | |
"id": "Vdb0v66c0vd3" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# pandas 3039.68ms\n", | |
"def profile_pandas2(csv_path):\n", | |
" # 約定履歴csvをDataFrameに読み込み\n", | |
" df_trade_pd = pd.read_csv(FILE_PATH,\n", | |
" usecols=['timestamp', 'side', 'price', 'size'],\n", | |
" dtype={'timestamp':'float', 'side':'str', 'price':'float', 'size':'float'})\n", | |
" # UnixTime(timestamp)からdatetime列生成\n", | |
" df_trade_pd['datetime'] = pd.to_datetime(df_trade_pd['timestamp'], unit='s', utc=True)\n", | |
" # datetimeをindexに設定 (resampleのため)\n", | |
" df_trade_pd.set_index('datetime', inplace=True)\n", | |
" # 約定履歴 -> OHLCV\n", | |
" df_ohlcv_pd = df_trade_pd.resample('1T').agg({\n", | |
" 'price' : 'ohlc',\n", | |
" 'size' : 'sum',}).ffill()\n", | |
" # 列名編集\n", | |
" df_ohlcv_pd.columns = ['open', 'high', 'low', 'close', 'volume']\n", | |
"\n", | |
"# polars 644.69ms\n", | |
"def profile_polars2(csv_path):\n", | |
" df_ohlcv_pl = (\n", | |
" pl.scan_csv(FILE_PATH) # 約定履歴csvをDataFrameに読み込み\n", | |
" .with_columns( # 列追加\n", | |
" (pl.col('timestamp') * 1_000_000_000) # timestamp(秒)をnano秒単位に\n", | |
" .cast(pl.Datetime(time_unit='ns')) # datetime(nano秒)変換\n", | |
" .dt.replace_time_zone('UTC') # タイムゾーン設定\n", | |
" .alias('datetime') # 列名設定\n", | |
" )\n", | |
" .groupby_dynamic('datetime', every='1m') # datetimeを1分毎にグループ化\n", | |
" .agg([\n", | |
" pl.col('price').first().alias('open'),\n", | |
" pl.col('price').max().alias('high'),\n", | |
" pl.col('price').min().alias('low'),\n", | |
" pl.col('price').last().alias('close'),\n", | |
" pl.col('size').sum().alias('volume')\n", | |
" ]\n", | |
" )\n", | |
" .collect() # 遅延評価 実行\n", | |
" )" | |
], | |
"metadata": { | |
"id": "G1RWO-DzYlbL" | |
}, | |
"execution_count": 76, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%lprun -u 1e-3 -f profile_pandas2 profile_pandas2(FILE_PATH)" | |
], | |
"metadata": { | |
"id": "QBFuSb0ovHNl" | |
}, | |
"execution_count": 73, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%lprun -u 1e-3 -f profile_polars2 profile_polars2(FILE_PATH)" | |
], | |
"metadata": { | |
"id": "a83ff_RZvH0F" | |
}, | |
"execution_count": 74, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment