Skip to content

Instantly share code, notes, and snippets.

@nagishin
Created February 17, 2023 00:46
Show Gist options
  • Save nagishin/05ca907559dc758e17c4bd0ff5d71d12 to your computer and use it in GitHub Desktop.
Save nagishin/05ca907559dc758e17c4bd0ff5d71d12 to your computer and use it in GitHub Desktop.
polars_csv_io.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyPxuAlO+/TUfvbThZCXU7jJ",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/nagishin/05ca907559dc758e17c4bd0ff5d71d12/polars_csv_io.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xzUVxly7XSep"
},
"outputs": [],
"source": [
"!pip install polars\n",
"!pip install line_profiler\n",
"!pip install memory_profiler"
]
},
{
"cell_type": "code",
"source": [
"%load_ext line_profiler\n",
"%load_ext memory_profiler"
],
"metadata": {
"id": "amGbcBfJbhRG"
},
"execution_count": 28,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## 【初期設定】"
],
"metadata": {
"id": "dDWBU_d3x9wm"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"import time\n",
"from datetime import datetime\n",
"import pandas as pd\n",
"import polars as pl\n",
"\n",
"SYMBOL = 'BTCUSDT' # 通貨ペア\n",
"DATE = '2023-01-18' # 取得日\n",
"\n",
"DL_PATH = f'https://public.bybit.com/trading/{SYMBOL}/{SYMBOL}{DATE}.csv.gz' # ダウンロードパス\n",
"FILE_PATH = f'./trades_{SYMBOL}_{DATE}.csv' # CSVパス\n",
"\n",
"# ファイル存在チェック\n",
"if os.path.isfile(FILE_PATH) == False:\n",
" df_csv = pd.read_csv(DL_PATH, compression='gzip') # bybit約定履歴 1日分取得\n",
" df_csv.to_csv(FILE_PATH, header=True, index=False) # ローカルに保存\n",
"else:\n",
" df_csv = pd.read_csv(FILE_PATH) # ローカルCSV読み込み\n",
"\n",
"csv_records = len(df_csv)\n",
"csv_size = round(os.path.getsize(FILE_PATH)/1024/1024, 3)\n",
"\n",
"print(f'[約定履歴CSV]: {FILE_PATH}')\n",
"print(f'[レコード数 ]: {csv_records:,} rows')\n",
"print(f'[CSVサイズ ]: {csv_size:,} MB')\n",
"display(df_csv.head())"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 261
},
"id": "e750Ad3nXUgD",
"outputId": "2d6a66fa-15c9-4c1a-8976-24ffe1768689"
},
"execution_count": 65,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[約定履歴CSV]: ./trades_BTCUSDT_2023-01-18.csv\n",
"[レコード数 ]: 1,386,138 rows\n",
"[CSVサイズ ]: 160.152 MB\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" timestamp symbol side size price tickDirection \\\n",
"0 1.674000e+09 BTCUSDT Sell 0.010 21118.0 ZeroMinusTick \n",
"1 1.674000e+09 BTCUSDT Sell 0.024 21118.0 ZeroMinusTick \n",
"2 1.674000e+09 BTCUSDT Sell 0.063 21118.0 ZeroMinusTick \n",
"3 1.674000e+09 BTCUSDT Buy 0.001 21118.5 PlusTick \n",
"4 1.674000e+09 BTCUSDT Sell 0.337 21118.0 MinusTick \n",
"\n",
" trdMatchID grossValue homeNotional \\\n",
"0 9165901d-8b04-5bdb-bba5-7365be082394 2.111800e+10 0.010 \n",
"1 85505b5d-6386-5054-a61d-f332e83ba3e6 5.068320e+10 0.024 \n",
"2 749209ec-92a7-5b8d-81d8-1dd0fdc9a6f4 1.330434e+11 0.063 \n",
"3 98f7e7ec-a266-5909-868c-1677554a79f9 2.111850e+09 0.001 \n",
"4 ba378ff8-3c09-5294-a2d4-a4c9defad9aa 7.116766e+11 0.337 \n",
"\n",
" foreignNotional \n",
"0 211.1800 \n",
"1 506.8320 \n",
"2 1330.4340 \n",
"3 21.1185 \n",
"4 7116.7660 "
],
"text/html": [
"\n",
" <div id=\"df-0bbdf472-f08c-4559-96d9-6f60ca130263\">\n",
" <div class=\"colab-df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>timestamp</th>\n",
" <th>symbol</th>\n",
" <th>side</th>\n",
" <th>size</th>\n",
" <th>price</th>\n",
" <th>tickDirection</th>\n",
" <th>trdMatchID</th>\n",
" <th>grossValue</th>\n",
" <th>homeNotional</th>\n",
" <th>foreignNotional</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.674000e+09</td>\n",
" <td>BTCUSDT</td>\n",
" <td>Sell</td>\n",
" <td>0.010</td>\n",
" <td>21118.0</td>\n",
" <td>ZeroMinusTick</td>\n",
" <td>9165901d-8b04-5bdb-bba5-7365be082394</td>\n",
" <td>2.111800e+10</td>\n",
" <td>0.010</td>\n",
" <td>211.1800</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.674000e+09</td>\n",
" <td>BTCUSDT</td>\n",
" <td>Sell</td>\n",
" <td>0.024</td>\n",
" <td>21118.0</td>\n",
" <td>ZeroMinusTick</td>\n",
" <td>85505b5d-6386-5054-a61d-f332e83ba3e6</td>\n",
" <td>5.068320e+10</td>\n",
" <td>0.024</td>\n",
" <td>506.8320</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.674000e+09</td>\n",
" <td>BTCUSDT</td>\n",
" <td>Sell</td>\n",
" <td>0.063</td>\n",
" <td>21118.0</td>\n",
" <td>ZeroMinusTick</td>\n",
" <td>749209ec-92a7-5b8d-81d8-1dd0fdc9a6f4</td>\n",
" <td>1.330434e+11</td>\n",
" <td>0.063</td>\n",
" <td>1330.4340</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.674000e+09</td>\n",
" <td>BTCUSDT</td>\n",
" <td>Buy</td>\n",
" <td>0.001</td>\n",
" <td>21118.5</td>\n",
" <td>PlusTick</td>\n",
" <td>98f7e7ec-a266-5909-868c-1677554a79f9</td>\n",
" <td>2.111850e+09</td>\n",
" <td>0.001</td>\n",
" <td>21.1185</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1.674000e+09</td>\n",
" <td>BTCUSDT</td>\n",
" <td>Sell</td>\n",
" <td>0.337</td>\n",
" <td>21118.0</td>\n",
" <td>MinusTick</td>\n",
" <td>ba378ff8-3c09-5294-a2d4-a4c9defad9aa</td>\n",
" <td>7.116766e+11</td>\n",
" <td>0.337</td>\n",
" <td>7116.7660</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-0bbdf472-f08c-4559-96d9-6f60ca130263')\"\n",
" title=\"Convert this dataframe to an interactive table.\"\n",
" style=\"display:none;\">\n",
" \n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
" width=\"24px\">\n",
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
" </svg>\n",
" </button>\n",
" \n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" flex-wrap:wrap;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-0bbdf472-f08c-4559-96d9-6f60ca130263 button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-0bbdf472-f08c-4559-96d9-6f60ca130263');\n",
" const dataTable =\n",
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
" </div>\n",
" "
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"source": [
"## 【CSV読み込み/書き出し比較】"
],
"metadata": {
"id": "b2f8eshlyAMW"
}
},
{
"cell_type": "code",
"source": [
"# ① pandasのみ\n",
"def profile_pandas(csv_path):\n",
" df_pd = pd.read_csv(csv_path) # csv読み込み 3001.1ms\n",
" df_pd.to_csv('pd.csv') # csv書き出し 13132.4ms\n",
"\n",
"# ② polarsでIO+pandas変換\n",
"def profile_polars(csv_path):\n",
" df_pl = pl.read_csv(csv_path) # csv読み込み 1016.6ms\n",
" df_pd = df_pl.to_pandas() # DataFrame(polars->pandas)変換 1009.2ms\n",
" df_pl2 = pl.from_pandas(df_pd) # DataFrame(pandas->polars)変換 266.1ms\n",
" df_pl2.write_csv('pl.csv') # csv書き出し 2387.2ms\n",
"\n",
"# ①と②比較\n",
"def profile_rw_csv(csv_path):\n",
" # csv読み込み\n",
" df1 = pd.read_csv(csv_path) # pandas 3204.8ms\n",
" df2 = pl.read_csv(csv_path).to_pandas() # polars to pandas 1935.4ms\n",
" # csv書き出し\n",
" df1.to_csv('pd.csv') # pandas 13208.7ms\n",
" pl.from_pandas(df2).write_csv('pl.csv') # pandas to polars 2186.8ms\n"
],
"metadata": {
"id": "VzAz6nRPXzJr"
},
"execution_count": 75,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%lprun -u 1e-3 -f profile_pandas profile_pandas(FILE_PATH)"
],
"metadata": {
"id": "MpRk1_UZdK7D"
},
"execution_count": 67,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%lprun -u 1e-3 -f profile_polars profile_polars(FILE_PATH)"
],
"metadata": {
"id": "g8zkFP_qe_PL"
},
"execution_count": 68,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%lprun -u 1e-3 -f profile_rw_csv profile_rw_csv(FILE_PATH)"
],
"metadata": {
"id": "P-jd-bYgjZU8"
},
"execution_count": 69,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%memit profile_pandas(FILE_PATH)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QLnGVbQAstCl",
"outputId": "58fca234-2f34-4972-e9df-60e47e525aff"
},
"execution_count": 70,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"peak memory: 2249.28 MiB, increment: 51.43 MiB\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%memit profile_polars(FILE_PATH)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mx1LFHCZsx0t",
"outputId": "c381a863-cce9-4db3-c0df-f99aff2d668d"
},
"execution_count": 71,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"peak memory: 2816.91 MiB, increment: 900.19 MiB\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## 【約定履歴CSV読み込み⇒OHLCVリサンプリング 比較】"
],
"metadata": {
"id": "Vdb0v66c0vd3"
}
},
{
"cell_type": "code",
"source": [
"# pandas 3039.68ms\n",
"def profile_pandas2(csv_path):\n",
" # 約定履歴csvをDataFrameに読み込み\n",
" df_trade_pd = pd.read_csv(FILE_PATH,\n",
" usecols=['timestamp', 'side', 'price', 'size'],\n",
" dtype={'timestamp':'float', 'side':'str', 'price':'float', 'size':'float'})\n",
" # UnixTime(timestamp)からdatetime列生成\n",
" df_trade_pd['datetime'] = pd.to_datetime(df_trade_pd['timestamp'], unit='s', utc=True)\n",
" # datetimeをindexに設定 (resampleのため)\n",
" df_trade_pd.set_index('datetime', inplace=True)\n",
" # 約定履歴 -> OHLCV\n",
" df_ohlcv_pd = df_trade_pd.resample('1T').agg({\n",
" 'price' : 'ohlc',\n",
" 'size' : 'sum',}).ffill()\n",
" # 列名編集\n",
" df_ohlcv_pd.columns = ['open', 'high', 'low', 'close', 'volume']\n",
"\n",
"# polars 644.69ms\n",
"def profile_polars2(csv_path):\n",
" df_ohlcv_pl = (\n",
" pl.scan_csv(FILE_PATH) # 約定履歴csvをDataFrameに読み込み\n",
" .with_columns( # 列追加\n",
" (pl.col('timestamp') * 1_000_000_000) # timestamp(秒)をnano秒単位に\n",
" .cast(pl.Datetime(time_unit='ns')) # datetime(nano秒)変換\n",
" .dt.replace_time_zone('UTC') # タイムゾーン設定\n",
" .alias('datetime') # 列名設定\n",
" )\n",
" .groupby_dynamic('datetime', every='1m') # datetimeを1分毎にグループ化\n",
" .agg([\n",
" pl.col('price').first().alias('open'),\n",
" pl.col('price').max().alias('high'),\n",
" pl.col('price').min().alias('low'),\n",
" pl.col('price').last().alias('close'),\n",
" pl.col('size').sum().alias('volume')\n",
" ]\n",
" )\n",
" .collect() # 遅延評価 実行\n",
" )"
],
"metadata": {
"id": "G1RWO-DzYlbL"
},
"execution_count": 76,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%lprun -u 1e-3 -f profile_pandas2 profile_pandas2(FILE_PATH)"
],
"metadata": {
"id": "QBFuSb0ovHNl"
},
"execution_count": 73,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%lprun -u 1e-3 -f profile_polars2 profile_polars2(FILE_PATH)"
],
"metadata": {
"id": "a83ff_RZvH0F"
},
"execution_count": 74,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment