Created
October 10, 2021 00:55
-
-
Save nogawanogawa/d231e11a418d44dafab610bfdd278467 to your computer and use it in GitHub Desktop.
bias_simulation_sample.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "bias_simulation_sample.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyMqfqCvlq9BP9F7Yr8wvH91", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/nogawanogawa/d231e11a418d44dafab610bfdd278467/bias_simulation_sample.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "4Gi-po9aa4DA" | |
}, | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from scipy import stats\n", | |
"from sklearn.metrics import precision_score " | |
], | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "XphYUBCBa5tM" | |
}, | |
"source": [ | |
"# Seed NumPy's global RNG so every random draw below is reproducible.\n", | |
"SEED = 42\n", | |
"np.random.seed(SEED)" | |
], | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "YoMGcc3da-3N" | |
}, | |
"source": [ | |
"## Step1\n", | |
"ランダムにランキングが生成され、それをランダムにクリックされたときのログを生成する" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "bIBHjGLQa9NU" | |
}, | |
"source": [ | |
"# アルファベット10文字をアイテムプールとする\n", | |
"item_list = ['A', 'B', 'C', 'D', 'E', \n", | |
" 'F', 'G', 'H', 'I', 'J']\n", | |
"\n", | |
"T = 100000" | |
], | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 419 | |
}, | |
"id": "Y_VBURBjbEsc", | |
"outputId": "cadc1dd4-7b53-4207-a55c-72e4d1ea172a" | |
}, | |
"source": [ | |
"# Simulated log: each of the T impressions shows one item drawn from item_list\n", | |
"# and is clicked with probability 0.2, independently of which item was shown.\n", | |
"# The item draw stays ahead of the click draw so the seeded RNG sequence is unchanged.\n", | |
"shown_items = np.random.choice(item_list, T)\n", | |
"clicks = np.random.binomial(1, 0.2, size=T)\n", | |
"J = pd.DataFrame({'item_j': shown_items, 'click': clicks})\n", | |
"\n", | |
"# Display the generated log\n", | |
"J" | |
], | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>item_j</th>\n", | |
" <th>click</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>G</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>D</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>H</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>E</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>G</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99995</th>\n", | |
" <td>E</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99996</th>\n", | |
" <td>D</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99997</th>\n", | |
" <td>A</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99998</th>\n", | |
" <td>D</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99999</th>\n", | |
" <td>A</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>100000 rows × 2 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" item_j click\n", | |
"0 G 0\n", | |
"1 D 1\n", | |
"2 H 1\n", | |
"3 E 0\n", | |
"4 G 0\n", | |
"... ... ...\n", | |
"99995 E 0\n", | |
"99996 D 0\n", | |
"99997 A 0\n", | |
"99998 D 0\n", | |
"99999 A 0\n", | |
"\n", | |
"[100000 rows x 2 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 4 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "_LqXHTlybGo7" | |
}, | |
"source": [ | |
"## Step2\n", | |
"Step1で作成されたログについて、適合率(Precision)を計算" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "yd5HDB2SbIPg", | |
"outputId": "f256a2dc-c6cf-4661-f1cf-a381cca542ad" | |
}, | |
"source": [ | |
"# Ground truth is the logged click; every logged impression counts as a\n", | |
"# positive prediction (the item was shown), so y_pred is all ones.\n", | |
"y_true = J['click']\n", | |
"y_pred = [1] * len(J)\n", | |
"\n", | |
"# precision_score expects (y_true, y_pred): the share of shown items that were clicked.\n", | |
"precision_score(y_true, y_pred)" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"0.20022" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 5 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Cb_LJbJmbPbU" | |
}, | |
"source": [ | |
"## Step3\n", | |
"新しくランキングを作る" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 419 | |
}, | |
"id": "eltIZ0SabMaC", | |
"outputId": "de94dedb-692b-47f8-e32f-07417214d5a5" | |
}, | |
"source": [ | |
"# New ranking: one item per impression, drawn from the same item pool\n", | |
"# without looking at the logged ranking J.\n", | |
"new_items = np.random.choice(item_list, T)\n", | |
"J_ = pd.DataFrame({'item_k': new_items})\n", | |
"\n", | |
"# Display the new ranking\n", | |
"J_" | |
], | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>item_k</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>H</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>B</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>H</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99995</th>\n", | |
" <td>J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99996</th>\n", | |
" <td>F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99997</th>\n", | |
" <td>J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99998</th>\n", | |
" <td>F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99999</th>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>100000 rows × 1 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" item_k\n", | |
"0 H\n", | |
"1 B\n", | |
"2 F\n", | |
"3 C\n", | |
"4 H\n", | |
"... ...\n", | |
"99995 J\n", | |
"99996 F\n", | |
"99997 J\n", | |
"99998 F\n", | |
"99999 C\n", | |
"\n", | |
"[100000 rows x 1 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 6 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Pt2MIrxdbVo1" | |
}, | |
"source": [ | |
"## Step4\n", | |
"Step1で作ったログとStep3で作ったものについて適合率(Precision)を計算" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 419 | |
}, | |
"id": "-sMkCVcpbWDV", | |
"outputId": "ad24df4a-cbbf-43ee-fbf6-eacb8753140e" | |
}, | |
"source": [ | |
"# Put the logged ranking and the new ranking side by side, row by row.\n", | |
"df = pd.concat([J, J_], axis=1)\n", | |
"\n", | |
"# Rows where the new ranking shows a different item than the log have no\n", | |
"# logged feedback, so they are all treated as not clicked.\n", | |
"not_logged = df[\"item_j\"].ne(df[\"item_k\"])\n", | |
"df.loc[not_logged, \"click\"] = 0\n", | |
"df" | |
], | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>item_j</th>\n", | |
" <th>click</th>\n", | |
" <th>item_k</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>G</td>\n", | |
" <td>0</td>\n", | |
" <td>H</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>D</td>\n", | |
" <td>0</td>\n", | |
" <td>B</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>H</td>\n", | |
" <td>0</td>\n", | |
" <td>F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>E</td>\n", | |
" <td>0</td>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>G</td>\n", | |
" <td>0</td>\n", | |
" <td>H</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99995</th>\n", | |
" <td>E</td>\n", | |
" <td>0</td>\n", | |
" <td>J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99996</th>\n", | |
" <td>D</td>\n", | |
" <td>0</td>\n", | |
" <td>F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99997</th>\n", | |
" <td>A</td>\n", | |
" <td>0</td>\n", | |
" <td>J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99998</th>\n", | |
" <td>D</td>\n", | |
" <td>0</td>\n", | |
" <td>F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99999</th>\n", | |
" <td>A</td>\n", | |
" <td>0</td>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>100000 rows × 3 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" item_j click item_k\n", | |
"0 G 0 H\n", | |
"1 D 0 B\n", | |
"2 H 0 F\n", | |
"3 E 0 C\n", | |
"4 G 0 H\n", | |
"... ... ... ...\n", | |
"99995 E 0 J\n", | |
"99996 D 0 F\n", | |
"99997 A 0 J\n", | |
"99998 D 0 F\n", | |
"99999 A 0 C\n", | |
"\n", | |
"[100000 rows x 3 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 7 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "vnc95ysrcyka", | |
"outputId": "7121a314-df43-4ce6-e838-0dda5d72095f" | |
}, | |
"source": [ | |
"# Ground truth is the click column after unlogged pairs were zeroed out;\n", | |
"# the new ranking counts every shown item as a positive prediction.\n", | |
"y_true = df['click']\n", | |
"y_pred = [1] * len(df)\n", | |
"\n", | |
"# precision_score expects (y_true, y_pred): the share of shown items that count as clicked.\n", | |
"precision_score(y_true, y_pred)" | |
], | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"0.02018" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 8 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "boxvUqU_c1AI" | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": 8, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment