Created
October 9, 2021 23:43
-
-
Save nogawanogawa/4127b0cd42a34c375f1db017d13442c6 to your computer and use it in GitHub Desktop.
bias_simulation.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "bias_simulation.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyOhGVoyS3vliEjgl7uwWbII", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/nogawanogawa/4127b0cd42a34c375f1db017d13442c6/bias_simulation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Tr5Ysfb0x0kx" | |
}, | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from scipy import stats" | |
], | |
"execution_count": 11, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1UQ0y5h4zrBz" | |
}, | |
"source": [ | |
"# Seed NumPy's global RNG so the simulated rankings and clicks below are reproducible.\n", | |
"np.random.seed(42)" | |
], | |
"execution_count": 12, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "nxE7FCekjpb8" | |
}, | |
"source": [ | |
"def dcg(gain, k=None):\n", | |
"    \"\"\"Return the discounted cumulative gain (DCG) of gains given in ranked order.\n", | |
"\n", | |
"    The gain at rank 1 is counted undiscounted and the gain at rank r >= 2 is\n", | |
"    divided by log2(r):  DCG = g_1 + sum_{r=2..k} g_r / log2(r).\n", | |
"\n", | |
"    Args:\n", | |
"        gain: Gains in ranked order (list, NumPy array or pandas Series).\n", | |
"        k: Number of top positions to score. None scores every position;\n", | |
"            values larger than the number of gains are clamped.\n", | |
"\n", | |
"    Returns:\n", | |
"        The DCG as a float, or 0.0 when there is nothing to score.\n", | |
"    \"\"\"\n", | |
"    # Read gains by position: integer keys on a pandas Series are label lookups,\n", | |
"    # which fail (or pick the wrong row) when the index is not 0..n-1.\n", | |
"    gains = np.asarray(gain, dtype=float)\n", | |
"    if k is None:\n", | |
"        k = gains.shape[0]\n", | |
"    k = min(k, gains.shape[0])\n", | |
"    if k <= 0:\n", | |
"        return 0.0\n", | |
"\n", | |
"    ret = gains[0]\n", | |
"    for i in range(1, k):\n", | |
"        # i is 0-based, so rank = i + 1 and the discount is log2(rank).\n", | |
"        ret += gains[i] / np.log2(i + 1)\n", | |
"    return float(ret)\n", | |
"\n", | |
"\n", | |
"def ndcg(y, k=None, powered=False) -> float:\n", | |
"    \"\"\"Return the normalized DCG (nDCG) of gains given in ranked order.\n", | |
"\n", | |
"    The DCG of y is divided by the DCG of the same gains sorted in\n", | |
"    descending order (the ideal ranking).\n", | |
"\n", | |
"    Args:\n", | |
"        y: Gains in ranked order (list, NumPy array or pandas Series).\n", | |
"        k: Number of top positions to score; None scores every position.\n", | |
"        powered: Accepted for interface compatibility; currently has no effect.\n", | |
"\n", | |
"    Returns:\n", | |
"        nDCG as a float; 0.0 when y is empty or the ideal DCG is 0.\n", | |
"    \"\"\"\n", | |
"    # Work on a positional array so a Series with a non-default index is\n", | |
"    # scored in its displayed order.\n", | |
"    gains = np.asarray(y, dtype=float)\n", | |
"    if gains.size == 0:\n", | |
"        return 0.0\n", | |
"\n", | |
"    dcg_score = dcg(gains, k=k)\n", | |
"\n", | |
"    ideal_sorted_scores = np.sort(gains)[::-1]\n", | |
"    ideal_dcg_score = dcg(ideal_sorted_scores, k=k)\n", | |
"\n", | |
"    if ideal_dcg_score == 0:  # shown but never clicked: nDCG is defined as 0\n", | |
"        return 0.0\n", | |
"    return float(dcg_score / ideal_dcg_score)\n" | |
], | |
"execution_count": 13, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "FT6hGm5bYkBm" | |
}, | |
"source": [ | |
"## Step1\n", | |
"ランキングをランダムに生成し、それがランダムにクリックされたときのログを生成する" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "sF5O0ZmEyPfy" | |
}, | |
"source": [ | |
"# Item pool: the 20 letters A through T.\n", | |
"item_list = list('ABCDEFGHIJKLMNOPQRST')\n", | |
"\n", | |
"# Number of simulated users (one ranking / click log per user).\n", | |
"num_users = 1000" | |
], | |
"execution_count": 14, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "SGyS01sSPxZt", | |
"outputId": "f6185393-5a48-46b2-f147-ebbd31123ae1" | |
}, | |
"source": [ | |
"# For each user, rank 10 distinct items drawn at random from the pool and\n", | |
"# draw an independent click (probability 0.2) for every position.\n", | |
"user_log = []\n", | |
"\n", | |
"for _ in range(num_users):\n", | |
"    # Keep this draw order (items first, then clicks) so the seeded output is unchanged.\n", | |
"    shown_items = np.random.choice(item_list, 10, replace = False)\n", | |
"    clicks = np.random.binomial(1,0.2,size=10)\n", | |
"    user_log.append(pd.DataFrame({'item' : shown_items, 'click': clicks}))\n", | |
"\n", | |
"# check log\n", | |
"print(user_log[1])" | |
], | |
"execution_count": 15, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
" item click\n", | |
"0 S 0\n", | |
"1 K 0\n", | |
"2 H 0\n", | |
"3 M 0\n", | |
"4 J 0\n", | |
"5 F 0\n", | |
"6 R 1\n", | |
"7 A 0\n", | |
"8 D 0\n", | |
"9 B 0\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "5imGnHW3ZaQt" | |
}, | |
"source": [ | |
"## Step2\n", | |
"Step1で作成されたランキングについて、nDCGを計算する" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "VEjoNuiDZKuy" | |
}, | |
"source": [ | |
"# Per-user nDCG of the logged ranking, scored with that user's own clicks.\n", | |
"scores = [ndcg(logged['click']) for logged in user_log]\n" | |
], | |
"execution_count": 16, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "i5_v7-MCkV02", | |
"outputId": "109a539d-18fe-4728-aec4-9c78d16b7096" | |
}, | |
"source": [ | |
"# Average the per-user nDCG of the logged rankings over all simulated users.\n", | |
"ndcg_score = sum(scores) / num_users\n", | |
"ndcg_score" | |
], | |
"execution_count": 17, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"0.4969310364380767" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 17 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "i821AM01XTL3" | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": 7, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "W8VvOQA5ewkr" | |
}, | |
"source": [ | |
"## Step3\n", | |
"新しくランキングを作る" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "7O8jfct_0JJm", | |
"outputId": "14097cea-60f8-405d-fb5d-4c2938260319" | |
}, | |
"source": [ | |
"# Build a fresh random ranking (10 distinct items, no clicks) for every user.\n", | |
"user_log_new = []\n", | |
"\n", | |
"for _ in range(num_users):\n", | |
"    new_items = np.random.choice(item_list, 10, replace = False)\n", | |
"    user_log_new.append(pd.DataFrame({'item' : new_items}))\n", | |
"\n", | |
"# check log\n", | |
"print(user_log_new[1])" | |
], | |
"execution_count": 18, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
" item\n", | |
"0 T\n", | |
"1 G\n", | |
"2 O\n", | |
"3 N\n", | |
"4 F\n", | |
"5 C\n", | |
"6 M\n", | |
"7 P\n", | |
"8 B\n", | |
"9 L\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "-K_ptf-6iXwk" | |
}, | |
"source": [ | |
"## Step4\n", | |
"Step1で作ったログとStep3で作ったランキングを組み合わせてnDCGを計算" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "tcmI6GaH3upk" | |
}, | |
"source": [ | |
"# Replay the logged clicks against each user's new ranking.\n", | |
"scores_new = []\n", | |
"\n", | |
"for user_idx in range(num_users):\n", | |
"    new_ranking = user_log_new[user_idx]\n", | |
"    logged = user_log[user_idx]\n", | |
"    # The left merge keeps the new ranking's order; items that were not in the\n", | |
"    # logged ranking have no click value and are filled with 0.\n", | |
"    replayed = new_ranking.merge(logged, on=\"item\", how='left').fillna(0)\n", | |
"    scores_new.append(ndcg(replayed['click']))\n" | |
], | |
"execution_count": 19, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "J93E4anFkR6U", | |
"outputId": "5d16f83f-99fa-49ab-9b17-401916ab2658" | |
}, | |
"source": [ | |
"# Average the per-user nDCG of the new rankings (scored with the logged clicks) over all simulated users.\n", | |
"ndcg_score_new = sum(scores_new) / num_users\n", | |
"ndcg_score_new" | |
], | |
"execution_count": 20, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"0.331659871450936" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 20 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "eHWeYxzecKti" | |
}, | |
"source": [ | |
"## Step5\n", | |
"一応t検定で有意差があるか確認してみる" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "mckN_Hy0cI-H", | |
"outputId": "b40904ba-1c27-46cc-dc36-88645615878e" | |
}, | |
"source": [ | |
"# Two-sample t-test without the equal-variance assumption (equal_var=False),\n", | |
"# comparing per-user nDCG of the logged rankings with that of the new rankings.\n", | |
"# NOTE(review): scores[i] and scores_new[i] are both computed from user i's click log,\n", | |
"# so a paired test (stats.ttest_rel) may be the better fit -- confirm before relying on the p-value.\n", | |
"stats.ttest_ind(scores, scores_new, equal_var=False)" | |
], | |
"execution_count": 21, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"Ttest_indResult(statistic=12.71863865873347, pvalue=1.148335352312263e-35)" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 21 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "GFQMjBDLcMMd" | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment