nogawanogawa/bias_simulation.ipynb

## bias_simulation.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "bias_simulation.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "authorship_tag": "ABX9TyOhGVoyS3vliEjgl7uwWbII",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/nogawanogawa/4127b0cd42a34c375f1db017d13442c6/bias_simulation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Tr5Ysfb0x0kx"
      },
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "from scipy import stats"
      ],
      "execution_count": 11,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1UQ0y5h4zrBz"
      },
      "source": [
        "# Fix NumPy's global RNG so every random draw below is reproducible.\n",
        "np.random.seed(42)"
      ],
      "execution_count": 12,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "nxE7FCekjpb8"
      },
      "source": [
        "def dcg(gain, k=None):\n",
        "    \"\"\"Compute the discounted cumulative gain (DCG) of a ranked gain list.\n",
        "\n",
        "    The first item is counted undiscounted; the item at 0-based position\n",
        "    ``i >= 1`` is divided by ``log2(i + 1)``.\n",
        "\n",
        "    Args:\n",
        "        gain: 1-D sequence of gains in ranked order (list, ndarray or\n",
        "            pandas Series). It is read by position, never by index label.\n",
        "        k: Number of leading positions to score. ``None`` scores all of\n",
        "            them; values larger than the input are clamped to its length.\n",
        "\n",
        "    Returns:\n",
        "        The DCG as a float; ``0.0`` for an empty input or ``k <= 0``.\n",
        "    \"\"\"\n",
        "    # np.asarray drops any pandas index, so a Series whose labels are not\n",
        "    # 0..n-1 (e.g. after filtering or sorting) is still read positionally.\n",
        "    values = np.asarray(gain, dtype=float)\n",
        "    cutoff = values.shape[0] if k is None else min(k, values.shape[0])\n",
        "    if cutoff <= 0:\n",
        "        return 0.0\n",
        "\n",
        "    # Accumulate strictly left to right so the floating-point result does\n",
        "    # not depend on a vectorized summation order.\n",
        "    total = float(values[0])\n",
        "    for i in range(1, cutoff):\n",
        "        total += float(values[i] / np.log2(i + 1))\n",
        "    return total\n",
        "\n",
        "\n",
        "def ndcg(y, k=None, powered=False) -> float:\n",
        "    \"\"\"Compute the normalized DCG (nDCG) of gains given in ranked order.\n",
        "\n",
        "    Args:\n",
        "        y: 1-D sequence of gains in displayed order.\n",
        "        k: Optional cutoff applied to both the actual and the ideal DCG.\n",
        "        powered: Accepted for backward compatibility; currently unused.\n",
        "\n",
        "    Returns:\n",
        "        DCG of ``y`` divided by the DCG of ``y`` sorted in descending\n",
        "        order, or ``0.0`` when the ideal DCG is zero.\n",
        "    \"\"\"\n",
        "    gains = np.asarray(y, dtype=float)\n",
        "    ideal_dcg_score = dcg(np.sort(gains)[::-1], k=k)\n",
        "\n",
        "    # A list that was shown but never clicked has an ideal DCG of 0;\n",
        "    # define its nDCG as 0 instead of dividing by zero.\n",
        "    if ideal_dcg_score == 0:\n",
        "        return 0.0\n",
        "    return dcg(gains, k=k) / ideal_dcg_score\n"
      ],
      "execution_count": 13,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "FT6hGm5bYkBm"
      },
      "source": [
        "## Step1\n",
        "ランダムにランキングが生成され、それをランダムにクリックされたときのログを生成する"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "sF5O0ZmEyPfy"
      },
      "source": [
        "# Item pool: the 20 letters 'A' through 'T'.\n",
        "item_list = list('ABCDEFGHIJKLMNOPQRST')\n",
        "\n",
        "# Number of simulated users.\n",
        "num_users = 1000"
      ],
      "execution_count": 14,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "SGyS01sSPxZt",
        "outputId": "f6185393-5a48-46b2-f147-ebbd31123ae1"
      },
      "source": [
        "user_log = []\n",
        "\n",
        "for i in range(num_users):\n",
        "    # Draw the ranking before the clicks: the order of the RNG calls\n",
        "    # determines the seeded results.\n",
        "    shown_items = np.random.choice(item_list, 10, replace=False)\n",
        "    # Each of the 10 shown positions is clicked with probability 0.2.\n",
        "    clicks = np.random.binomial(1, 0.2, size=10)\n",
        "    df = pd.DataFrame({'item': shown_items, 'click': clicks})\n",
        "    user_log.append(df)\n",
        "\n",
        "# Inspect the log of the second user (index 1).\n",
        "print(user_log[1])"
      ],
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "  item  click\n",
            "0    S      0\n",
            "1    K      0\n",
            "2    H      0\n",
            "3    M      0\n",
            "4    J      0\n",
            "5    F      0\n",
            "6    R      1\n",
            "7    A      0\n",
            "8    D      0\n",
            "9    B      0\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "5imGnHW3ZaQt"
      },
      "source": [
        "## Step2\n",
        "Step1で作成されたランキングについて、nDCGを計算"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VEjoNuiDZKuy"
      },
      "source": [
        "# nDCG of each user's logged ranking, using the clicks as gains.\n",
        "scores = [ndcg(log['click']) for log in user_log]\n"
      ],
      "execution_count": 16,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "i5_v7-MCkV02",
        "outputId": "109a539d-18fe-4728-aec4-9c78d16b7096"
      },
      "source": [
        "# Mean nDCG over all users. Dividing by len(scores) keeps the divisor\n",
        "# tied to the list that is actually summed, even if num_users is edited\n",
        "# without re-running the cells above.\n",
        "ndcg_score = sum(scores) / len(scores)\n",
        "ndcg_score"
      ],
      "execution_count": 17,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.4969310364380767"
            ]
          },
          "metadata": {},
          "execution_count": 17
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "i821AM01XTL3"
      },
      "source": [
        ""
      ],
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "W8VvOQA5ewkr"
      },
      "source": [
        "## Step3\n",
        "新しくランキングを作る"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "7O8jfct_0JJm",
        "outputId": "14097cea-60f8-405d-fb5d-4c2938260319"
      },
      "source": [
        "# One new random ranking of 10 distinct items per user (no click column).\n",
        "user_log_new = [\n",
        "    pd.DataFrame({'item': np.random.choice(item_list, 10, replace=False)})\n",
        "    for _ in range(num_users)\n",
        "]\n",
        "\n",
        "# Inspect the new ranking of the second user (index 1).\n",
        "print(user_log_new[1])"
      ],
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "  item\n",
            "0    T\n",
            "1    G\n",
            "2    O\n",
            "3    N\n",
            "4    F\n",
            "5    C\n",
            "6    M\n",
            "7    P\n",
            "8    B\n",
            "9    L\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-K_ptf-6iXwk"
      },
      "source": [
        "## Step4\n",
        "Step1で作ったログとStep3で作ったランキングを組み合わせてnDCGを計算"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "tcmI6GaH3upk"
      },
      "source": [
        "scores_new = []\n",
        "\n",
        "# Pair each user's new ranking with that same user's logged clicks.\n",
        "for new_ranking, logged_clicks in zip(user_log_new, user_log):\n",
        "    # The left join keeps the new ranking's rows; items that do not appear\n",
        "    # in the user's log get NaN for 'click', which fillna(0) turns into\n",
        "    # \"not clicked\".\n",
        "    merged = pd.merge(new_ranking, logged_clicks, on=\"item\", how='left').fillna(0)\n",
        "    scores_new.append(ndcg(merged['click']))\n"
      ],
      "execution_count": 19,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "J93E4anFkR6U",
        "outputId": "5d16f83f-99fa-49ab-9b17-401916ab2658"
      },
      "source": [
        "# Mean nDCG of the new rankings. Dividing by len(scores_new) keeps the\n",
        "# divisor tied to the list that is actually summed.\n",
        "ndcg_score_new = sum(scores_new) / len(scores_new)\n",
        "ndcg_score_new"
      ],
      "execution_count": 20,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.331659871450936"
            ]
          },
          "metadata": {},
          "execution_count": 20
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "eHWeYxzecKti"
      },
      "source": [
        "## Step5\n",
        "一応t検定で有意差があるか確認してみる"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "mckN_Hy0cI-H",
        "outputId": "b40904ba-1c27-46cc-dc36-88645615878e"
      },
      "source": [
        "# Two-sample t-test on the per-user nDCG values, without assuming equal variances.\n",
        "stats.ttest_ind(a=scores, b=scores_new, equal_var=False)"
      ],
      "execution_count": 21,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "Ttest_indResult(statistic=12.71863865873347, pvalue=1.148335352312263e-35)"
            ]
          },
          "metadata": {},
          "execution_count": 21
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GFQMjBDLcMMd"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "bias_simulation.ipynb",
	"provenance": [],
	"collapsed_sections": [],
	"authorship_tag": "ABX9TyOhGVoyS3vliEjgl7uwWbII",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/nogawanogawa/4127b0cd42a34c375f1db017d13442c6/bias_simulation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "Tr5Ysfb0x0kx"
	},
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"from scipy import stats"
	],
	"execution_count": 11,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "1UQ0y5h4zrBz"
	},
	"source": [
	"# Fix NumPy's global RNG so every random draw below is reproducible.\n",
	"np.random.seed(42)"
	],
	"execution_count": 12,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "nxE7FCekjpb8"
	},
	"source": [
	"def dcg(gain, k=None):\n",
	"    \"\"\"Compute the discounted cumulative gain (DCG) of a ranked gain list.\n",
	"\n",
	"    The first item is counted undiscounted; the item at 0-based position\n",
	"    ``i >= 1`` is divided by ``log2(i + 1)``.\n",
	"\n",
	"    Args:\n",
	"        gain: 1-D sequence of gains in ranked order (list, ndarray or\n",
	"            pandas Series). It is read by position, never by index label.\n",
	"        k: Number of leading positions to score. ``None`` scores all of\n",
	"            them; values larger than the input are clamped to its length.\n",
	"\n",
	"    Returns:\n",
	"        The DCG as a float; ``0.0`` for an empty input or ``k <= 0``.\n",
	"    \"\"\"\n",
	"    # np.asarray drops any pandas index, so a Series whose labels are not\n",
	"    # 0..n-1 (e.g. after filtering or sorting) is still read positionally.\n",
	"    values = np.asarray(gain, dtype=float)\n",
	"    cutoff = values.shape[0] if k is None else min(k, values.shape[0])\n",
	"    if cutoff <= 0:\n",
	"        return 0.0\n",
	"\n",
	"    # Accumulate strictly left to right so the floating-point result does\n",
	"    # not depend on a vectorized summation order.\n",
	"    total = float(values[0])\n",
	"    for i in range(1, cutoff):\n",
	"        total += float(values[i] / np.log2(i + 1))\n",
	"    return total\n",
	"\n",
	"\n",
	"def ndcg(y, k=None, powered=False) -> float:\n",
	"    \"\"\"Compute the normalized DCG (nDCG) of gains given in ranked order.\n",
	"\n",
	"    Args:\n",
	"        y: 1-D sequence of gains in displayed order.\n",
	"        k: Optional cutoff applied to both the actual and the ideal DCG.\n",
	"        powered: Accepted for backward compatibility; currently unused.\n",
	"\n",
	"    Returns:\n",
	"        DCG of ``y`` divided by the DCG of ``y`` sorted in descending\n",
	"        order, or ``0.0`` when the ideal DCG is zero.\n",
	"    \"\"\"\n",
	"    gains = np.asarray(y, dtype=float)\n",
	"    ideal_dcg_score = dcg(np.sort(gains)[::-1], k=k)\n",
	"\n",
	"    # A list that was shown but never clicked has an ideal DCG of 0;\n",
	"    # define its nDCG as 0 instead of dividing by zero.\n",
	"    if ideal_dcg_score == 0:\n",
	"        return 0.0\n",
	"    return dcg(gains, k=k) / ideal_dcg_score\n"
	],
	"execution_count": 13,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "FT6hGm5bYkBm"
	},
	"source": [
	"## Step1\n",
	"ランダムにランキングが生成され、それをランダムにクリックされたときのログを生成する"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "sF5O0ZmEyPfy"
	},
	"source": [
	"# Item pool: the 20 letters 'A' through 'T'.\n",
	"item_list = list('ABCDEFGHIJKLMNOPQRST')\n",
	"\n",
	"# Number of simulated users.\n",
	"num_users = 1000"
	],
	"execution_count": 14,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "SGyS01sSPxZt",
	"outputId": "f6185393-5a48-46b2-f147-ebbd31123ae1"
	},
	"source": [
	"user_log = []\n",
	"\n",
	"for i in range(num_users):\n",
	"    # Draw the ranking before the clicks: the order of the RNG calls\n",
	"    # determines the seeded results.\n",
	"    shown_items = np.random.choice(item_list, 10, replace=False)\n",
	"    # Each of the 10 shown positions is clicked with probability 0.2.\n",
	"    clicks = np.random.binomial(1, 0.2, size=10)\n",
	"    df = pd.DataFrame({'item': shown_items, 'click': clicks})\n",
	"    user_log.append(df)\n",
	"\n",
	"# Inspect the log of the second user (index 1).\n",
	"print(user_log[1])"
	],
	"execution_count": 15,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	" item click\n",
	"0 S 0\n",
	"1 K 0\n",
	"2 H 0\n",
	"3 M 0\n",
	"4 J 0\n",
	"5 F 0\n",
	"6 R 1\n",
	"7 A 0\n",
	"8 D 0\n",
	"9 B 0\n"
	]
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "5imGnHW3ZaQt"
	},
	"source": [
	"## Step2\n",
	"Step1で作成されたランキングについて、nDCGを計算"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "VEjoNuiDZKuy"
	},
	"source": [
	"# nDCG of each user's logged ranking, using the clicks as gains.\n",
	"scores = [ndcg(log['click']) for log in user_log]\n"
	],
	"execution_count": 16,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "i5_v7-MCkV02",
	"outputId": "109a539d-18fe-4728-aec4-9c78d16b7096"
	},
	"source": [
	"# Mean nDCG over all users. Dividing by len(scores) keeps the divisor\n",
	"# tied to the list that is actually summed, even if num_users is edited\n",
	"# without re-running the cells above.\n",
	"ndcg_score = sum(scores) / len(scores)\n",
	"ndcg_score"
	],
	"execution_count": 17,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"0.4969310364380767"
	]
	},
	"metadata": {},
	"execution_count": 17
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "i821AM01XTL3"
	},
	"source": [
	""
	],
	"execution_count": 7,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "W8VvOQA5ewkr"
	},
	"source": [
	"## Step3\n",
	"新しくランキングを作る"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "7O8jfct_0JJm",
	"outputId": "14097cea-60f8-405d-fb5d-4c2938260319"
	},
	"source": [
	"# One new random ranking of 10 distinct items per user (no click column).\n",
	"user_log_new = [\n",
	"    pd.DataFrame({'item': np.random.choice(item_list, 10, replace=False)})\n",
	"    for _ in range(num_users)\n",
	"]\n",
	"\n",
	"# Inspect the new ranking of the second user (index 1).\n",
	"print(user_log_new[1])"
	],
	"execution_count": 18,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	" item\n",
	"0 T\n",
	"1 G\n",
	"2 O\n",
	"3 N\n",
	"4 F\n",
	"5 C\n",
	"6 M\n",
	"7 P\n",
	"8 B\n",
	"9 L\n"
	]
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "-K_ptf-6iXwk"
	},
	"source": [
	"## Step4\n",
	"Step1で作ったログとStep3で作ったランキングを組み合わせてnDCGを計算"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "tcmI6GaH3upk"
	},
	"source": [
	"scores_new = []\n",
	"\n",
	"# Pair each user's new ranking with that same user's logged clicks.\n",
	"for new_ranking, logged_clicks in zip(user_log_new, user_log):\n",
	"    # The left join keeps the new ranking's rows; items that do not appear\n",
	"    # in the user's log get NaN for 'click', which fillna(0) turns into\n",
	"    # \"not clicked\".\n",
	"    merged = pd.merge(new_ranking, logged_clicks, on=\"item\", how='left').fillna(0)\n",
	"    scores_new.append(ndcg(merged['click']))\n"
	],
	"execution_count": 19,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "J93E4anFkR6U",
	"outputId": "5d16f83f-99fa-49ab-9b17-401916ab2658"
	},
	"source": [
	"# Mean nDCG of the new rankings. Dividing by len(scores_new) keeps the\n",
	"# divisor tied to the list that is actually summed.\n",
	"ndcg_score_new = sum(scores_new) / len(scores_new)\n",
	"ndcg_score_new"
	],
	"execution_count": 20,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"0.331659871450936"
	]
	},
	"metadata": {},
	"execution_count": 20
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "eHWeYxzecKti"
	},
	"source": [
	"## Step5\n",
	"一応t検定で有意差があるか確認してみる"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "mckN_Hy0cI-H",
	"outputId": "b40904ba-1c27-46cc-dc36-88645615878e"
	},
	"source": [
	"# Two-sample t-test on the per-user nDCG values, without assuming equal variances.\n",
	"stats.ttest_ind(a=scores, b=scores_new, equal_var=False)"
	],
	"execution_count": 21,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"Ttest_indResult(statistic=12.71863865873347, pvalue=1.148335352312263e-35)"
	]
	},
	"metadata": {},
	"execution_count": 21
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "GFQMjBDLcMMd"
	},
	"source": [
	""
	],
	"execution_count": null,
	"outputs": []
	}
	]
	}