Skip to content

Instantly share code, notes, and snippets.

@ita9naiwa
Last active December 12, 2021 15:16
Show Gist options
  • Save ita9naiwa/1999469f0ccbc9e4fef790fa51504b98 to your computer and use it in GitHub Desktop.
Save ita9naiwa/1999469f0ccbc9e4fef790fa51504b98 to your computer and use it in GitHub Desktop.
alpha-beta-NDCG
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "ee9fbdce",
"metadata": {},
"outputs": [],
"source": [
"def rowwise_norm(arr, norm='mean'):\n",
"    \"\"\"Scale every row of ``arr`` so that its entries sum to (almost exactly) one.\n",
"\n",
"    Each row is cast to float32 and divided by ``1e-10 + row.sum()``; the tiny\n",
"    constant means an all-zero row stays all zero instead of dividing by zero.\n",
"\n",
"    Args:\n",
"        arr: Array whose rows are read as ``arr[i]`` for ``i < arr.shape[0]``.\n",
"        norm: Accepted for interface compatibility; the body never reads it.\n",
"\n",
"    Returns:\n",
"        A float32 ``np.ndarray`` assembled from the rescaled rows.\n",
"    \"\"\"\n",
"    def _scale(row):\n",
"        values = row.astype(np.float32)\n",
"        return values / (1e-10 + values.sum())\n",
"\n",
"    scaled_rows = [_scale(arr[idx]) for idx in range(arr.shape[0])]\n",
"    return np.asarray(scaled_rows).astype(np.float32)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f758b56e",
"metadata": {},
"outputs": [],
"source": [
"from tqdm.auto import tqdm\n",
"\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from scipy.sparse import coo_matrix, csr_matrix\n",
"from implicit import evaluation"
]
},
{
"cell_type": "markdown",
"id": "538195b2",
"metadata": {},
"source": [
"### Data Preparation"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b756a329",
"metadata": {},
"outputs": [],
"source": [
"# Read data/ml-1m/ratings.dat ('::'-separated, no header row) into an integer array.\n",
"ui_mat = pd.read_csv(\"data/ml-1m/ratings.dat\", sep='::', engine='python', encoding='ISO-8859-1', header=None).to_numpy().astype(int)\n",
"# Columns 0 and 1 become the row / column indices of the sparse matrix below, shifted\n",
"# down by one; column 2 becomes its values (presumably user id, item id, rating - confirm).\n",
"u = ui_mat[:, 0] - 1\n",
"i = ui_mat[:, 1] - 1\n",
"r = ui_mat[:, 2]\n",
"# NOTE(review): `ui_mat` is rebound from the raw array to the sparse matrix here, and the\n",
"# matrix shape is inferred from the largest indices because no shape is passed.\n",
"ui_mat = csr_matrix((r, (u, i,)))\n",
"# Keep only entries whose value is at least 4 and store them as ones.\n",
"ui_mat = ui_mat >= 4\n",
"ui_mat.eliminate_zeros()\n",
"ui_mat.data[:] = 1"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9eefcb4e",
"metadata": {},
"outputs": [],
"source": [
"# Split the interaction matrix into a train (`tr`) and a test (`te`) matrix.\n",
"# NOTE(review): 0.5 is the second positional argument (presumably the train fraction) and\n",
"# no random state is passed, so the split may differ between runs - confirm against implicit.\n",
"tr, te = evaluation.train_test_split(ui_mat, 0.5)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4904684b",
"metadata": {},
"outputs": [],
"source": [
"# Dimensions of the training matrix: rows index users, columns index items.\n",
"n_users, n_items = tr.shape"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "8ba29d2d",
"metadata": {},
"outputs": [],
"source": [
"# Read data/ml-1m/movies.dat ('::'-separated, no header row) and name its three columns.\n",
"genres = pd.read_csv(\"data/ml-1m/movies.dat\", sep='::', engine='python', encoding='ISO-8859-1', header=None)\n",
"genres.columns = ['id', 'title', 'genre']\n",
"# Rebind `genres` to a dict: id -> list of genre names (the genre field split on '|').\n",
"genres = {x: y.strip().split('|') for (x, y) in zip(genres['id'], genres['genre'])}\n",
"\n",
"# Collect every distinct genre name, then give each one an integer index.\n",
"# NOTE(review): the indices come from iterating a set, so the genre -> index mapping is\n",
"# not guaranteed to be identical across kernel restarts.\n",
"# The loop variable `i` stays defined after the loop; the next cell reads it.\n",
"unique_genres = set()\n",
"for i, g in genres.items():\n",
" unique_genres |= set(g)\n",
"genre_map = {g:i for (i, g) in enumerate(unique_genres)}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "bf67112b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Drama', 'Thriller']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# NOTE(review): `i` is not assigned in this cell; it is whatever the last `for i, g in\n",
"# genres.items()` loop left behind, so this displays the genres of the last id visited.\n",
"genres[i]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "5a786fc0",
"metadata": {},
"outputs": [],
"source": [
"# Item x genre indicator matrix: one row per item column of `tr`, one column per genre.\n",
"# Rows of items that have no entry in `genres` stay all zero.\n",
"genre_mat = np.zeros((n_items, len(unique_genres)))\n",
"for item_idx in range(n_items):\n",
"    # Matrix columns are 0-based (the ids were shifted by -1 when the rating matrix\n",
"    # was built), while the keys of `genres` are the original ids.\n",
"    movie_id = item_idx + 1\n",
"    for genre_name in genres.get(movie_id, ()):\n",
"        genre_mat[item_idx, genre_map[genre_name]] = 1"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "bb4a8508",
"metadata": {},
"outputs": [],
"source": [
"# Number of genre columns, followed by an int32 copy of the item x genre matrix.\n",
"num_topics = genre_mat.shape[1]\n",
"genre_mat = genre_mat.astype(np.int32)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7d88b012",
"metadata": {},
"outputs": [],
"source": [
"# Assuming `tr` is a scipy sparse matrix, `*` is a matrix product here, so row u of `res`\n",
"# adds up the genre rows of the items in user u's training row - TODO confirm the type of `tr`.\n",
"res = tr * genre_mat\n",
"# Rescale each user's row so that it sums to (about) one.\n",
"user_phi_dist = rowwise_norm(res)\n",
"# Sparse copy of the item x genre matrix, used by the metric functions for row lookups.\n",
"genre_csr = csr_matrix(genre_mat)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "55c686bc",
"metadata": {},
"outputs": [],
"source": [
"def get_gain(test_item_list, uid_true, genre_csr, user_pref, alpha=0.05, beta=0.99):\n",
"    \"\"\"Return the per-position gains of a ranked item list for alpha-beta-NDCG.\n",
"\n",
"    For the item at each position the gain is\n",
"    ``1 - prod_c (1 - P * user_pref[c] * (1 - alpha) ** tau[c] * (1 - beta) ** rho[c])``\n",
"    taken over the genres ``c`` of that item. ``P`` is ``beta`` when the item is in\n",
"    ``uid_true`` and ``alpha`` otherwise; ``tau[c]`` counts the earlier items in the\n",
"    list that carry genre ``c`` and ``rho[c]`` counts how many of those were hits.\n",
"\n",
"    Args:\n",
"        test_item_list: Ranked sequence of item ids (row indices of ``genre_csr``).\n",
"        uid_true: Iterable of the item ids that count as hits for this user.\n",
"        genre_csr: Sparse item x genre matrix; ``genre_csr[iid].indices`` must list\n",
"            the genre columns of item ``iid``.\n",
"        user_pref: Per-genre weights of the user, indexable by genre column.\n",
"        alpha: Value used for ``P`` on non-hit items, and base of the ``tau`` decay.\n",
"        beta: Value used for ``P`` on hit items, and base of the ``rho`` decay.\n",
"\n",
"    Returns:\n",
"        A list with one gain per entry of ``test_item_list``, in the same order.\n",
"    \"\"\"\n",
"    num_topics = genre_csr.shape[1]\n",
"    # Hash lookup instead of scanning the array for every ranked item.\n",
"    relevant = set(uid_true)\n",
"    rho = np.zeros(num_topics)\n",
"    tau = np.zeros(num_topics)\n",
"    gains = []\n",
"    for iid in test_item_list:\n",
"        hit = int(iid in relevant)\n",
"        # Identical for every genre of this item, so it is computed once per item.\n",
"        P_a_u_i = (1 - hit) * alpha + hit * beta\n",
"        p = 1\n",
"        for c in genre_csr[iid].indices:\n",
"            p *= (1 - P_a_u_i * user_pref[c] * ((1 - alpha) ** tau[c]) * ((1 - beta) ** rho[c]))\n",
"            # Update the counters only after this item's own factor has used them.\n",
"            tau[c] += 1\n",
"            rho[c] += hit\n",
"        gains.append(1.0 - p)\n",
"    return gains\n",
"\n",
"def get_ideal_order(test_item_list, uid_true, genre_csr, user_pref, diversity_weight=0.0):\n",
"    \"\"\"Greedily reorder ``test_item_list`` into a reference ranking.\n",
"\n",
"    At every step the remaining hit items (those in ``uid_true``) are the\n",
"    candidates; once no hit is left, all remaining items are. The candidate with\n",
"    the highest score is appended, where the score is the preference mass of the\n",
"    item's genres minus ``diversity_weight / k`` times how often those genres were\n",
"    already picked (``k`` is the length of the input list). Ties go to the\n",
"    candidate that appears first.\n",
"\n",
"    Args:\n",
"        test_item_list: Sequence of item ids (row indices of ``genre_csr``).\n",
"        uid_true: Iterable of the item ids that count as hits for this user.\n",
"        genre_csr: Sparse item x genre matrix (``scipy.sparse.csr_matrix``).\n",
"        user_pref: 1-D array of per-genre weights, one entry per genre column.\n",
"        diversity_weight: Weight of the repeated-genre penalty. The default of\n",
"            0.0 switches the penalty off.\n",
"\n",
"    Returns:\n",
"        A list holding the same items as ``test_item_list`` in greedy order.\n",
"    \"\"\"\n",
"    remaining = np.copy(test_item_list).tolist()\n",
"    relevant = set(uid_true)\n",
"\n",
"    k = len(remaining)\n",
"    # Sized from the matrix itself rather than from a notebook-level global.\n",
"    topic_array = np.zeros(genre_csr.shape[1])\n",
"    ideal_list = []\n",
"    for _ in range(k):\n",
"        candidates = [x for x in remaining if x in relevant]\n",
"        if len(candidates) == 0:\n",
"            candidates = remaining\n",
"\n",
"        scores = []\n",
"        for iid in candidates:\n",
"            row = genre_csr[iid]\n",
"            score = (row * user_pref).sum() - (diversity_weight / k) * (row * topic_array).sum()\n",
"            scores.append(score)\n",
"\n",
"        chosen_item = candidates[int(np.argmax(scores))]\n",
"        ideal_list.append(chosen_item)\n",
"        topic_array = topic_array + np.asarray(genre_csr[chosen_item].todense()).ravel()\n",
"        # Drop a single occurrence so a list with repeated ids is still fully consumed.\n",
"        remaining.remove(chosen_item)\n",
"    return ideal_list"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "7dd47079",
"metadata": {},
"outputs": [],
"source": [
"def get_ideal_order_fast(test_item_list, uid_true, genre_csr, user_pref):\n",
"    \"\"\"Sort ``test_item_list`` by a one-shot score: hits first, then by preference.\n",
"\n",
"    Every item scores the preference mass of its genres (``genre_csr[item] *\n",
"    user_pref``) plus a bonus of 100 when the item is in ``uid_true``. Items are\n",
"    returned by descending score; equal scores keep their input order.\n",
"\n",
"    Args:\n",
"        test_item_list: Sequence of item ids (row indices of ``genre_csr``).\n",
"        uid_true: Container of the item ids that count as hits for this user.\n",
"        genre_csr: Sparse item x genre matrix (``scipy.sparse.csr_matrix``).\n",
"        user_pref: 1-D array of per-genre weights, one entry per genre column.\n",
"\n",
"    Returns:\n",
"        A new ``np.ndarray`` with the items of ``test_item_list`` reordered.\n",
"    \"\"\"\n",
"    items = np.asarray(test_item_list)\n",
"    if items.size == 0:\n",
"        # Nothing to rank; also avoids indexing the sparse matrix with an empty array.\n",
"        return items.copy()\n",
"\n",
"    # NOTE(review): the bonus only puts every hit ahead of every non-hit while the\n",
"    # preference term stays below 100 - verify if user_pref is not a normalised row.\n",
"    hit = np.array([100 * (x in uid_true) for x in items])\n",
"    score = hit + (genre_csr[items] * user_pref)\n",
"    # A stable sort makes the order of tied items deterministic.\n",
"    order = np.argsort(-score, kind='stable')\n",
"    return items[order]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "653b59af",
"metadata": {},
"outputs": [],
"source": [
"def ab_ndcg(uid, test_item_list, te_mat, genre_csr, user_phi_dist):\n",
"    \"\"\"Score one user's ranked list with alpha-beta-NDCG.\n",
"\n",
"    The gains of ``test_item_list`` and of the ordering produced by\n",
"    ``get_ideal_order_fast`` are both discounted by ``1 / log2(rank + 1)`` and\n",
"    summed; the result is the ratio of the two sums, capped at 1.\n",
"\n",
"    Args:\n",
"        uid: Row index of the user in ``te_mat`` and ``user_phi_dist``.\n",
"        test_item_list: Ranked sequence of item ids to evaluate.\n",
"        te_mat: Sparse matrix whose row ``uid`` lists the user's hit items in ``.indices``.\n",
"        genre_csr: Sparse item x genre matrix handed on to the gain helpers.\n",
"        user_phi_dist: Per-user genre weights; row ``uid`` is handed on to the helpers.\n",
"\n",
"    Returns:\n",
"        The capped ratio; 1e-10 is added to the denominator to avoid dividing by zero.\n",
"    \"\"\"\n",
"    relevant_items = te_mat[uid].indices\n",
"    preference = user_phi_dist[uid]\n",
"    # Position discounts for ranks 1..len(test_item_list), shared by both sums.\n",
"    ranks = np.arange(1, 1 + len(test_item_list))\n",
"    discount = 1 / np.log2(1 + ranks)\n",
"\n",
"    actual_gains = get_gain(test_item_list, relevant_items, genre_csr, preference)\n",
"    ideal_ranking = get_ideal_order_fast(test_item_list, relevant_items, genre_csr, preference)\n",
"    best_gains = get_gain(ideal_ranking, relevant_items, genre_csr, preference)\n",
"\n",
"    achieved = np.sum(actual_gains * discount)\n",
"    attainable = np.sum(best_gains * discount)\n",
"    return min(1, achieved / (1e-10 + attainable))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "1bb7e11d",
"metadata": {},
"outputs": [],
"source": [
"from implicit.als import AlternatingLeastSquares as ALS\n",
"from implicit.bpr import BayesianPersonalizedRanking as BPR"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "d1448f0e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:Intel MKL BLAS detected. Its highly recommend to set the environment variable 'export MKL_NUM_THREADS=1' to disable its internal multithreading\n"
]
}
],
"source": [
"# Create both recommenders without passing any hyperparameters.\n",
"model = ALS()\n",
"bpr = BPR()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "b847777d",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "aa9a1deb3526421385a95e84ce0ee590",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/15 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b9483b905f09498090bcf571042ba856",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/100 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Fit both models on the transposed training matrix; the ALS input is multiplied by 10.\n",
"# NOTE(review): presumably this implicit release expects item x user input and the factor\n",
"# acts as a confidence weight - confirm against the installed version.\n",
"model.fit(tr.T * 10, )\n",
"bpr.fit(tr.T,)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "527fe5ea",
"metadata": {},
"outputs": [],
"source": [
"# Repeats the `n_users, n_items = tr.shape` assignment made right after the train/test split.\n",
"n_users, n_items = tr.shape"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "01b0fd48",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c08b02f378ac4827927a2b18455fdca4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/6040 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Candidate list per user: up to 100 randomly drawn items that are not in the user's\n",
"# training row, plus up to 3 items drawn from the user's test row.\n",
"# NOTE(review): np.random is not seeded here, so the candidates change between runs.\n",
"user_target_list = []\n",
"for u in tqdm(range(n_users)):\n",
"    # Slice the sparse rows once per user instead of once per sampled item.\n",
"    train_items = set(tr[u].indices)\n",
"    test_items = te[u].indices\n",
"    a = [x for x in np.random.choice(n_items, 100, replace=False) if x not in train_items]\n",
"    if len(test_items) > 0:\n",
"        already_drawn = set(a)\n",
"        held_out = np.random.choice(test_items, min(3, len(test_items)), replace=False).tolist()\n",
"        # A held-out item may already be among the random draws; skip it so the\n",
"        # candidate list never contains the same item twice.\n",
"        a += [x for x in held_out if x not in already_drawn]\n",
"    user_target_list.append(a)"
]
},
{
"cell_type": "markdown",
"id": "c9a3d8fb",
"metadata": {},
"source": [
"## Random Recommendation"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "8ac87c68",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "76763aafc95742ca9f2d876df8af46a8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/6040 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.utils import shuffle\n",
"# Baseline: score a randomly shuffled version of every user's candidate list.\n",
"# NOTE(review): no random_state is passed to shuffle, so the scores vary between runs.\n",
"ndcgs = []\n",
"for uid in tqdm(range(n_users)):\n",
"    rec = shuffle(user_target_list[uid])\n",
"    ndcg = ab_ndcg(uid, rec, te, genre_csr, user_phi_dist)\n",
"    ndcgs.append(ndcg)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "c5022d49",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.38296358741994113"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Mean of the per-user scores currently held in `ndcgs`.\n",
"np.mean(ndcgs)"
]
},
{
"cell_type": "markdown",
"id": "44debbae",
"metadata": {},
"source": [
"## ALS"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "bed7001b",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f539f2449c31431284e589715c718ea5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/6040 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"0.7467418337430999"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rank every user's candidate list with the fitted ALS model and score each ranking.\n",
"# `rank_items` results are reduced to x[0] (presumably the item id of each pair - confirm).\n",
"# `ndcgs` is rebound to a fresh list, discarding the scores of any earlier evaluation.\n",
"from tqdm.auto import tqdm\n",
"ndcgs = []\n",
"for uid in tqdm(range(n_users)):\n",
" rec = [x[0] for x in model.rank_items(uid, tr, user_target_list[uid])]\n",
" ndcg = ab_ndcg(uid, rec, te, genre_csr, user_phi_dist)\n",
" ndcgs.append(ndcg)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "c8af11e3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7467418337430999"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Mean of the per-user scores currently held in `ndcgs`.\n",
"np.mean(ndcgs)"
]
},
{
"cell_type": "markdown",
"id": "d5d6da52",
"metadata": {},
"source": [
"## BPRMF"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86fa474e",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ba609d3a3c564d1e815fe1b138e75905",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/6040 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Rank every user's candidate list with the fitted BPR model and score each ranking.\n",
"# `rank_items` results are reduced to x[0] (presumably the item id of each pair - confirm).\n",
"# `ndcgs` is rebound to a fresh list, discarding the scores of any earlier evaluation.\n",
"from tqdm.auto import tqdm\n",
"ndcgs = []\n",
"for uid in tqdm(range(n_users)):\n",
" rec = [x[0] for x in bpr.rank_items(uid, tr, user_target_list[uid])]\n",
" ndcg = ab_ndcg(uid, rec, te, genre_csr, user_phi_dist)\n",
" ndcgs.append(ndcg)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5e4cac4b",
"metadata": {},
"outputs": [],
"source": [
"# Mean of the per-user scores currently held in `ndcgs`.\n",
"np.mean(ndcgs)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment