Skip to content

Instantly share code, notes, and snippets.

@reachsumit
Created November 7, 2022 01:20
Show Gist options
  • Save reachsumit/2a639276fc781870c4dcd480a3417bf9 to your computer and use it in GitHub Desktop.
Save reachsumit/2a639276fc781870c4dcd480a3417bf9 to your computer and use it in GitHub Desktop.
Factorization Machine
Display the source blob
Display the rendered blob
Raw
{
 "metadata": {"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},
 "nbformat_minor": 4,
 "nbformat": 4,
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Factorization Machine: next-movie prediction on MovieLens-100k\n",
    "\n",
    "Builds per-user running features from the MovieLens-100k ratings, trains a factorization-machine style classifier (`FM`) with full-batch Adam to predict the id of the next movie a user rates, and reports loss, accuracy and the mean rank of the true next movie on a chronological hold-out split.\n",
    "\n",
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import torch\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch.nn as nn\n",
    "\n",
    "from scipy.stats import rankdata"
   ],
   "metadata": {"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2022-11-06T20:43:34.765194Z","iopub.execute_input":"2022-11-06T20:43:34.765873Z","iopub.status.idle":"2022-11-06T20:43:36.977772Z","shell.execute_reply.started":"2022-11-06T20:43:34.765734Z","shell.execute_reply":"2022-11-06T20:43:36.976808Z"},"trusted":true},
   "execution_count": 1,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
    "PAD_IDX = 0  # class index skipped by the loss (passed as ignore_index to CrossEntropyLoss)\n",
    "SEED = 42\n",
    "\n",
    "# Seed torch so that the random initialisation of the model is repeatable across runs\n",
    "torch.manual_seed(SEED);"
   ],
   "metadata": {"execution":{"iopub.status.busy":"2022-11-06T20:43:36.979411Z","iopub.execute_input":"2022-11-06T20:43:36.979992Z","iopub.status.idle":"2022-11-06T20:43:37.045085Z","shell.execute_reply.started":"2022-11-06T20:43:36.979962Z","shell.execute_reply":"2022-11-06T20:43:37.044031Z"},"trusted":true},
   "execution_count": 2,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Metric helpers"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "def idx_to_sparse(idx, sparse_dim):\n",
    "    \"\"\"Return a one-hot integer pandas Series of length `sparse_dim` with a 1 at position `idx`.\n",
    "\n",
    "    `idx` is passed through int(), so float-valued ids such as 42.0 are accepted.\n",
    "    \"\"\"\n",
    "    sparse = np.zeros(sparse_dim, dtype=int)\n",
    "    sparse[int(idx)] = 1\n",
    "    return pd.Series(sparse, dtype=int)\n",
    "\n",
    "\n",
    "def accuracy_fn(y_true, y_pred):\n",
    "    \"\"\"Percentage (0-100) of positions at which `y_pred` equals `y_true`.\n",
    "\n",
    "    Returns 0.0 for empty input instead of raising ZeroDivisionError.\n",
    "    \"\"\"\n",
    "    if len(y_pred) == 0:\n",
    "        return 0.0\n",
    "    correct = torch.eq(y_true, y_pred).sum().item()  # number of matching positions\n",
    "    return correct / len(y_pred) * 100\n",
    "\n",
    "\n",
    "def mean_target_rank(logits, targets):\n",
    "    \"\"\"Mean rank (1 = highest score) of the target class within each row of `logits`.\n",
    "\n",
    "    Args:\n",
    "        logits: 2-D tensor of scores, one row per sample and one column per class.\n",
    "        targets: 1-D integer tensor with the target class index of every row.\n",
    "\n",
    "    Ranks are taken per row on the raw scores. Softmax is monotonic within a row, so\n",
    "    applying it first would not change the order; it is skipped on purpose.\n",
    "    Tied scores share their average rank (scipy.stats.rankdata default).\n",
    "    \"\"\"\n",
    "    scores = logits.detach().float().cpu().numpy()\n",
    "    target_idx = targets.detach().cpu().numpy().astype(int)\n",
    "    ranks = rankdata(-scores, axis=1)  # negate so that the largest score gets rank 1\n",
    "    return float(ranks[np.arange(len(target_idx)), target_idx].mean())"
   ],
   "metadata": {"execution":{"iopub.status.busy":"2022-11-06T20:43:37.047014Z","iopub.execute_input":"2022-11-06T20:43:37.047648Z","iopub.status.idle":"2022-11-06T20:43:37.068478Z","shell.execute_reply.started":"2022-11-06T20:43:37.047601Z","shell.execute_reply":"2022-11-06T20:43:37.067457Z"},"trusted":true},
   "execution_count": 3,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data preparation, model and training loop"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "def load_and_process_data_fm(data_dir='../input/movielens-100k-dataset/ml-100k',\n",
    "                             cold_start_thresh=5,\n",
    "                             test_size=0.2):\n",
    "    \"\"\"Load MovieLens-100k and build features/targets for next-movie prediction.\n",
    "\n",
    "    Every rating row becomes one sample: the features are the user's attributes and\n",
    "    running statistics up to and including that rating, the target is the id of the\n",
    "    next movie rated by the same user. Rows are sorted by timestamp and the last\n",
    "    `test_size` fraction is held out as the test set.\n",
    "\n",
    "    Args:\n",
    "        data_dir: folder containing u.data, u.user, u.item and u.genre.\n",
    "        cold_start_thresh: keep only rows where the user already has at least this many ratings.\n",
    "        test_size: fraction of the time-sorted rows used for testing.\n",
    "\n",
    "    Returns:\n",
    "        (X_train, X_test, target_train, target_test, target_test_sparse, n_classes).\n",
    "        X_* are float32 feature tensors and target_* int64 movie-id tensors on `device`;\n",
    "        target_test_sparse is a one-hot DataFrame of the test targets; n_classes is the\n",
    "        number of distinct movie ids plus one.\n",
    "    \"\"\"\n",
    "    # Raw tables\n",
    "    ratings = pd.read_csv(f'{data_dir}/u.data', sep='\\t', header=None)\n",
    "    ratings.columns = ['user id', 'movie id', 'rating', 'timestamp']\n",
    "    users = pd.read_csv(f'{data_dir}/u.user', sep='|', encoding='latin-1', header=None)\n",
    "    users.columns = ['user id', 'age', 'gender', 'occupation', 'zip code']\n",
    "    items = pd.read_csv(f'{data_dir}/u.item', sep='|', encoding='latin-1', header=None)\n",
    "    items.columns = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',\n",
    "                     'unknown', 'Action', 'Adventure', 'Animation', \"Children's\", 'Comedy',\n",
    "                     'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',\n",
    "                     'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']\n",
    "    genres = pd.read_csv(f'{data_dir}/u.genre', sep='|', header=None, usecols=[0])[0].tolist()\n",
    "\n",
    "    # Movie ids are used directly as class indices, hence one extra class for index 0\n",
    "    n_classes = items['movie id'].nunique() + 1\n",
    "\n",
    "    # Per-user chronological history\n",
    "    dataset = ratings.sort_values(['user id', 'timestamp']).reset_index(drop=True)\n",
    "    # 1-based position of the row inside the user's history\n",
    "    dataset['sample_num'] = dataset.groupby('user id').cumcount() + 1\n",
    "    # Target = id of the next movie rated by the same user (NaN on the user's last row)\n",
    "    dataset['target'] = dataset.groupby('user id')['movie id'].shift(-1)\n",
    "    # Running mean of the user's ratings, current row included\n",
    "    dataset['mean_rate'] = dataset.groupby('user id')['rating'].cumsum() / dataset['sample_num']\n",
    "\n",
    "    # Attach the 0/1 genre flags of the rated movie\n",
    "    dataset = dataset.merge(items[['movie id'] + genres], on='movie id', how='left')\n",
    "\n",
    "    for genre in genres:\n",
    "        rating_in_genre = dataset[genre] * dataset['rating']\n",
    "        # Running count of movies of this genre rated by the user so far\n",
    "        dataset[genre] = dataset.groupby('user id')[genre].cumsum()\n",
    "        # Running mean rating inside the genre; 0/0 gives NaN until the user's first\n",
    "        # movie of that genre, which is replaced by 0 when the tensors are built\n",
    "        dataset[f'{genre}_rate'] = rating_in_genre.groupby(dataset['user id']).cumsum() / dataset[genre]\n",
    "\n",
    "    # Turn the genre counts into the share of the user's history that falls in each genre\n",
    "    dataset[genres] = dataset[genres].div(dataset['sample_num'], axis=0)\n",
    "\n",
    "    # User attributes\n",
    "    dataset = dataset.merge(users, on='user id', how='left')\n",
    "    occupation_cols = dataset['occupation'].unique().tolist()\n",
    "    dataset['gender'] = (dataset['gender'] == 'M').astype(int)\n",
    "    # dtype=int keeps the dummy columns numeric on every pandas version (newer releases default to bool)\n",
    "    occupation_dummies = pd.get_dummies(dataset['occupation'], prefix='', prefix_sep='', dtype=int)\n",
    "    dataset = pd.concat([dataset.drop(columns=['occupation', 'zip code']), occupation_dummies], axis=1)\n",
    "\n",
    "    # Drop cold-start rows and rows without a next movie, then order everything by time\n",
    "    keep = (dataset['sample_num'] >= cold_start_thresh) & dataset['target'].notna()\n",
    "    filtered = dataset[keep].sort_values('timestamp')\n",
    "\n",
    "    feature_cols = (['age', 'gender', 'mean_rate'] + genres\n",
    "                    + [f'{genre}_rate' for genre in genres] + occupation_cols)\n",
    "\n",
    "    # Chronological split: the most recent rows form the test set\n",
    "    split_at = int(len(filtered) * (1 - test_size))\n",
    "    train_df, test_df = filtered.iloc[:split_at], filtered.iloc[split_at:]\n",
    "\n",
    "    def to_features(frame):\n",
    "        values = frame[feature_cols].fillna(0).to_numpy(dtype=np.float32)\n",
    "        return torch.from_numpy(values).to(device)\n",
    "\n",
    "    def to_targets(frame):\n",
    "        return torch.from_numpy(frame['target'].to_numpy(dtype=np.int64)).to(device)\n",
    "\n",
    "    # One-hot matrix of the test targets (indexed like the test rows), built in one vectorised step\n",
    "    test_ids = test_df['target'].to_numpy(dtype=np.int64)\n",
    "    one_hot = np.zeros((len(test_ids), n_classes), dtype=int)\n",
    "    one_hot[np.arange(len(test_ids)), test_ids] = 1\n",
    "    target_test_sparse = pd.DataFrame(one_hot, index=test_df.index)\n",
    "\n",
    "    return (to_features(train_df), to_features(test_df),\n",
    "            to_targets(train_df), to_targets(test_df),\n",
    "            target_test_sparse, n_classes)\n",
    "\n",
    "\n",
    "class FM(nn.Module):\n",
    "    \"\"\"Factorization-machine style classifier.\n",
    "\n",
    "    The output is a per-class linear term plus a pairwise-interaction term computed\n",
    "    from the factor matrix `V` of shape (input_dim, k).\n",
    "\n",
    "    NOTE(review): the interaction term has shape (batch, 1) and is broadcast to every\n",
    "    class logit. A constant added to all logits of a row cancels in softmax and in\n",
    "    cross-entropy, so as written `V` does not influence class probabilities. Making the\n",
    "    factors class-specific would change the model and its memory cost, so it is only\n",
    "    flagged here.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, input_dim, n_class, k=5):\n",
    "        super().__init__()\n",
    "        # Factors start from a standard normal sample and live on the same device as\n",
    "        # the linear layer, so forward() does not have to copy them on every call\n",
    "        self.V = nn.Parameter(torch.randn(input_dim, k).to(device), requires_grad=True)\n",
    "        self.linear_layer = nn.Linear(input_dim, n_class, device=device)\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Return logits of shape (batch, n_class) for inputs `x` of shape (batch, input_dim).\"\"\"\n",
    "        xv = torch.matmul(x, self.V)\n",
    "        square_of_sum = xv.pow(2).sum(1, keepdim=True)  # S_1^2\n",
    "        sum_of_square = torch.matmul(x.pow(2), self.V.pow(2)).sum(1, keepdim=True)  # S_2\n",
    "        out_inter = 0.5 * (square_of_sum - sum_of_square)\n",
    "        return out_inter + self.linear_layer(x)\n",
    "\n",
    "\n",
    "def run_gradient_descent_fm(model,\n",
    "                            learning_rate=1e-3,\n",
    "                            weight_decay=0.01,\n",
    "                            num_epochs=10,\n",
    "                            X_train=None,\n",
    "                            y_train=None,\n",
    "                            X_test=None,\n",
    "                            y_test=None,\n",
    "                            log_every=100):\n",
    "    \"\"\"Train `model` with full-batch Adam and plot the loss and mean-rank curves.\n",
    "\n",
    "    Args:\n",
    "        model: module mapping a feature tensor to class logits.\n",
    "        learning_rate, weight_decay: Adam hyper-parameters.\n",
    "        num_epochs: number of full-batch updates.\n",
    "        X_train, y_train, X_test, y_test: data tensors. When omitted, the notebook-level\n",
    "            X_train_wide_wo_cross_tensor, target_train, X_test_wide_wo_cross_tensor and\n",
    "            target_test are used.\n",
    "        log_every: evaluate, print and record metrics every `log_every` epochs.\n",
    "\n",
    "    Returns:\n",
    "        (model, iters, train_losses, test_losses), the lists holding one entry per logged epoch.\n",
    "    \"\"\"\n",
    "    X_train = X_train_wide_wo_cross_tensor if X_train is None else X_train\n",
    "    y_train = target_train if y_train is None else y_train\n",
    "    X_test = X_test_wide_wo_cross_tensor if X_test is None else X_test\n",
    "    y_test = target_test if y_test is None else y_test\n",
    "\n",
    "    loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)  # the model doesn't need to predict padding index\n",
    "    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)\n",
    "\n",
    "    iters, train_losses, test_losses, mean_test_ranks = [], [], [], []\n",
    "\n",
    "    for epoch in range(num_epochs):\n",
    "        model.train()\n",
    "        y_logits = model(X_train)\n",
    "        loss_train = loss_fn(y_logits, y_train)\n",
    "\n",
    "        optimizer.zero_grad()  # clear gradients left over from the previous step\n",
    "        loss_train.backward()  # compute gradients for each parameter\n",
    "        optimizer.step()       # apply the update\n",
    "\n",
    "        if epoch % log_every != 0:\n",
    "            continue\n",
    "\n",
    "        # Train metrics come from the logits computed before this epoch's update\n",
    "        acc = accuracy_fn(y_true=y_train, y_pred=y_logits.argmax(dim=1))\n",
    "\n",
    "        model.eval()\n",
    "        with torch.inference_mode():\n",
    "            test_logits = model(X_test)\n",
    "            loss_test = loss_fn(test_logits, y_test)\n",
    "            test_acc = accuracy_fn(y_true=y_test, y_pred=test_logits.argmax(dim=1))\n",
    "            # Rank of the true next movie among all classes, per test row\n",
    "            mean_rank = mean_target_rank(test_logits, y_test)\n",
    "\n",
    "        print(f'Epoch: {epoch} | Loss: {loss_train:.5f}, Acc: {acc:.2f}% | Test Loss: {loss_test:.5f}, Test Acc: {test_acc:.2f}% Test mean rank: {mean_rank:.0f}')\n",
    "\n",
    "        iters.append(epoch)\n",
    "        train_losses.append(loss_train.item())\n",
    "        test_losses.append(loss_test.item())\n",
    "        mean_test_ranks.append(mean_rank)\n",
    "\n",
    "    # Loss curves\n",
    "    fig, ax = plt.subplots(figsize=(12, 8), dpi=100)\n",
    "    ax.plot(iters, train_losses, label='Train Loss')\n",
    "    ax.plot(iters, test_losses, label='Test Loss')\n",
    "    ax.set(title=f'Training Curve (lr={learning_rate})', xlabel='Iterations', ylabel='Loss')\n",
    "    ax.legend(loc='best')\n",
    "    plt.show()\n",
    "\n",
    "    # Mean rank of the true next movie on the test set\n",
    "    fig, ax = plt.subplots(figsize=(12, 8), dpi=100)\n",
    "    ax.plot(iters, mean_test_ranks, label='Test Rank')\n",
    "    ax.set(xlabel='Iterations', ylabel='Mean Rank on testset')\n",
    "    ax.legend(loc='best')\n",
    "    plt.show()\n",
    "\n",
    "    return model, iters, train_losses, test_losses"
   ],
   "metadata": {"execution":{"iopub.status.busy":"2022-11-06T20:43:37.072212Z","iopub.execute_input":"2022-11-06T20:43:37.072511Z","iopub.status.idle":"2022-11-06T20:43:37.104923Z","shell.execute_reply.started":"2022-11-06T20:43:37.072485Z","shell.execute_reply":"2022-11-06T20:43:37.103989Z"},"trusted":true},
   "execution_count": 4,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the data and build the model"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "X_train_wide_wo_cross_tensor, X_test_wide_wo_cross_tensor, target_train, target_test, target_test_sparse,n_classes = load_and_process_data_fm()"
   ],
   "metadata": {"execution":{"iopub.status.busy":"2022-11-06T20:43:37.106291Z","iopub.execute_input":"2022-11-06T20:43:37.106808Z","iopub.status.idle":"2022-11-06T20:43:47.576119Z","shell.execute_reply.started":"2022-11-06T20:43:37.106773Z","shell.execute_reply":"2022-11-06T20:43:47.575200Z"},"trusted":true},
   "execution_count": 5,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "fm_model = FM(input_dim=X_train_wide_wo_cross_tensor.shape[1], n_class=n_classes)"
   ],
   "metadata": {"execution":{"iopub.status.busy":"2022-11-06T20:43:47.577735Z","iopub.execute_input":"2022-11-06T20:43:47.578134Z","iopub.status.idle":"2022-11-06T20:43:47.586428Z","shell.execute_reply.started":"2022-11-06T20:43:47.578096Z","shell.execute_reply":"2022-11-06T20:43:47.585462Z"},"trusted":true},
   "execution_count": 6,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train\n",
    "\n",
    "**Note on the stored output:** the log below was recorded by the original version of this notebook, in which the mean rank was computed after a softmax over the batch dimension (`dim=0`) instead of per row. The constant `Test mean rank: 842` in that log equals (1683 + 1) / 2, the rank every class receives when all scores in a row are tied, which is consistent with that bug. Restart the kernel and run all cells to refresh the numbers with the corrected `mean_target_rank`."
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "fm_model_trained, iters, train_losses, test_losses = run_gradient_descent_fm(fm_model, num_epochs=1000, weight_decay=0, learning_rate=0.03)"
   ],
   "metadata": {"execution":{"iopub.status.busy":"2022-11-06T20:43:47.588177Z","iopub.execute_input":"2022-11-06T20:43:47.588602Z","iopub.status.idle":"2022-11-06T20:45:34.783968Z","shell.execute_reply.started":"2022-11-06T20:43:47.588565Z","shell.execute_reply":"2022-11-06T20:45:34.783036Z"},"trusted":true},
   "execution_count": 7,
   "outputs": [
    {"name":"stdout","text":"Epoch: 0 | Loss: 9.84589, Acc: 0.11% | Test Loss: 8.87657, Test Acc: 0.24% Test mean rank: 842\nEpoch: 100 | Loss: 6.04683, Acc: 1.47% | Test Loss: 6.94882, Test Acc: 0.73% Test mean rank: 842\nEpoch: 200 | Loss: 5.94247, Acc: 1.73% | Test Loss: 7.06234, Test Acc: 0.73% Test mean rank: 842\nEpoch: 300 | Loss: 5.89036, Acc: 1.84% | Test Loss: 7.16583, Test Acc: 0.72% Test mean rank: 842\nEpoch: 400 | Loss: 5.87478, Acc: 1.85% | Test Loss: 7.28036, Test Acc: 0.62% Test mean rank: 842\nEpoch: 500 | Loss: 5.88775, Acc: 1.79% | Test Loss: 7.41003, Test Acc: 0.60% Test mean rank: 842\nEpoch: 600 | Loss: 6.03344, Acc: 1.64% | Test Loss: 7.66681, Test Acc: 0.54% Test mean rank: 842\nEpoch: 700 | Loss: 6.16944, Acc: 1.46% | Test Loss: 7.89027, Test Acc: 0.54% Test mean rank: 842\nEpoch: 800 | Loss: 6.37742, Acc: 1.43% | Test Loss: 8.26573, Test Acc: 0.48% Test mean rank: 842\nEpoch: 900 | Loss: 6.65704, Acc: 1.26% | Test Loss: 8.66080, Test Acc: 0.55% Test mean rank: 842\n","output_type":"stream"},
    {"output_type":"display_data","data":{"text/plain":"<Figure size 1200x800 with 1 Axes>","image/png":"\n"},"metadata":{"needs_background":"light"}},
    {"output_type":"display_data","data":{"text/plain":"<Figure size 1200x800 with 1 Axes>","image/png":"\n"},"metadata":{"needs_background":"light"}}
   ]
  }
 ]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment