@jbochi
Last active October 18, 2023 07:52
Recommending GitHub repositories with Google Big Query and implicit library: https://medium.com/@jbochi/recommending-github-repositories-with-google-bigquery-and-the-implicit-library-e6cce666c77
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from scipy.sparse import coo_matrix\n",
"from implicit.als import AlternatingLeastSquares\n",
"from implicit.utils import nonzeros"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"project_id = \"CHANGEME\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requesting query... ok.\n",
"Query running...\n",
" Elapsed 11.25 s. Waiting...\n",
"Query done.\n",
"Processed: 9.4 GB\n",
"\n",
"Retrieving results...\n",
" Got page: 1; 46.0% done. Elapsed 24.05 s.\n",
" Got page: 2; 92.0% done. Elapsed 41.02 s.\n",
" Got page: 3; 100.0% done. Elapsed 49.03 s.\n",
"Got 217819 rows.\n",
"\n",
"Total time taken 50.26 s.\n",
"Finished at 2017-07-08 10:51:18.\n"
]
}
],
"source": [
"query = \"\"\"\n",
"WITH stars AS (\n",
" SELECT DISTINCT actor.login AS user, repo.name AS repo\n",
" FROM `githubarchive.month.2017*`\n",
" WHERE type=\"WatchEvent\"\n",
"),\n",
"repositories_stars AS (\n",
" SELECT repo, COUNT(*) as c FROM stars GROUP BY repo\n",
" ORDER BY c DESC\n",
" LIMIT 1000\n",
"),\n",
"users_stars AS (\n",
" SELECT user, COUNT(*) as c FROM stars\n",
" WHERE repo IN (SELECT repo FROM repositories_stars)\n",
" GROUP BY user\n",
" HAVING c >= 10 AND C <= 150\n",
" LIMIT 10000\n",
")\n",
"SELECT user, repo FROM stars\n",
"WHERE repo IN (SELECT repo FROM repositories_stars)\n",
"AND user IN (SELECT user FROM users_stars)\n",
"\"\"\"\n",
"\n",
"data = pd.io.gbq.read_gbq(query, dialect=\"standard\", project_id=project_id)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data['user'] = data['user'].astype(\"category\")\n",
"data['repo'] = data['repo'].astype(\"category\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.pipeline import Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class UtilityMatrixTransformer(BaseEstimator, TransformerMixin):\n",
" def __init__(self, confidence=40):\n",
" self.confidence = confidence\n",
"\n",
" def fit(self, X, y=None):\n",
" return self\n",
"\n",
" def transform(self, X, y=None):\n",
" return coo_matrix((np.ones(X.shape[0]),\n",
" (X['repo'].cat.codes.copy(),\n",
" X['user'].cat.codes.copy()))) * self.confidence"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class ALSEstimator(BaseEstimator, TransformerMixin):\n",
" def __init__(self, factors=50,\n",
" regularization=0.01,\n",
" iterations=10,\n",
" filter_seen=True):\n",
" self.factors = factors\n",
" self.regularization = regularization\n",
" self.iterations = iterations\n",
" self.filter_seen = filter_seen\n",
"\n",
" def fit(self, X, y=None):\n",
" self.model = AlternatingLeastSquares(factors=self.factors,\n",
" regularization=self.regularization,\n",
" iterations=self.iterations,\n",
" dtype=np.float32,\n",
" use_native=True)\n",
" self.model.fit(X)\n",
" if self.filter_seen:\n",
" self.fit_X = X\n",
" return self\n",
" \n",
" def predict(self, X, y=None):\n",
" predictions = np.dot(self.model.item_factors, self.model.user_factors.T)\n",
" if self.filter_seen:\n",
" predictions[self.fit_X.nonzero()] = -99\n",
" return predictions\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#https://gist.github.com/mblondel/7337391\n",
"\n",
"def dcg_score(y_true, y_score, k=10, gains=\"exponential\"):\n",
" \"\"\"Discounted cumulative gain (DCG) at rank k\n",
" Parameters\n",
" ----------\n",
" y_true : array-like, shape = [n_samples]\n",
" Ground truth (true relevance labels).\n",
" y_score : array-like, shape = [n_samples]\n",
" Predicted scores.\n",
" k : int\n",
" Rank.\n",
" gains : str\n",
" Whether gains should be \"exponential\" (default) or \"linear\".\n",
" Returns\n",
" -------\n",
" DCG @k : float\n",
" \"\"\"\n",
" order = np.argsort(y_score)[::-1]\n",
" y_true = np.take(y_true, order[:k])\n",
"\n",
" if gains == \"exponential\":\n",
" gains = 2 ** y_true - 1\n",
" elif gains == \"linear\":\n",
" gains = y_true\n",
" else:\n",
" raise ValueError(\"Invalid gains option.\")\n",
"\n",
" # highest rank is 1 so +2 instead of +1\n",
" discounts = np.log2(np.arange(len(y_true)) + 2)\n",
" return np.sum(gains / discounts)\n",
"\n",
"\n",
"def ndcg_score(y_true, y_score, k=10, gains=\"exponential\"):\n",
" \"\"\"Normalized discounted cumulative gain (NDCG) at rank k\n",
" Parameters\n",
" ----------\n",
" y_true : array-like, shape = [n_samples]\n",
" Ground truth (true relevance labels).\n",
" y_score : array-like, shape = [n_samples]\n",
" Predicted scores.\n",
" k : int\n",
" Rank.\n",
" gains : str\n",
" Whether gains should be \"exponential\" (default) or \"linear\".\n",
" Returns\n",
" -------\n",
" NDCG @k : float\n",
" \"\"\"\n",
" best = dcg_score(y_true, y_true, k, gains)\n",
" if best == 0:\n",
" return 0 \n",
" actual = dcg_score(y_true, y_score, k, gains)\n",
" return actual / best"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_col(Y, col):\n",
" return np.squeeze(np.asarray(Y[:,col]))\n",
"\n",
"def ndcg_score_matrix(Y_true, Y_score, k=10, gains=\"exponential\"):\n",
" score = 0.0\n",
" n_users = Y_true.shape[1]\n",
" for u in range(n_users):\n",
" s = ndcg_score(get_col(Y_true, u), get_col(Y_score, u))\n",
" score += s\n",
" return score / n_users"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import PredefinedSplit\n",
"\n",
"class LeavePOutByGroup():\n",
" def __init__(self, X, p=5, n_splits=2):\n",
" self.X = X\n",
" self.p = p\n",
" self.n_splits = n_splits\n",
" test_fold = self.X.groupby(\"user\").cumcount().apply(lambda x: int(x / p) if x < (n_splits * p) else -1)\n",
" self.s = PredefinedSplit(test_fold)\n",
"\n",
" def get_n_splits(self, X=None, y=None, groups=None):\n",
" return self.n_splits\n",
"\n",
" def split(self, X=None, y=None, groups=None):\n",
" return self.s.split()\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def ndcg_scorer(estimator, X_test):\n",
" truth = UtilityMatrixTransformer(confidence=1).fit_transform(X_test).todense()\n",
" predictions = estimator.predict(X_test)\n",
" return ndcg_score_matrix(truth, predictions, k=10)\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import cross_val_score"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import GridSearchCV"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"rec_pipeline = Pipeline([\n",
" ('matrix', UtilityMatrixTransformer()),\n",
" ('als', ALSEstimator()),\n",
"])\n",
"\n",
"param_grid = [\n",
" {\n",
" 'matrix__confidence': [1, 3, 10, 40, 100],\n",
" 'als__factors': [20, 50, 100],\n",
" 'als__regularization': [1e-2, 1e-3, 1e-4],\n",
" }\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 3 folds for each of 45 candidates, totalling 135 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 13.3min finished\n"
]
},
{
"data": {
"text/plain": [
"GridSearchCV(cv=<__main__.LeavePOutByGroup instance at 0x1211de440>,\n",
" error_score='raise',\n",
" estimator=Pipeline(memory=None,\n",
" steps=[('matrix', UtilityMatrixTransformer(confidence=40)), ('als', ALSEstimator(factors=50, filter_seen=True, iterations=10, regularization=0.01))]),\n",
" fit_params=None, iid=True, n_jobs=1,\n",
" param_grid=[{'matrix__confidence': [1, 3, 10, 40, 100], 'als__factors': [20, 50, 100], 'als__regularization': [0.01, 0.001, 0.0001]}],\n",
" pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
" scoring=<function ndcg_scorer at 0x120f70500>, verbose=1)"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shuffled_train_set = data.reindex(np.random.permutation(data.index)).sort_values(\"user\")\n",
"grid_search = GridSearchCV(rec_pipeline, param_grid,\n",
" cv=LeavePOutByGroup(shuffled_train_set, p=5, n_splits=3),\n",
" scoring=ndcg_scorer, verbose=1)\n",
"grid_search.fit(shuffled_train_set)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'als__factors': 20, 'als__regularization': 0.001, 'matrix__confidence': 3}"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"grid_search.best_params_"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.12182770483269723, {'matrix__confidence': 1, 'als__factors': 20, 'als__regularization': 0.01})\n",
"(0.13166824473805691, {'matrix__confidence': 3, 'als__factors': 20, 'als__regularization': 0.01})\n",
"(0.13282873335943512, {'matrix__confidence': 10, 'als__factors': 20, 'als__regularization': 0.01})\n",
"(0.11039252745416286, {'matrix__confidence': 40, 'als__factors': 20, 'als__regularization': 0.01})\n",
"(0.089016691660527295, {'matrix__confidence': 100, 'als__factors': 20, 'als__regularization': 0.01})\n",
"(0.12093506965903911, {'matrix__confidence': 1, 'als__factors': 20, 'als__regularization': 0.001})\n",
"(0.13349651687119168, {'matrix__confidence': 3, 'als__factors': 20, 'als__regularization': 0.001})\n",
"(0.13296881851844261, {'matrix__confidence': 10, 'als__factors': 20, 'als__regularization': 0.001})\n",
"(0.11066298985479994, {'matrix__confidence': 40, 'als__factors': 20, 'als__regularization': 0.001})\n",
"(0.090079804327817939, {'matrix__confidence': 100, 'als__factors': 20, 'als__regularization': 0.001})\n",
"(0.12115380609913633, {'matrix__confidence': 1, 'als__factors': 20, 'als__regularization': 0.0001})\n",
"(0.13149519337094234, {'matrix__confidence': 3, 'als__factors': 20, 'als__regularization': 0.0001})\n",
"(0.1324632559745306, {'matrix__confidence': 10, 'als__factors': 20, 'als__regularization': 0.0001})\n",
"(0.11146953791233, {'matrix__confidence': 40, 'als__factors': 20, 'als__regularization': 0.0001})\n",
"(0.087150593455471825, {'matrix__confidence': 100, 'als__factors': 20, 'als__regularization': 0.0001})\n",
"(0.11191469606402717, {'matrix__confidence': 1, 'als__factors': 50, 'als__regularization': 0.01})\n",
"(0.13091750356245202, {'matrix__confidence': 3, 'als__factors': 50, 'als__regularization': 0.01})\n",
"(0.12765971686755495, {'matrix__confidence': 10, 'als__factors': 50, 'als__regularization': 0.01})\n",
"(0.10417408070456254, {'matrix__confidence': 40, 'als__factors': 50, 'als__regularization': 0.01})\n",
"(0.08646553233734848, {'matrix__confidence': 100, 'als__factors': 50, 'als__regularization': 0.01})\n",
"(0.11191591881133119, {'matrix__confidence': 1, 'als__factors': 50, 'als__regularization': 0.001})\n",
"(0.13101055863881378, {'matrix__confidence': 3, 'als__factors': 50, 'als__regularization': 0.001})\n",
"(0.12704689426232149, {'matrix__confidence': 10, 'als__factors': 50, 'als__regularization': 0.001})\n",
"(0.10546083019755575, {'matrix__confidence': 40, 'als__factors': 50, 'als__regularization': 0.001})\n",
"(0.088077539716670858, {'matrix__confidence': 100, 'als__factors': 50, 'als__regularization': 0.001})\n",
"(0.11183762637497301, {'matrix__confidence': 1, 'als__factors': 50, 'als__regularization': 0.0001})\n",
"(0.13123515836596611, {'matrix__confidence': 3, 'als__factors': 50, 'als__regularization': 0.0001})\n",
"(0.12755216556143176, {'matrix__confidence': 10, 'als__factors': 50, 'als__regularization': 0.0001})\n",
"(0.10545558137554324, {'matrix__confidence': 40, 'als__factors': 50, 'als__regularization': 0.0001})\n",
"(0.085301802039959324, {'matrix__confidence': 100, 'als__factors': 50, 'als__regularization': 0.0001})\n",
"(0.10611469337423451, {'matrix__confidence': 1, 'als__factors': 100, 'als__regularization': 0.01})\n",
"(0.12196884692068968, {'matrix__confidence': 3, 'als__factors': 100, 'als__regularization': 0.01})\n",
"(0.1175596768388132, {'matrix__confidence': 10, 'als__factors': 100, 'als__regularization': 0.01})\n",
"(0.097768165103126289, {'matrix__confidence': 40, 'als__factors': 100, 'als__regularization': 0.01})\n",
"(0.083236186967734632, {'matrix__confidence': 100, 'als__factors': 100, 'als__regularization': 0.01})\n",
"(0.10592410193883216, {'matrix__confidence': 1, 'als__factors': 100, 'als__regularization': 0.001})\n",
"(0.12342558447935369, {'matrix__confidence': 3, 'als__factors': 100, 'als__regularization': 0.001})\n",
"(0.11774383257111316, {'matrix__confidence': 10, 'als__factors': 100, 'als__regularization': 0.001})\n",
"(0.10114449211940411, {'matrix__confidence': 40, 'als__factors': 100, 'als__regularization': 0.001})\n",
"(0.085345035644795259, {'matrix__confidence': 100, 'als__factors': 100, 'als__regularization': 0.001})\n",
"(0.10588644986046256, {'matrix__confidence': 1, 'als__factors': 100, 'als__regularization': 0.0001})\n",
"(0.12292610723641109, {'matrix__confidence': 3, 'als__factors': 100, 'als__regularization': 0.0001})\n",
"(0.11897882932070754, {'matrix__confidence': 10, 'als__factors': 100, 'als__regularization': 0.0001})\n",
"(0.10299449602711824, {'matrix__confidence': 40, 'als__factors': 100, 'als__regularization': 0.0001})\n",
"(0.086555398366424466, {'matrix__confidence': 100, 'als__factors': 100, 'als__regularization': 0.0001})\n"
]
}
],
"source": [
"cvres = grid_search.cv_results_\n",
"for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
" print(mean_score, params)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@stared commented Jun 25, 2017

This is wonderful, thank you @jbochi for sharing! Though, I would note that it needs:

pip install implicit
pip install --upgrade google-api-python-client

and creating a project in BigQuery whose name goes in project_id.

For the script, at least in Python 3 (aside from the iteritems -> items change), recalculate_user=True does not work and needs to be removed, and for some reason model.explain does not exist (are you using the latest version of implicit?).

@jbochi (Author) commented Jul 3, 2017

Hey @stared. Thanks for your comments. Please make sure you are using the latest version of implicit. It adds recalculate_user=True and .explain support.
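For reference, a minimal sketch (not part of the original notebook) of how those two features can be used, assuming the 2017-era implicit API, a fitted AlternatingLeastSquares model, and a repo x user interaction matrix stars like the one built in the notebook:

# recommend() and explain() expect a user x item matrix, i.e. the transpose of what fit() took.
user_items = stars.T.tocsr()

# recalculate_user=True re-derives the user's factors from their current row
# instead of reusing the ones learned during fit.
recommendations = model.recommend(userid=0, user_items=user_items, recalculate_user=True)

# explain() breaks one recommendation's score into per-item contributions.
top_repo = recommendations[0][0]
score, contributions, _ = model.explain(userid=0, user_items=user_items, itemid=top_repo)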

@juanbits

Hi @jbochi, I tried to implement this with another dataset (user/items (fruits)/ratings), but using only user/items. I got some results, but I don't know if they are correct. I used this dataset: https://github.com/juanremi/datasets-to-CF-recsys/blob/master/bigquery-frutas-sin-items-no-vistos.csv
And the code is here: https://github.com/juanremi/datasets-to-CF-recsys/blob/master/bigquery/bigquery2.py

Can you give me some ideas? Thanks!

@jbochi (Author) commented Aug 22, 2017

Hi @juanremi. Sorry for the late reply. Your code looks fine, but 50 factors for a dataset so small is probably way too much. You should do some validation to make sure the results make sense. You can, for instance, compute the mean squared error for ratings in the validation set.
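For illustration, a minimal sketch of that kind of check, assuming a fitted implicit AlternatingLeastSquares model and a hypothetical validation DataFrame val with integer item/user index columns and a rating column (the names are illustrative, not from the gist):

import numpy as np

def validation_mse(model, val):
    # Predicted rating = dot product of the item's and the user's latent factors.
    preds = np.sum(model.item_factors[val["item"].values] *
                   model.user_factors[val["user"].values], axis=1)
    return float(np.mean((val["rating"].values - preds) ** 2))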

@ptiwaree

Hi jbochi, in your recommendations.ipynb, could you please explain this part?

confidence = 40
model.fit(confidence * stars)

I am not sure why we are setting confidence to 40, or what confidence even means. Based on this, it looks like you are filling in the matrix with values of 40 for the non-zero entries. Is that true? Can we instead fill it with the actual number of stars (a number between 10 and 150)?

@jbochi (Author) commented Feb 9, 2018

@ptiwaree stars is a sparse matrix with 0s and 1s. By multiplying by confidence, we are just giving positive examples a fixed high weight. The best value should be determined by cross-validation. You can also try this heuristic: benfred/implicit#74 (comment)
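A minimal standalone sketch of that weighting (toy data, not the notebook's), assuming the 2017-era implicit API where fit() takes an item x user matrix:

import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares

# Three observed stars: rows are repo indices, columns are user indices.
stars = coo_matrix((np.ones(3), ([0, 1, 2], [0, 0, 1])))

confidence = 40
model = AlternatingLeastSquares(factors=2)
model.fit(confidence * stars)  # zeros stay zero; every observed entry now carries weight 40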

@antonioalegria commented Jun 14, 2018

Hi @jbochi, I'm trying to use this code to evaluate on another dataset, but I'm getting a bunch of index out of bounds errors because the factors in the train data are not the same as in the test data. This is probably because the users/items seen in train are different from the ones seen in test.

How would you adapt this code to handle this challenge? Or is my theory incorrect? /cc @antonioalegria

@joddm commented Nov 2, 2018

I also have problems with index out of bounds errors. Did you figure it out, @antonioalegria?

Do you know which versions of the libraries you ran this with, @jbochi? I am currently running with

pandas: 0.23.4
numpy: 1.15.2
scipy: 1.1.0
implicit: 0.3.8
sklearn: 0.20.0

My original dataset is of shape:

<20210x4324 sparse matrix of type '<class 'numpy.float64'>'
	with 116992 stored elements in COOrdinate format>

and the truth variable in ndcg_scorer transforms the test split to shape (20206, 4324), while the predictions variable is of shape (20210, 4310).

So this is what's causing the index out of bounds error.

Edit: By changing the p variable, I managed to correct the predictions shape, but I don't understand why the truth variable is of shape (20206, 4324). My guess is the same as yours, @antonioalegria: in LeavePOutByGroup, one of the splits contains users that haven't purchased some of the products, hence the full dimensions are not restored in truth.

Okay, by filtering out purchases with fewer customers than x (trying out different values), I managed to get a correct truth shape, but now the predictions shape is off. Aah... :) Do you know of any heuristic, @jbochi?

@seb799 commented Jan 17, 2019

@joddm @antonioalegria
From my understanding, p in LeavePOutByGroup() should be <= (minimum number of items per user) / 2.
For example, if your dataset has a user with activity for only 4 items, p should be <= 2.

Either you rebuild your dataset to include only users with activity for more products, or you filter out users with fewer than 2*p products from the test sets.
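A minimal sketch of that second option, assuming data is the user/repo DataFrame from the notebook and p is the leave-p-out parameter:

p = 5
interactions_per_user = data.groupby("user")["repo"].transform("count")
data_filtered = data[interactions_per_user >= 2 * p]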

Hope that makes sense

It resolved the index out of bounds error on my end.


@DaStapo commented Aug 23, 2020

If my dataset has mostly just 2 items per user, I assume LeavePOutByGroup is not the way to go? Because, if I understand correctly, that would mean each split would have mostly 1 item per user and therefore the model has nothing to learn.

@kylemcmearty

@jbochi what is the license on this gist?
