@nwut
Forked from jbochi/evaluation.ipynb
Created October 2, 2017 08:37
Recommending GitHub repositories with Google BigQuery and the implicit library: https://medium.com/@jbochi/recommending-github-repositories-with-google-bigquery-and-the-implicit-library-e6cce666c77
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from scipy.sparse import coo_matrix\n",
"from implicit.als import AlternatingLeastSquares\n",
"from implicit.utils import nonzeros"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"project_id = \"CHANGEME\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requesting query... ok.\n",
"Query running...\n",
" Elapsed 11.25 s. Waiting...\n",
"Query done.\n",
"Processed: 9.4 GB\n",
"\n",
"Retrieving results...\n",
" Got page: 1; 46.0% done. Elapsed 24.05 s.\n",
" Got page: 2; 92.0% done. Elapsed 41.02 s.\n",
" Got page: 3; 100.0% done. Elapsed 49.03 s.\n",
"Got 217819 rows.\n",
"\n",
"Total time taken 50.26 s.\n",
"Finished at 2017-07-08 10:51:18.\n"
]
}
],
"source": [
"query = \"\"\"\n",
"WITH stars AS (\n",
" SELECT DISTINCT actor.login AS user, repo.name AS repo\n",
" FROM `githubarchive.month.2017*`\n",
" WHERE type=\"WatchEvent\"\n",
"),\n",
"repositories_stars AS (\n",
" SELECT repo, COUNT(*) as c FROM stars GROUP BY repo\n",
" ORDER BY c DESC\n",
" LIMIT 1000\n",
"),\n",
"users_stars AS (\n",
" SELECT user, COUNT(*) as c FROM stars\n",
" WHERE repo IN (SELECT repo FROM repositories_stars)\n",
" GROUP BY user\n",
" HAVING c >= 10 AND C <= 150\n",
" LIMIT 10000\n",
")\n",
"SELECT user, repo FROM stars\n",
"WHERE repo IN (SELECT repo FROM repositories_stars)\n",
"AND user IN (SELECT user FROM users_stars)\n",
"\"\"\"\n",
"\n",
"data = pd.io.gbq.read_gbq(query, dialect=\"standard\", project_id=project_id)"
]
},
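{
"cell_type": "markdown",
"metadata": {},
"source": [
"Added for illustration (not in the original notebook): a quick sanity check that the query returned the expected shape, with each user holding between 10 and 150 stars on the top repositories."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added for illustration: basic sanity checks on the query result.\n",
"print(data.shape)\n",
"print(\"%d users, %d repos\" % (data['user'].nunique(), data['repo'].nunique()))\n",
"data.head()"
]
},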
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data['user'] = data['user'].astype(\"category\")\n",
"data['repo'] = data['repo'].astype(\"category\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.pipeline import Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class UtilityMatrixTransformer(BaseEstimator, TransformerMixin):\n",
" def __init__(self, confidence=40):\n",
" self.confidence = confidence\n",
"\n",
" def fit(self, X, y=None):\n",
" return self\n",
"\n",
" def transform(self, X, y=None):\n",
" return coo_matrix((np.ones(X.shape[0]),\n",
" (X['repo'].cat.codes.copy(),\n",
" X['user'].cat.codes.copy()))) * self.confidence"
]
},
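{
"cell_type": "markdown",
"metadata": {},
"source": [
"The transformer produces an item-user matrix: repo category codes index the rows, user codes the columns, and every star contributes a constant confidence weight. A minimal check of what it produces (added for illustration):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added for illustration: the utility matrix is repos x users.\n",
"utility = UtilityMatrixTransformer(confidence=40).fit_transform(data)\n",
"print(utility.shape)  # (n_repos, n_users)\n",
"print(utility.sum() / 40)  # number of (user, repo) star pairs"
]
},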
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class ALSEstimator(BaseEstimator, TransformerMixin):\n",
" def __init__(self, factors=50,\n",
" regularization=0.01,\n",
" iterations=10,\n",
" filter_seen=True):\n",
" self.factors = factors\n",
" self.regularization = regularization\n",
" self.iterations = iterations\n",
" self.filter_seen = filter_seen\n",
"\n",
" def fit(self, X, y=None):\n",
" self.model = AlternatingLeastSquares(factors=self.factors,\n",
" regularization=self.regularization,\n",
" iterations=self.iterations,\n",
" dtype=np.float32,\n",
" use_native=True)\n",
" self.model.fit(X)\n",
" if self.filter_seen:\n",
" self.fit_X = X\n",
" return self\n",
" \n",
" def predict(self, X, y=None):\n",
" predictions = np.dot(self.model.item_factors, self.model.user_factors.T)\n",
" if self.filter_seen:\n",
" predictions[self.fit_X.nonzero()] = -99\n",
" return predictions\n"
]
},
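{
"cell_type": "markdown",
"metadata": {},
"source": [
"`fit` learns one latent vector per repo and one per user, so `predict` can reconstruct the full score matrix as their product. A quick check of the factor shapes (added for illustration; reuses `utility` from the sanity check above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added for illustration: factor shapes after fitting.\n",
"# item_factors: (n_repos, factors); user_factors: (n_users, factors);\n",
"# their product is the (n_repos, n_users) score matrix.\n",
"als = ALSEstimator(factors=50, iterations=10).fit(utility)\n",
"print(als.model.item_factors.shape)\n",
"print(als.model.user_factors.shape)"
]
},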
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#https://gist.github.com/mblondel/7337391\n",
"\n",
"def dcg_score(y_true, y_score, k=10, gains=\"exponential\"):\n",
" \"\"\"Discounted cumulative gain (DCG) at rank k\n",
" Parameters\n",
" ----------\n",
" y_true : array-like, shape = [n_samples]\n",
" Ground truth (true relevance labels).\n",
" y_score : array-like, shape = [n_samples]\n",
" Predicted scores.\n",
" k : int\n",
" Rank.\n",
" gains : str\n",
" Whether gains should be \"exponential\" (default) or \"linear\".\n",
" Returns\n",
" -------\n",
" DCG @k : float\n",
" \"\"\"\n",
" order = np.argsort(y_score)[::-1]\n",
" y_true = np.take(y_true, order[:k])\n",
"\n",
" if gains == \"exponential\":\n",
" gains = 2 ** y_true - 1\n",
" elif gains == \"linear\":\n",
" gains = y_true\n",
" else:\n",
" raise ValueError(\"Invalid gains option.\")\n",
"\n",
" # highest rank is 1 so +2 instead of +1\n",
" discounts = np.log2(np.arange(len(y_true)) + 2)\n",
" return np.sum(gains / discounts)\n",
"\n",
"\n",
"def ndcg_score(y_true, y_score, k=10, gains=\"exponential\"):\n",
" \"\"\"Normalized discounted cumulative gain (NDCG) at rank k\n",
" Parameters\n",
" ----------\n",
" y_true : array-like, shape = [n_samples]\n",
" Ground truth (true relevance labels).\n",
" y_score : array-like, shape = [n_samples]\n",
" Predicted scores.\n",
" k : int\n",
" Rank.\n",
" gains : str\n",
" Whether gains should be \"exponential\" (default) or \"linear\".\n",
" Returns\n",
" -------\n",
" NDCG @k : float\n",
" \"\"\"\n",
" best = dcg_score(y_true, y_true, k, gains)\n",
" if best == 0:\n",
" return 0 \n",
" actual = dcg_score(y_true, y_score, k, gains)\n",
" return actual / best"
]
},
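{
"cell_type": "markdown",
"metadata": {},
"source": [
"A toy example of the metric (added for illustration): ranking the single relevant item first gives NDCG 1.0; ranking it last discounts the gain by log2(rank + 1)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added for illustration: NDCG for a perfect and a reversed ranking.\n",
"y_true = np.array([1, 0, 0, 0, 0])\n",
"perfect = np.array([0.9, 0.4, 0.3, 0.2, 0.1])\n",
"reversed_ = np.array([0.1, 0.2, 0.3, 0.4, 0.9])\n",
"print(ndcg_score(y_true, perfect, k=5))    # 1.0\n",
"print(ndcg_score(y_true, reversed_, k=5))  # ~0.39: gain discounted by log2(6)"
]
},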
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_col(Y, col):\n",
" return np.squeeze(np.asarray(Y[:,col]))\n",
"\n",
"def ndcg_score_matrix(Y_true, Y_score, k=10, gains=\"exponential\"):\n",
" score = 0.0\n",
" n_users = Y_true.shape[1]\n",
" for u in range(n_users):\n",
" s = ndcg_score(get_col(Y_true, u), get_col(Y_score, u))\n",
" score += s\n",
" return score / n_users"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import PredefinedSplit\n",
"\n",
"class LeavePOutByGroup():\n",
" def __init__(self, X, p=5, n_splits=2):\n",
" self.X = X\n",
" self.p = p\n",
" self.n_splits = n_splits\n",
" test_fold = self.X.groupby(\"user\").cumcount().apply(lambda x: int(x / p) if x < (n_splits * p) else -1)\n",
" self.s = PredefinedSplit(test_fold)\n",
"\n",
" def get_n_splits(self, X=None, y=None, groups=None):\n",
" return self.n_splits\n",
"\n",
" def split(self, X=None, y=None, groups=None):\n",
" return self.s.split()\n"
]
},
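{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each user contributes `p` rows to every test fold; the query guarantees at least 10 stars per user, so with `p=5` two folds are always complete. A quick check of the fold sizes (added for illustration):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added for illustration: every test fold holds p rows per user.\n",
"cv = LeavePOutByGroup(data.sort_values(\"user\"), p=5, n_splits=2)\n",
"for train_idx, test_idx in cv.split():\n",
"    print(\"train: %d rows, test: %d rows\" % (len(train_idx), len(test_idx)))"
]
},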
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def ndcg_scorer(estimator, X_test):\n",
" truth = UtilityMatrixTransformer(confidence=1).fit_transform(X_test).todense()\n",
" predictions = estimator.predict(X_test)\n",
" return ndcg_score_matrix(truth, predictions, k=10)\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import cross_val_score"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import GridSearchCV"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"rec_pipeline = Pipeline([\n",
" ('matrix', UtilityMatrixTransformer()),\n",
" ('als', ALSEstimator()),\n",
"])\n",
"\n",
"param_grid = [\n",
" {\n",
" 'matrix__confidence': [1, 3, 10, 40, 100],\n",
" 'als__factors': [20, 50, 100],\n",
" 'als__regularization': [1e-2, 1e-3, 1e-4],\n",
" }\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 3 folds for each of 45 candidates, totalling 135 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 13.3min finished\n"
]
},
{
"data": {
"text/plain": [
"GridSearchCV(cv=<__main__.LeavePOutByGroup instance at 0x1211de440>,\n",
" error_score='raise',\n",
" estimator=Pipeline(memory=None,\n",
" steps=[('matrix', UtilityMatrixTransformer(confidence=40)), ('als', ALSEstimator(factors=50, filter_seen=True, iterations=10, regularization=0.01))]),\n",
" fit_params=None, iid=True, n_jobs=1,\n",
" param_grid=[{'matrix__confidence': [1, 3, 10, 40, 100], 'als__factors': [20, 50, 100], 'als__regularization': [0.01, 0.001, 0.0001]}],\n",
" pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
" scoring=<function ndcg_scorer at 0x120f70500>, verbose=1)"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shuffled_train_set = data.reindex(np.random.permutation(data.index)).sort_values(\"user\")\n",
"grid_search = GridSearchCV(rec_pipeline, param_grid,\n",
" cv=LeavePOutByGroup(shuffled_train_set, p=5, n_splits=3),\n",
" scoring=ndcg_scorer, verbose=1)\n",
"grid_search.fit(shuffled_train_set)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'als__factors': 20, 'als__regularization': 0.001, 'matrix__confidence': 3}"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"grid_search.best_params_"
]
},
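{
"cell_type": "markdown",
"metadata": {},
"source": [
"With `refit=True` (the GridSearchCV default), `best_estimator_` is already refit on the full training set, so recommendations can be read straight off the predicted item-user matrix. A minimal sketch (added for illustration; `recommend_for` is a hypothetical helper, not part of the original notebook):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added for illustration: top-k repos for one user, mapping matrix\n",
"# indices back to names through the pandas categoricals.\n",
"def recommend_for(user_login, k=10):\n",
"    predictions = grid_search.best_estimator_.predict(shuffled_train_set)\n",
"    user_code = shuffled_train_set['user'].cat.categories.get_loc(user_login)\n",
"    top_items = np.argsort(predictions[:, user_code])[::-1][:k]\n",
"    return shuffled_train_set['repo'].cat.categories[top_items]\n",
"\n",
"# recommend_for(\"some_user\")  # returns an Index of up to k repo names"
]
},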
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.12182770483269723, {'matrix__confidence': 1, 'als__factors': 20, 'als__regularization': 0.01})\n",
"(0.13166824473805691, {'matrix__confidence': 3, 'als__factors': 20, 'als__regularization': 0.01})\n",
"(0.13282873335943512, {'matrix__confidence': 10, 'als__factors': 20, 'als__regularization': 0.01})\n",
"(0.11039252745416286, {'matrix__confidence': 40, 'als__factors': 20, 'als__regularization': 0.01})\n",
"(0.089016691660527295, {'matrix__confidence': 100, 'als__factors': 20, 'als__regularization': 0.01})\n",
"(0.12093506965903911, {'matrix__confidence': 1, 'als__factors': 20, 'als__regularization': 0.001})\n",
"(0.13349651687119168, {'matrix__confidence': 3, 'als__factors': 20, 'als__regularization': 0.001})\n",
"(0.13296881851844261, {'matrix__confidence': 10, 'als__factors': 20, 'als__regularization': 0.001})\n",
"(0.11066298985479994, {'matrix__confidence': 40, 'als__factors': 20, 'als__regularization': 0.001})\n",
"(0.090079804327817939, {'matrix__confidence': 100, 'als__factors': 20, 'als__regularization': 0.001})\n",
"(0.12115380609913633, {'matrix__confidence': 1, 'als__factors': 20, 'als__regularization': 0.0001})\n",
"(0.13149519337094234, {'matrix__confidence': 3, 'als__factors': 20, 'als__regularization': 0.0001})\n",
"(0.1324632559745306, {'matrix__confidence': 10, 'als__factors': 20, 'als__regularization': 0.0001})\n",
"(0.11146953791233, {'matrix__confidence': 40, 'als__factors': 20, 'als__regularization': 0.0001})\n",
"(0.087150593455471825, {'matrix__confidence': 100, 'als__factors': 20, 'als__regularization': 0.0001})\n",
"(0.11191469606402717, {'matrix__confidence': 1, 'als__factors': 50, 'als__regularization': 0.01})\n",
"(0.13091750356245202, {'matrix__confidence': 3, 'als__factors': 50, 'als__regularization': 0.01})\n",
"(0.12765971686755495, {'matrix__confidence': 10, 'als__factors': 50, 'als__regularization': 0.01})\n",
"(0.10417408070456254, {'matrix__confidence': 40, 'als__factors': 50, 'als__regularization': 0.01})\n",
"(0.08646553233734848, {'matrix__confidence': 100, 'als__factors': 50, 'als__regularization': 0.01})\n",
"(0.11191591881133119, {'matrix__confidence': 1, 'als__factors': 50, 'als__regularization': 0.001})\n",
"(0.13101055863881378, {'matrix__confidence': 3, 'als__factors': 50, 'als__regularization': 0.001})\n",
"(0.12704689426232149, {'matrix__confidence': 10, 'als__factors': 50, 'als__regularization': 0.001})\n",
"(0.10546083019755575, {'matrix__confidence': 40, 'als__factors': 50, 'als__regularization': 0.001})\n",
"(0.088077539716670858, {'matrix__confidence': 100, 'als__factors': 50, 'als__regularization': 0.001})\n",
"(0.11183762637497301, {'matrix__confidence': 1, 'als__factors': 50, 'als__regularization': 0.0001})\n",
"(0.13123515836596611, {'matrix__confidence': 3, 'als__factors': 50, 'als__regularization': 0.0001})\n",
"(0.12755216556143176, {'matrix__confidence': 10, 'als__factors': 50, 'als__regularization': 0.0001})\n",
"(0.10545558137554324, {'matrix__confidence': 40, 'als__factors': 50, 'als__regularization': 0.0001})\n",
"(0.085301802039959324, {'matrix__confidence': 100, 'als__factors': 50, 'als__regularization': 0.0001})\n",
"(0.10611469337423451, {'matrix__confidence': 1, 'als__factors': 100, 'als__regularization': 0.01})\n",
"(0.12196884692068968, {'matrix__confidence': 3, 'als__factors': 100, 'als__regularization': 0.01})\n",
"(0.1175596768388132, {'matrix__confidence': 10, 'als__factors': 100, 'als__regularization': 0.01})\n",
"(0.097768165103126289, {'matrix__confidence': 40, 'als__factors': 100, 'als__regularization': 0.01})\n",
"(0.083236186967734632, {'matrix__confidence': 100, 'als__factors': 100, 'als__regularization': 0.01})\n",
"(0.10592410193883216, {'matrix__confidence': 1, 'als__factors': 100, 'als__regularization': 0.001})\n",
"(0.12342558447935369, {'matrix__confidence': 3, 'als__factors': 100, 'als__regularization': 0.001})\n",
"(0.11774383257111316, {'matrix__confidence': 10, 'als__factors': 100, 'als__regularization': 0.001})\n",
"(0.10114449211940411, {'matrix__confidence': 40, 'als__factors': 100, 'als__regularization': 0.001})\n",
"(0.085345035644795259, {'matrix__confidence': 100, 'als__factors': 100, 'als__regularization': 0.001})\n",
"(0.10588644986046256, {'matrix__confidence': 1, 'als__factors': 100, 'als__regularization': 0.0001})\n",
"(0.12292610723641109, {'matrix__confidence': 3, 'als__factors': 100, 'als__regularization': 0.0001})\n",
"(0.11897882932070754, {'matrix__confidence': 10, 'als__factors': 100, 'als__regularization': 0.0001})\n",
"(0.10299449602711824, {'matrix__confidence': 40, 'als__factors': 100, 'als__regularization': 0.0001})\n",
"(0.086555398366424466, {'matrix__confidence': 100, 'als__factors': 100, 'als__regularization': 0.0001})\n"
]
}
],
"source": [
"cvres = grid_search.cv_results_\n",
"for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
" print(mean_score, params)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}