""" | |
Implementation of pairwise ranking using scikit-learn LinearSVC | |
Reference: "Large Margin Rank Boundaries for Ordinal Regression", R. Herbrich, | |
T. Graepel, K. Obermayer. | |
Authors: Fabian Pedregosa <fabian@fseoane.net> | |
Alexandre Gramfort <alexandre.gramfort@inria.fr> | |
""" | |
import itertools | |
import numpy as np | |
from sklearn import svm, linear_model, cross_validation | |
def transform_pairwise(X, y): | |
"""Transforms data into pairs with balanced labels for ranking | |
Transforms a n-class ranking problem into a two-class classification | |
problem. Subclasses implementing particular strategies for choosing | |
pairs should override this method. | |
In this method, all pairs are choosen, except for those that have the | |
same target value. The output is an array of balanced classes, i.e. | |
there are the same number of -1 as +1 | |
Parameters | |
---------- | |
X : array, shape (n_samples, n_features) | |
The data | |
y : array, shape (n_samples,) or (n_samples, 2) | |
Target labels. If it's a 2D array, the second column represents | |
the grouping of samples, i.e., samples with different groups will | |
not be considered. | |
Returns | |
------- | |
X_trans : array, shape (k, n_feaures) | |
Data as pairs | |
y_trans : array, shape (k,) | |
Output class labels, where classes have values {-1, +1} | |
""" | |
X_new = [] | |
y_new = [] | |
y = np.asarray(y) | |
if y.ndim == 1: | |
y = np.c_[y, np.ones(y.shape[0])] | |
comb = itertools.combinations(range(X.shape[0]), 2) | |
for k, (i, j) in enumerate(comb): | |
if y[i, 0] == y[j, 0] or y[i, 1] != y[j, 1]: | |
# skip if same target or different group | |
continue | |
X_new.append(X[i] - X[j]) | |
y_new.append(np.sign(y[i, 0] - y[j, 0])) | |
# output balanced classes | |
if y_new[-1] != (-1) ** k: | |
y_new[-1] = - y_new[-1] | |
X_new[-1] = - X_new[-1] | |
return np.asarray(X_new), np.asarray(y_new).ravel() | |
class RankSVM(svm.LinearSVC): | |
"""Performs pairwise ranking with an underlying LinearSVC model | |
Input should be a n-class ranking problem, this object will convert it | |
into a two-class classification problem, a setting known as | |
`pairwise ranking`. | |
See object :ref:`svm.LinearSVC` for a full description of parameters. | |
""" | |
def fit(self, X, y): | |
""" | |
Fit a pairwise ranking model. | |
Parameters | |
---------- | |
X : array, shape (n_samples, n_features) | |
y : array, shape (n_samples,) or (n_samples, 2) | |
Returns | |
------- | |
self | |
""" | |
X_trans, y_trans = transform_pairwise(X, y) | |
super(RankSVM, self).fit(X_trans, y_trans) | |
return self | |
def predict(self, X): | |
""" | |
Predict an ordering on X. For a list of n samples, this method | |
returns a list from 0 to n-1 with the relative order of the rows of X. | |
Parameters | |
---------- | |
X : array, shape (n_samples, n_features) | |
Returns | |
------- | |
ord : array, shape (n_samples,) | |
Returns a list of integers representing the relative order of | |
the rows in X. | |
""" | |
if hasattr(self, 'coef_'): | |
np.argsort(np.dot(X, self.coef_.T)) | |
else: | |
raise ValueError("Must call fit() prior to predict()") | |
def score(self, X, y): | |
""" | |
Because we transformed into a pairwise problem, chance level is at 0.5 | |
""" | |
X_trans, y_trans = transform_pairwise(X, y) | |
return np.mean(super(RankSVM, self).predict(X_trans) == y_trans) | |
if __name__ == '__main__': | |
# as showcase, we will create some non-linear data | |
# and print the performance of ranking vs linear regression | |
np.random.seed(1) | |
n_samples, n_features = 300, 5 | |
true_coef = np.random.randn(n_features) | |
X = np.random.randn(n_samples, n_features) | |
noise = np.random.randn(n_samples) / np.linalg.norm(true_coef) | |
y = np.dot(X, true_coef) | |
y = np.arctan(y) # add non-linearities | |
y += .1 * noise # add noise | |
Y = np.c_[y, np.mod(np.arange(n_samples), 5)] # add query fake id | |
cv = cross_validation.KFold(n_samples, 5) | |
train, test = iter(cv).next() | |
# make a simple plot out of it | |
import pylab as pl | |
pl.scatter(np.dot(X, true_coef), y) | |
pl.title('Data to be learned') | |
pl.xlabel('<X, coef>') | |
pl.ylabel('y') | |
pl.show() | |
# print the performance of ranking | |
rank_svm = RankSVM().fit(X[train], Y[train]) | |
print 'Performance of ranking ', rank_svm.score(X[test], Y[test]) | |
# and that of linear regression | |
ridge = linear_model.RidgeCV(fit_intercept=True) | |
ridge.fit(X[train], y[train]) | |
X_test_trans, y_test_trans = transform_pairwise(X[test], y[test]) | |
score = np.mean(np.sign(np.dot(X_test_trans, ridge.coef_)) == y_test_trans) | |
print 'Performance of linear regression ', score |
This comment has been minimized.
This comment has been minimized.
Thank you for this code! |
This comment has been minimized.
This comment has been minimized.
@suzanv I was wondering about this as well. If you think about this being used in production/an application, the predict() method returns the indices of the data that's passed to it in ordinal fashion, and therefore (I think) you can only submit observations that are in the same group. In other words, although the model can be fit using data from multiple groups (multiple query_id's), when using predict() in production, you should only submit data to it that's of the same group (...the same query_id). @suzanv (and @agramfort) do you have the same understanding as me? If so, then, since all observations should have a unique ordinal position, there won't be any case where there is more than one observation that has the same target_value (when using predict() in production), and so in response to @suzanv comment, there should be a parameter added to fit() that allows you to create pairs for all observations, and predict() can use this paramter. Curious to know what you all think. |
This comment has been minimized.
This comment has been minimized.
@brendanf22 Thank you for your reply! Yes, I agree that the test data should also be grouped (e.g. per query id). Another recommendation is to add a function that outputs a ranking for the test data based on the predicted pairwise preferences (e.g. based on greedy ordering http://arxiv.org/pdf/1105.5464.pdf ), because many evaluation measures use the resulting ranking (e.g. precision@n, MAP, nDCG), and not so much the accuracy of pairwise predictions (-1 or 1) |
This comment has been minimized.
This comment has been minimized.
Sorry, I have another question/comment: For correct cross validation in a real ranking scenario, you should divide the groups (queries) in partitions, not the individual items (documents). You train on one set of queries, and evaluate on another set of queries. Instead of having a subset of the documents for a given query in each partition. |
This comment has been minimized.
This comment has been minimized.
Hi, can you tell an example of the dataset that should be used here |
This comment has been minimized.
This comment has been minimized.
@naufalkhalid http://fa.bianp.net/blog/2012/learning-to-rank-with-scikit-learn-the-pairwise-transform/ you can refer to the blog post here. |
This comment has been minimized.
Hey, just found a bug.
np.argsort(np.dot(X, self.coef_.T))
should bereturn np.argsort(np.dot(X, self.coef_.T).ravel())