Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Sklearn Yahoo LTRC 2010 Benchmark script
import numpy as np
import svmlight_loader
from sklearn.ensemble import GradientBoostingRegressor
from time import time
ROOT_DIR = '/home/pprett/corpora/yahoo-ltrc-2010/data'
X_train, y_train = svmlight_loader.load_svmlight_file(ROOT_DIR + '/set1.train.txt',
n_features=700,
dtype=np.float32,
buffer_mb=500)
X_train = np.asfortranarray(X_train.toarray())
X_test, y_test = svmlight_loader.load_svmlight_file(ROOT_DIR + '/set1.test.txt',
n_features=700,
dtype=np.float32,
buffer_mb=500)
X_test = np.asfortranarray(X_test.toarray())
clf = GradientBoostingRegressor(n_estimators=200,
max_depth=3,
min_split=15,
learn_rate=0.05)
t0 = time()
def monitor(model, i):
print("Iteration %d\t%ds" % (i, time() - t0))
clf.fit(X_train, y_train, monitor=monitor)
y_pred = clf.predict(X_test)
np.save(ROOT_DIR + '/y_pred.npy', y_pred)
################################################################################
# get labels and qids from test set
qids = !cut -f -2 -d' ' data/set1.test.txt
qids = [map(int, q.split(' qid:')) for q in qids]
# group labels and scores by qid
it = groupby(izip(qids, y_pred), lambda x: x[0][1])
data = [[(l,s) for (l,_), s in subit] for qid, subit in it]
labels = [[l for l,s in d] for d in data]
scores = [[s for l,s in d] for d in data]
ranks = []
for s in scores:
ranks.append(np.zeros((len(s),), dtype=np.int))
ranks[-1][np.argsort(s)[::-1]] = np.arange(1, len(s) + 1)
# use ``evaluate.evaluate_submission(labels,ranks,k=10)`` to get NDCG and MRR scores
# you can find ``evaluate.py`` on the Yahoo LTCR 2010 website.
# scores (0.45268919338794184, 0.77109134293735371)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment