Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import operator
# Numerai baseline: encode the categorical column, train a gradient-boosted
# binary classifier with XGBoost, and write tournament predictions to sub4.csv.
#
# NOTE(review): `train_test_split` is imported at the top of the file from
# `sklearn.cross_validation`; newer scikit-learn releases expose it from
# `sklearn.model_selection` instead -- confirm against the installed version.

# Columns common to both CSVs: fourteen float features and one string-valued
# categorical column.
feature_types = {'f{}'.format(i): np.dtype(float) for i in range(1, 15)}
feature_types['c1'] = np.dtype(str)

types = dict(feature_types, validation=np.dtype(int), target=np.dtype(int))
train_valid = pd.read_csv("numerai_training_data.csv", dtype=types)

# Integer-encode c1 in order of first appearance in the training file.
# Assigning the mapped column back explicitly (instead of an in-place replace
# on an attribute-accessed column) guarantees the frame is actually updated
# and that the column ends up numeric.
mappings = {c: i for i, c in enumerate(train_valid['c1'].unique())}
train_valid['c1'] = train_valid['c1'].map(mappings)

# Split according to the file's own validation flag. Not used for the fit
# below; see the note next to the random split.
train = train_valid[train_valid['validation'] == 0]
valid = train_valid[train_valid['validation'] == 1]

params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "eta": 0.1,
    "max_depth": 30,
    "subsample": 0.9,
    "colsample_bytree": 0.7,
    "silent": 1,
    "seed": 0,
    'eval_metric': 'auc',
}
num_boost_round = 500
features = ['f{}'.format(i) for i in range(1, 15)] + ['c1']

# Random 80/20 split over all training rows.
# NOTE: the original author recorded eval-auc:0.513770 when using
# `X_train, X_valid = train, valid` instead of this random split.
X_train, X_valid = train_test_split(train_valid, test_size=0.20, random_state=10)

# The target is a 0/1 class label and the objective is binary:logistic, so the
# labels are used as-is. (A log1p transform belongs to regression targets; on
# binary labels it only turns the 1s into ~0.693 and skews the fitted
# probabilities.)
y_train = X_train.target
y_valid = X_valid.target

dtrain = xgb.DMatrix(X_train[features], label=y_train)
dvalid = xgb.DMatrix(X_valid[features], label=y_valid)

# The last watchlist entry ('eval') is the one early stopping monitors.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist,
                early_stopping_rounds=100, verbose_eval=True)

print("Validating")
# Held-out predictions, kept for inspection; nothing below consumes them.
yhat = gbm.predict(xgb.DMatrix(X_valid[features]))

testtypes = dict(feature_types, t_id=np.dtype(str))
test = pd.read_csv("numerai_tournament_data.csv", dtype=testtypes)

# Reuse the training encoding. A category that never appeared in training maps
# to NaN, which XGBoost treats as a missing value, rather than remaining a
# string that DMatrix cannot ingest.
test['c1'] = test['c1'].map(mappings)
dtest = xgb.DMatrix(test[features])
test_probs = gbm.predict(dtest)

# Make Submission
# The model output is already a probability in [0, 1]; write it untransformed.
result = pd.DataFrame({"t_id": test["t_id"], 'probability': test_probs})
result = result[['t_id', 'probability']]
result.to_csv("sub4.csv", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment