import pandas as pd | |
import numpy as np | |
from sklearn.cross_validation import train_test_split | |
import xgboost as xgb | |
import operator | |
# Column dtypes for the training CSV: fourteen float features f1..f14, the
# string-valued categorical column c1, and the integer validation/target
# columns.
types = {'f{}'.format(i): np.dtype(float) for i in range(1, 15)}
types.update({'c1': np.dtype(str), 'validation': np.dtype(int),
              'target': np.dtype(int)})
train_valid = pd.read_csv("numerai_training_data.csv", dtype=types)

# Integer code for every distinct c1 value, numbered in order of first
# appearance. The same mapping is reused later to encode the tournament data.
mappings = {c: i for i, c in enumerate(pd.unique(train_valid['c1']))}

# Assign the encoded column back explicitly instead of calling
# `replace(..., inplace=True)` through attribute access: the explicit
# assignment does not depend on in-place mutation of an intermediate Series
# and yields a numeric column directly.
train_valid['c1'] = train_valid['c1'].map(mappings)

# Partition by the file's own validation flag (0 = train, 1 = validation).
train = train_valid[train_valid['validation'] == 0]
valid = train_valid[train_valid['validation'] == 1]
# Booster settings passed to xgb.train: a tree booster with a logistic
# objective, evaluated with AUC.
params = dict(
    objective="binary:logistic",
    booster="gbtree",
    eta=0.1,
    max_depth=30,
    subsample=0.9,
    colsample_bytree=0.7,
    silent=1,
    seed=0,
    eval_metric="auc",
)

# Upper bound on the number of boosting iterations.
num_boost_round = 500

# Model input columns: f1 .. f14 followed by the encoded categorical c1.
features = list(map('f{}'.format, range(1, 15)))
features.append('c1')
# Hold out a random 20% of all rows for evaluation (fixed seed for
# reproducibility).
# Alternative that was tried: use the file's own validation flag
# (X_train, X_valid = train, valid); recorded result: eval-auc:0.513770
X_train, X_valid = train_test_split(train_valid, test_size=0.20, random_state=10)

# The target is a binary label trained with the "binary:logistic" objective,
# so it is used as-is. Passing it through log1p (a regression-style
# transform) would turn the labels into 0 / ~0.693 and skew the predicted
# probabilities.
y_train = X_train['target']
y_valid = X_valid['target']

dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

# Both sets are reported each round; early stopping is applied with a
# patience of 100 rounds.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist,
                early_stopping_rounds=100, verbose_eval=True)

print("Validating")
# Hold-out predictions; reuses the DMatrix built above rather than
# constructing a second one from the same rows. The result is not used
# further in this script.
yhat = gbm.predict(dvalid)

# Column dtypes for the tournament CSV: same features as training plus the
# row identifier t_id (kept as a string).
testtypes = {'f{}'.format(i): np.dtype(float) for i in range(1, 15)}
testtypes.update({'c1': np.dtype(str), 't_id': np.dtype(str)})
test = pd.read_csv("numerai_tournament_data.csv", dtype=testtypes)

# Encode c1 with the mapping learned from the training data. Values that
# never occurred in training become NaN instead of staying as raw strings.
test['c1'] = test['c1'].map(mappings)

dtest = xgb.DMatrix(test[features])
test_probs = gbm.predict(dtest)

# Make Submission
# The model output is written unchanged: no inverse transform is needed
# because the target was not transformed before training.
result = pd.DataFrame({"t_id": test["t_id"], 'probability': test_probs})
result = result[['t_id', 'probability']]
result.to_csv("sub4.csv", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment