Skip to content

Instantly share code, notes, and snippets.

@phonism
Created March 16, 2016 12:46
Show Gist options
  • Save phonism/9e1f63ba559ee37afea7 to your computer and use it in GitHub Desktop.
Save phonism/9e1f63ba559ee37afea7 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
import math
import xgboost as xgb
# Load both competition files. `test` is read but not used anywhere else in
# this script; it is kept so the module-level name stays available.
train = pd.read_csv('../data/train/PPD_Training_Master_GBK_3_1_Training_Set.csv')
test = pd.read_csv('../data/test/PPD_Master_GBK_2_Test_Set.csv')

# Work on the training rows only. The previous `train.append(test)` result was
# overwritten on the very next line, so it had no effect -- and
# DataFrame.append no longer exists in pandas >= 2.0, where it would crash.
# NOTE: `all_data` is the same object as `train`, so the column added and the
# in-place fill below are visible through `train` as well.
all_data = train

# Per-row count of missing fields, taken before the gaps are filled in.
all_data['cnt'] = all_data.isnull().sum(axis=1)
# Use -1 as the placeholder for every missing value.
all_data.fillna(-1, inplace=True)
# Columns that are re-encoded as integer codes below.
cat_cols = [
    "UserInfo_1", "UserInfo_2", "UserInfo_3", "UserInfo_4",
    "UserInfo_5", "UserInfo_6", "UserInfo_7", "UserInfo_8",
    "UserInfo_9", "UserInfo_11", "UserInfo_12", "UserInfo_13",
    "UserInfo_14", "UserInfo_15", "UserInfo_16", "UserInfo_18",
    "UserInfo_19", "UserInfo_20", "UserInfo_21", "UserInfo_22",
    "UserInfo_23", "UserInfo_24",
    "Education_Info1", "Education_Info2", "Education_Info3",
    "Education_Info4", "Education_Info5", "Education_Info6",
    "Education_Info7", "Education_Info8",
    "WeblogInfo_19", "WeblogInfo_20", "WeblogInfo_21",
    "SocialNetwork_1", "SocialNetwork_2", "SocialNetwork_7",
    "SocialNetwork_12",
]

# Replace each listed column that actually exists in the frame with the
# integer codes returned by pd.factorize; listed columns that are absent
# are skipped silently.
for col in cat_cols:
    if col not in all_data.columns.values:
        continue
    codes, _uniques = pd.factorize(all_data[col])
    all_data[col] = codes
def nomarlize_time_type(t):
    """Return the "/"-separated date string *t* with its long field first.

    When the first field of *t* is shorter than four characters, the first
    three fields are emitted in reverse order ("a/b/c" becomes "c/b/a"), which
    presumably turns day/month/year into year/month/day -- confirm against the
    data. Any other value is returned unchanged, including values with fewer
    than three fields, which cannot be reordered.

    Args:
        t: Date text using "/" as the field separator.

    Returns:
        The reordered string, or *t* itself when no reordering applies.
    """
    # Split once instead of once per field access.
    parts = t.split("/")
    if len(parts[0]) >= 4 or len(parts) < 3:
        return t
    return "/".join([parts[2], parts[1], parts[0]])
# Bring every listing date into the layout produced by nomarlize_time_type
# before any field is extracted from it.
all_data['ListingInfo'] = all_data['ListingInfo'].map(nomarlize_time_type)

# Derive three date columns: the first four characters, then the second and
# third "/"-separated fields.
listing_dates = all_data["ListingInfo"]
all_data["year"] = listing_dates.map(lambda date: date[:4])
all_data["month"] = listing_dates.map(lambda date: date.split("/")[1])
all_data["day"] = listing_dates.map(lambda date: date.split("/")[2])

# The raw date text is no longer needed once its parts are separate columns.
all_data = all_data.drop("ListingInfo", axis=1)
all_data = all_data.fillna(-1)

# Round-trip through a CSV file: on re-read pandas infers the column types
# again, so the string-valued year/month/day columns presumably come back as
# numbers -- NOTE(review): confirm that this is the intent of the round trip.
all_data.to_csv("test.csv", index=False)
all_data = pd.read_csv("test.csv")
# Positional hold-out split: the first 24000 rows are used for fitting and
# all remaining rows for validation.
_CV_TRAIN_ROWS = 24000
cv_train = all_data.iloc[:_CV_TRAIN_ROWS]
cv_test = all_data.iloc[_CV_TRAIN_ROWS:]

# "Idx" and "target" are dropped from the feature matrix; "target" is passed
# separately as the label.
_non_feature_cols = ["Idx", "target"]
dtrain = xgb.DMatrix(
    cv_train.drop(_non_feature_cols, axis=1),
    label=cv_train["target"].values,
)
dtest = xgb.DMatrix(
    cv_test.drop(_non_feature_cols, axis=1),
    label=cv_test["target"].values,
)
# Booster configuration. Alternatives left disabled by the author:
# objective 'binary:logistic' and eta 0.02.
params = {
    "objective": "reg:logistic",
    "eta": 0.007,
    "min_child_weight": 1,
    "subsample": 0.66,
    "colsample_bytree": 0.8,
    "max_depth": 10,
    "eval_metric": "auc",
    "nthread": 5,
    "silent": 1,
}
num_rounds = 2100

# Both sets are passed as evaluation sets, labelled 'train' and 'val'.
watchlist = [(dtrain, 'train'), (dtest, 'val')]
model = xgb.train(params, dtrain, num_boost_round=num_rounds, evals=watchlist)
from sklearn import metrics

# `ntree_limit` is a number of trees, whereas `best_iteration` is a zero-based
# round index; passing the index dropped the last tree (and a value of 0 would
# mean "use every tree"). `best_ntree_limit` is the matching tree count.
y_pred = model.predict(dtest, ntree_limit=model.best_ntree_limit)

# Labels are shifted up by one and the positive class is named explicitly via
# pos_label=2 (presumably targets are 0/1, making them 1/2 here -- confirm).
y = cv_test["target"].values + 1
fpr, tpr, th = metrics.roc_curve(y, y_pred, pos_label=2)

# Call form of print: valid on Python 2 and Python 3 for a single argument.
print(metrics.auc(fpr, tpr))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment