# Scikit-learn random forest (RF) with integer-encoded categoricals
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# Load the train and test sets. The CSV files carry no header row, so pandas
# labels the columns with integers 0, 1, 2, ...
d_train = pd.read_csv("train-intcateg-1m.csv", header=None)
d_test = pd.read_csv("test-intcateg-1m.csv", header=None)

# Columns labelled 0..7 are used as features and column 8 as the target.
# `.loc` replaces the removed `DataFrame.ix` indexer; with integer column
# labels `.ix` was label-based, so the slice 0:7 is inclusive of 7 in both.
X_train = d_train.loc[:, 0:7]
y_train = d_train.loc[:, 8]
X_test = d_test.loc[:, 0:7]
y_test = d_test.loc[:, 8]
md = RandomForestClassifier(n_estimators = 500, n_jobs = -1)
%time, y_train)
%time phat = md.predict_proba(X_test)[:,1]
metrics.roc_auc_score(y_test, phat)
