Created
August 7, 2013 15:23
-
-
Save okay001/6175061 to your computer and use it in GitHub Desktop.
Logistic regression
for the Kaggle Amazon competition
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# <nbformat>3.0</nbformat> | |
# <codecell> | |
from numpy import array, hstack | |
from sklearn import metrics, cross_validation, linear_model | |
from scipy import sparse | |
from itertools import combinations | |
import numpy as np | |
import pandas as pd | |
import csv | |
SEED = 25 | |
# <codecell> | |
def group_data(data, degree=3, hash=hash):
    """Hash every combination of ``degree`` columns into one new feature column.

    For each combination of ``degree`` column indices (in the order produced
    by ``itertools.combinations``), the values of every row in those columns
    are packed into a tuple and passed to ``hash``; the results for all rows
    form one output column.

    Args:
        data: 2-D ``numpy.array`` of shape (rows, columns).
        degree: Number of columns combined into each new feature.
        hash: Callable applied to each tuple of row values. Defaults to the
            builtin ``hash``; the parameter name is kept for backward
            compatibility even though it shadows the builtin.

    Returns:
        ``numpy.array`` with one row per input row and one column per
        combination. When no combination exists (``degree`` larger than the
        number of columns) the result has shape (rows, 0), so it can still be
        sliced by row and stacked horizontally with other feature blocks.
    """
    # m: number of rows of data, n: number of columns of data
    m, n = data.shape
    print("8C %s" % degree)
    new_data = []
    for indicies in combinations(range(n), degree):
        print(indicies)
        selected = data[:, list(indicies)]
        new_data.append([hash(tuple(v)) for v in selected])
    # new_data holds one list per combination; the explicit reshape keeps the
    # result 2-D even when there are no combinations at all, and the transpose
    # turns combinations into columns.
    return array(new_data).reshape(len(new_data), m).T
# <codecell> | |
def OneHotEncoder(data, keymap=None):
    """Convert a matrix of categorical columns into a sparse binary matrix.

    Every input column is expanded into one indicator column per category;
    the per-column blocks are concatenated horizontally in input order.

    Args:
        data: 2-D ``numpy.array`` whose columns hold categorical values.
        keymap: Optional list with one dict per column mapping a category to
            its indicator-column index within that column's block. When
            omitted, a keymap is built from the distinct values found in
            ``data``. When supplied it is used as is, and categories in
            ``data`` that are missing from it are ignored (their row is left
            all-zero within that block).

    Returns:
        Tuple ``(matrix, keymap)``: the encoded data as a CSR sparse matrix
        and the keymap that was used (the supplied one, or the one built
        here). Input with zero columns yields a matrix with zero columns.
    """
    if keymap is None:
        keymap = []
        for col in data.T:
            uniques = set(col)
            keymap.append(dict((key, i) for i, key in enumerate(uniques)))
    total_pts = data.shape[0]
    outdat = []
    for i, col in enumerate(data.T):
        km = keymap[i]
        # Collect the coordinates of all ones first and build the block in a
        # single call instead of assigning sparse cells one at a time.
        rows = []
        cols = []
        for j, val in enumerate(col):
            label = km.get(val)
            if label is not None:
                rows.append(j)
                cols.append(label)
        coords = (np.array(rows, dtype=int), np.array(cols, dtype=int))
        outdat.append(sparse.coo_matrix((np.ones(len(rows)), coords),
                                        shape=(total_pts, len(km))))
    if not outdat:
        # sparse.hstack rejects an empty block list, so build the empty
        # result directly.
        return sparse.csr_matrix((total_pts, 0)), keymap
    return sparse.hstack(outdat).tocsr(), keymap
# <codecell> | |
# This loop essentially from Paul's starter code | |
def cv_loop(X, y, model, N):
    """Return the mean AUC of ``model`` over ``N`` random train/validation splits.

    Each round holds out 20% of the rows via
    ``cross_validation.train_test_split`` (seeded with ``i * SEED`` so the
    splits are reproducible), fits ``model`` on the remainder and scores the
    predicted probability of the second class on the held-out rows.

    Args:
        X: Feature matrix accepted by ``train_test_split`` and ``model.fit``.
        y: Target labels aligned with the rows of ``X``.
        model: Estimator providing ``fit`` and ``predict_proba``.
        N: Number of random splits to average over; must be at least 1.

    Returns:
        The arithmetic mean of the ``N`` AUC scores.

    Raises:
        ValueError: If ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError("N must be at least 1, got %r" % (N,))
    # auc_score was renamed to roc_auc_score in scikit-learn 0.14; prefer the
    # new name and fall back to the old one on earlier releases.
    try:
        auc_fn = metrics.roc_auc_score
    except AttributeError:
        auc_fn = metrics.auc_score
    mean_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=i * SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]
        mean_auc += auc_fn(y_cv, preds)
    return mean_auc / N
# <codecell> | |
train = 'data/train.csv'
test = 'data/test.csv'
train_data = pd.read_csv(train)
test_data = pd.read_csv(test)
# Stack train and test rows, dropping the first column and the last column
# (ROLE_CODE) of each file. Positional .iloc replaces the removed .ix indexer.
all_data = np.vstack((train_data.iloc[:, 1:-1].values,
                      test_data.iloc[:, 1:-1].values))
num_train = train_data.shape[0]

print("Transforming data…")
dp = group_data(all_data, degree=2)  # hashed features for every pair of columns
dt = group_data(all_data, degree=3)  # hashed features for every triple of columns

y = array(train_data.ACTION)

# Rows before num_train come from the training file, the rest from the test file.
X = all_data[:num_train]
X_2 = dp[:num_train]
X_3 = dt[:num_train]

X_test = all_data[num_train:]
X_test_2 = dp[num_train:]
X_test_3 = dt[num_train:]

# Original columns followed by the pair features and then the triple features.
X_train_all = np.hstack((X, X_2, X_3))
X_test_all = np.hstack((X_test, X_test_2, X_test_3))
#print X_train_all[1, :15] | |
# <codecell> | |
# Dump the combined training features to CSV. The original ended with
# ``csvf.close`` (no parentheses), which never closed the file; the ``with``
# block closes and flushes it even if writing fails.
with open('X_train_all.csv', 'w') as csvf:
    writer = csv.writer(csvf)
    writer.writerows(X_train_all)
# <codecell> | |
# Dump the combined test features to CSV. The original ended with
# ``csvf.close`` (no parentheses), which never closed the file; the ``with``
# block closes and flushes it even if writing fails.
with open('X_test_all.csv', 'w') as csvf:
    writer = csv.writer(csvf)
    writer.writerows(X_test_all)
# <codecell> | |
num_features = X_train_all.shape[1]
# Single-argument print(...) produces the same output on Python 2 and 3.
print("num_features: %s" % num_features)
model = linear_model.LogisticRegression()
# <codecell>
# One-hot encode every training column separately so that the feature
# selection below can assemble any subset of columns with a cheap hstack.
Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)]
# <codecell> | |
print("Performing greedy feature selection...")
score_hist = []
N = 10
# Selection resumes from a previously found feature set; use an empty set
# here instead to start the search from scratch.
good_features = set([0, 8, 9, 10, 12, 19, 34, 36, 37, 38, 42, 43, 47, 53, 60, 61, 63, 64, 67, 69, 71, 75, 81, 82, 85])
# Greedy feature selection loop: run at least two rounds, then keep going
# until the newest best score is no better than the one before it.
while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
    scores = []
    # Try adding each not-yet-selected feature and cross-validate the result.
    for f in range(len(Xts)):
        if f in good_features:
            continue
        feats = list(good_features) + [f]
        Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
        score = cv_loop(Xt, y, model, N)
        scores.append((score, f))
        # Mean AUC obtained when feature f is added to the current set.
        print("Feature: %i Mean AUC: %f" % (f, score))
    if not scores:
        # Every feature is already selected; nothing is left to try.
        break
    best = max(scores)
    good_features.add(best[1])  # keep the feature that scored best this round
    score_hist.append(best)  # remember its score and index
    print("Current features: %s" % sorted(list(good_features)))
    print("-- -- -- -- --")
# The loop normally ends right after adding a feature that failed to improve
# the score, so that last addition is rolled back. Nothing is removed when the
# loop ended only because no candidate features were left.
if len(score_hist) >= 2 and not score_hist[-1][0] > score_hist[-2][0]:
    good_features.remove(score_hist[-1][1])
good_features = sorted(list(good_features))
print("Selected features %s" % good_features)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment