Skip to content

Instantly share code, notes, and snippets.

@okay001
Created August 7, 2013 15:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save okay001/6175061 to your computer and use it in GitHub Desktop.
Save okay001/6175061 to your computer and use it in GitHub Desktop.
Logistic regression for the Kaggle Amazon competition
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <codecell>
from numpy import array, hstack
from sklearn import metrics, cross_validation, linear_model
from scipy import sparse
from itertools import combinations
import numpy as np
import pandas as pd
import csv
SEED = 25
# <codecell>
def group_data(data, degree=3, hash=hash):
    """Hash every ``degree``-wise combination of columns into a new feature.

    For each combination of ``degree`` column indices (in the order produced
    by ``itertools.combinations``), the values a row holds in those columns
    are packed into a tuple and passed to ``hash``.

    Args:
        data: 2-D numpy array with one sample per row.
        degree: number of columns combined into each new feature.
        hash: callable applied to each tuple of row values; defaults to the
            builtin ``hash``.

    Returns:
        numpy array with one row per row of ``data`` and one column per
        column combination.
    """
    new_data = []
    # Only the column count is needed; unpacking still enforces 2-D input.
    _, n = data.shape
    print("8C %s" % (degree,))
    for indices in combinations(range(n), degree):
        print(indices)
        # Index with a list so the selection is plain column fancy-indexing.
        selected = data[:, list(indices)]
        new_data.append([hash(tuple(row)) for row in selected])
    # new_data is (combinations x rows); transpose so each row is a sample.
    return array(new_data).T
# <codecell>
def OneHotEncoder(data, keymap=None):
    """Convert a matrix of categorical columns into a sparse binary matrix.

    Each column of ``data`` is expanded into one indicator column per
    category, and the per-column results are stacked side by side.

    Args:
        data: 2-D array whose columns hold categorical values.
        keymap: optional list with one dict per column of ``data`` mapping
            category -> indicator column index. When supplied it is used
            instead of building a new one, and any category that appears in
            ``data`` but not in the keymap is ignored (its row stays all
            zero for that column).

    Returns:
        Tuple ``(outdat, keymap)``: the sparse binary matrix in CSR format
        and the keymap that was used to build it.
    """
    if keymap is None:
        # One category -> index dict per column, in set iteration order.
        keymap = [
            dict((key, i) for i, key in enumerate(set(col)))
            for col in data.T
        ]
    total_pts = data.shape[0]
    outdat = []
    for i, col in enumerate(data.T):
        km = keymap[i]
        num_labels = len(km)
        # LIL format supports cheap element-wise assignment.
        spmat = sparse.lil_matrix((total_pts, num_labels))
        for j, val in enumerate(col):
            if val in km:
                spmat[j, km[val]] = 1
        outdat.append(spmat)
    outdat = sparse.hstack(outdat).tocsr()
    return outdat, keymap
# <codecell>
# This loop essentially from Paul's starter code
def cv_loop(X, y, model, N):
    """Return the mean AUC of ``model`` over ``N`` random 80/20 splits.

    For each ``i`` in ``range(N)`` the data is split with
    ``random_state = i * SEED``, the model is fitted on the training part,
    and the AUC of its positive-class probabilities on the held-out part is
    accumulated.

    Args:
        X: feature matrix accepted by ``train_test_split`` and ``model``.
        y: target labels aligned with the rows of ``X``.
        model: estimator exposing ``fit`` and ``predict_proba``.
        N: number of random splits to average over; must be positive.

    Returns:
        The arithmetic mean of the ``N`` AUC values.

    Raises:
        ValueError: if ``N`` is not positive (the mean would be undefined).
    """
    if N <= 0:
        raise ValueError("N must be a positive number of splits, got %r" % (N,))
    # Prefer the current scikit-learn location of train_test_split and fall
    # back to the legacy ``cross_validation`` module imported by this file.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        train_test_split = cross_validation.train_test_split
    # ``auc_score`` was renamed ``roc_auc_score``; use whichever exists.
    auc_fn = getattr(metrics, "roc_auc_score", None) or metrics.auc_score

    total_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = train_test_split(
            X, y, test_size=.20, random_state=i * SEED)
        model.fit(X_train, y_train)
        # Column 1 of predict_proba is the probability of the positive class.
        preds = model.predict_proba(X_cv)[:, 1]
        total_auc += auc_fn(y_cv, preds)
    return total_auc / N
# <codecell>
# Load the competition data and build the combined feature matrices.
train = 'data/train.csv'
test = 'data/test.csv'
train_data = pd.read_csv(train)
test_data = pd.read_csv(test)
# Stack train and test rows, dropping the first and the last column of each
# (the original note identifies the last column as ROLE_CODE).
# ``.iloc`` is the positional indexer; the old ``.ix`` indexer is no longer
# available in current pandas.
all_data = np.vstack((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:-1]))
num_train = np.shape(train_data)[0]
print("Transforming data…")
dp = group_data(all_data, degree=2)  # hashed features for every pair of columns
dt = group_data(all_data, degree=3)  # hashed features for every triple of columns
y = array(train_data.ACTION)
# Rows before num_train came from the training file, the rest from the test file.
X = all_data[:num_train]
X_2 = dp[:num_train]
X_3 = dt[:num_train]
X_test = all_data[num_train:]
X_test_2 = dp[num_train:]
X_test_3 = dt[num_train:]
# Original columns followed by the pair features and then the triple features.
X_train_all = np.hstack((X, X_2, X_3))
X_test_all = np.hstack((X_test, X_test_2, X_test_3))
#print X_train_all[1, :15]
# <codecell>
# Dump the combined training features to CSV. The context manager closes the
# file; the earlier bare ``csvf.close`` only referenced the method and never
# called it.
with open('X_train_all.csv', 'w') as csvf:
    writer = csv.writer(csvf)
    writer.writerows(X_train_all)
# <codecell>
# Dump the combined test features to CSV. The context manager closes the
# file; the earlier bare ``csvf.close`` only referenced the method and never
# called it.
with open('X_test_all.csv', 'w') as csvf:
    writer = csv.writer(csvf)
    writer.writerows(X_test_all)
# <codecell>
# Number of candidate feature columns (original + pair + triple features).
num_features = X_train_all.shape[1]
# Single-argument print form gives the same output on Python 2 and 3.
print("num_features: %s" % (num_features,))
# Logistic regression with the library's default settings.
model = linear_model.LogisticRegression()
# <codecell>
# One-hot encode every feature column on its own so that the feature
# selection step can combine arbitrary subsets of them later.
Xts = []
for col_idx in range(num_features):
    # Slice with a list to keep the column 2-D; the keymap is not needed here.
    encoded, _ = OneHotEncoder(X_train_all[:, [col_idx]])
    Xts.append(encoded)
# <codecell>
print("Performing greedy feature selection...")
score_hist = []
N = 10
# Start from a previously chosen feature set; use an empty set instead to
# run the selection from scratch.
good_features = {0, 8, 9, 10, 12, 19, 34, 36, 37, 38, 42, 43, 47, 53, 60, 61,
                 63, 64, 67, 69, 71, 75, 81, 82, 85}
# Greedy feature selection loop: run at least two rounds, then keep going
# for as long as the newest best score beats the previous round's best.
while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
    scores = []
    # Try adding each feature that is not selected yet and cross-validate.
    for f in range(len(Xts)):
        if f not in good_features:
            feats = list(good_features) + [f]  # candidate set with f added
            Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
            score = cv_loop(Xt, y, model, N)
            scores.append((score, f))
            # Mean AUC obtained when feature f joins the current set.
            print("Feature: %i Mean AUC: %f" % (f, score))
    if not scores:
        # Every feature is already selected; there is nothing left to try.
        break
    best = max(scores)  # same (score, feature) pair as sorted(scores)[-1]
    good_features.add(best[1])  # keep the best-scoring candidate
    score_hist.append(best)  # record its score and feature index
    print("Current features: %s" % sorted(good_features))
    print("-- -- -- -- --")
else:
    # The loop ended because the last addition did not improve the score,
    # so remove that last added feature. This is skipped when the loop
    # stopped only because no candidates were left.
    good_features.remove(score_hist[-1][1])
good_features = sorted(good_features)
print("Selected features %s" % good_features)
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment