okay001/amazon_featureSelect.py

## amazon_featureSelect.py
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>

# <codecell>

from numpy import array, hstack
from sklearn import metrics, cross_validation, linear_model
from scipy import sparse
from itertools import combinations
import numpy as np
import pandas as pd
import csv

SEED = 25

# <codecell>

def group_data(data, degree=3, hash=hash):
    """
    numpy.array -> numpy.array

    Build one hashed feature column per combination of `degree` columns.

    For every combination of `degree` column indices (in the order produced
    by itertools.combinations) the values of each row in those columns are
    packed into a tuple and passed through `hash`.

    Parameters:
        data   -- 2-D numpy array, one row per sample.
        degree -- how many columns are combined into each new feature.
        hash   -- callable applied to every tuple of row values.  Defaults to
                  the builtin; the parameter keeps its original name so that
                  existing keyword callers continue to work.

    Returns a numpy array with one row per sample and one column per
    combination.  The degree and every index combination are printed to
    stdout as progress output.
    """
    # Only the column count is needed; the unpacking still rejects non-2-D input
    _, n = data.shape
    # A single parenthesised argument prints the same text on Python 2 and 3
    print("8C %s" % (degree,))
    new_data = []
    for indicies in combinations(range(n), degree):
        print(indicies)
        # Index with a list so the selection is an explicit fancy index
        selected = data[:, list(indicies)]
        new_data.append([hash(tuple(row)) for row in selected])
    # new_data holds one list per combination; transpose to one row per sample
    return array(new_data).T

# <codecell>

def OneHotEncoder(data, keymap=None):
    """
    Convert a matrix of categorical columns into a sparse binary matrix.

    Each column of `data` is expanded into one indicator column per category
    and the expansions are stacked side by side.

    Parameters:
        data   -- 2-D array; every column is treated as categorical.
        keymap -- optional sequence with one dict per column mapping
                  category -> indicator column index.  When omitted it is
                  built from the distinct values found in `data`.  When
                  supplied it is used as-is, and values that are missing
                  from it leave their row all-zero for that column.

    Returns the tuple (CSR sparse matrix, keymap).
    """
    if keymap is None:
        keymap = []
        for column in data.T:
            categories = set(column)
            keymap.append(dict(zip(categories, range(len(categories)))))

    n_rows = data.shape[0]
    blocks = []
    for idx, column in enumerate(data.T):
        mapping = keymap[idx]
        indicator = sparse.lil_matrix((n_rows, len(mapping)))
        for row, value in enumerate(column):
            if value not in mapping:
                # Unknown category: leave this row empty for the column
                continue
            indicator[row, mapping[value]] = 1
        blocks.append(indicator)
    return sparse.hstack(blocks).tocsr(), keymap

# <codecell>

# This loop essentially from Paul's starter code
def cv_loop(X, y, model, N):
    """
    Return the mean AUC of `model` over N random 80/20 splits of (X, y).

    Split number i is drawn with random_state = i * SEED.  For each split the
    model is refit on the training part and scored on the held-out part using
    the second column of predict_proba.
    """
    fold_aucs = []
    for fold in range(N):
        split = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=fold * SEED)
        X_train, X_cv, y_train, y_cv = split
        model.fit(X_train, y_train)
        held_out_scores = model.predict_proba(X_cv)[:, 1]
        fold_aucs.append(metrics.auc_score(y_cv, held_out_scores))
    # Float start value keeps the accumulation identical to a 0. accumulator
    return sum(fold_aucs, 0.) / N

# <codecell>

# Paths of the training and test CSV files
train = 'data/train.csv'
test = 'data/test.csv'
train_data = pd.read_csv(train)
test_data = pd.read_csv(test)

# Stack the train and test rows, keeping columns 1..-1 of each frame, i.e.
# without the first and the last column (the original note names the last
# column ROLE_CODE)
all_data = np.vstack((train_data.ix[:,1:-1], test_data.ix[:,1:-1]))
# Number of training rows; used below to split the stacked arrays apart again
num_train = np.shape(train_data)[0]

#print "train_data.shape:", train_data.shape
#print train_data.ix[:1,]
#print "all_data.shape:", all_data.shape
#print all_data[:2,]

print "Transforming data…"
dp = group_data(all_data, degree=2)   # hashed features, one per pair of columns
dt = group_data(all_data, degree=3)   # hashed features, one per triple of columns

# Target vector: the ACTION column of the training frame
y = array(train_data.ACTION)

X = all_data[:num_train]    # rows before num_train: the training part
X_2 = dp[:num_train]
X_3 = dt[:num_train]

X_test = all_data[num_train:]   # rows from num_train onward: the test part
X_test_2 = dp[num_train:]
X_test_3 = dt[num_train:]

# Original columns followed by the pair and triple features, side by side
X_train_all = np.hstack((X, X_2, X_3))
X_test_all = np.hstack((X_test, X_test_2, X_test_3))

#print X_train_all[1, :15]


# <codecell>

# Dump the combined training features to CSV.  The context manager makes sure
# the file is flushed and closed, even if writing raises.
with open('X_train_all.csv', 'w') as csvf:
    writer = csv.writer(csvf)
    writer.writerows(X_train_all)

# <codecell>

# Dump the combined test features to CSV.  The context manager makes sure the
# file is flushed and closed, even if writing raises.
with open('X_test_all.csv', 'w') as csvf:
    writer = csv.writer(csvf)
    writer.writerows(X_test_all)

# <codecell>

num_features = X_train_all.shape[1]
print "num_features:", num_features

model = linear_model.LogisticRegression()

# <codecell>

Xts = [OneHotEncoder(X_train_all[:,[i]])[0] for i in range(num_features)]

# <codecell>

print "Performing greedy feature selection..."
score_hist = []
N = 10
#good_features = set([])
good_features = set([0, 8, 9, 10, 12, 19, 34, 36, 37, 38, 42, 43, 47, 53, 60, 61, 63, 64, 67, 69, 71, 75, 81, 82, 85])
# Greedy feature selection loop

#ずっと続ける 最低2回以上、最新のスコアが一個前よりも悪くなるまで続ける
while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
    scores = []

    #good説明変数以外の変数を足して交差検定してみる
    for f in range(len(Xts)):
        if f not in good_features:
            feats = list(good_features) + [f]                    #新しく説明変数を追加
            Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
            score = cv_loop(Xt, y, model, N)
            scores.append((score, f))
            print "Feature: %i Mean AUC: %f" % (f, score)   #i番目の変数を説明変数に加えたときのAUC
    good_features.add(sorted(scores)[-1][1])      #一番よかった変数をgood説明変数に追加
    score_hist.append(sorted(scores)[-1])         #よかったスコアと変数を記録
    print "Current features: %s" % sorted(list(good_features))
    print "--    --      --     --       --"

     # Remove last added feature from good_features
good_features.remove(score_hist[-1][1])
good_features = sorted(list(good_features))
print "Selected features %s" % good_features

## amazon_logistic.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              amazon_logistic.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	# -*- coding: utf-8 -*-
	# <nbformat>3.0</nbformat>

	# <codecell>

	from numpy import array, hstack
	from sklearn import metrics, cross_validation, linear_model
	from scipy import sparse
	from itertools import combinations
	import numpy as np
	import pandas as pd
	import csv

	SEED = 25

	# <codecell>

	def group_data(data, degree=3, hash=hash):
		"""
		numpy.array -> numpy.array

		Build one hashed feature column per combination of `degree` columns.

		For every combination of `degree` column indices (in the order produced
		by itertools.combinations) the values of each row in those columns are
		packed into a tuple and passed through `hash`.

		Parameters:
			data   -- 2-D numpy array, one row per sample.
			degree -- how many columns are combined into each new feature.
			hash   -- callable applied to every tuple of row values.  Defaults
			          to the builtin; the parameter keeps its original name so
			          that existing keyword callers continue to work.

		Returns a numpy array with one row per sample and one column per
		combination.  The degree and every index combination are printed to
		stdout as progress output.
		"""
		# Only the column count is needed; the unpacking still rejects non-2-D input
		_, n = data.shape
		# A single parenthesised argument prints the same text on Python 2 and 3
		print("8C %s" % (degree,))
		new_data = []
		for indicies in combinations(range(n), degree):
			print(indicies)
			# Index with a list so the selection is an explicit fancy index
			selected = data[:, list(indicies)]
			new_data.append([hash(tuple(row)) for row in selected])
		# new_data holds one list per combination; transpose to one row per sample
		return array(new_data).T

	# <codecell>

	def OneHotEncoder(data, keymap=None):
		"""
		Convert a matrix of categorical columns into a sparse binary matrix.

		Each column of `data` is expanded into one indicator column per
		category and the expansions are stacked side by side.

		Parameters:
			data   -- 2-D array; every column is treated as categorical.
			keymap -- optional sequence with one dict per column mapping
			          category -> indicator column index.  When omitted it is
			          built from the distinct values found in `data`.  When
			          supplied it is used as-is, and values that are missing
			          from it leave their row all-zero for that column.

		Returns the tuple (CSR sparse matrix, keymap).
		"""
		if keymap is None:
			keymap = []
			for col in data.T:
				uniques = set(list(col))
				keymap.append(dict((key, i) for i, key in enumerate(uniques)))
		total_pts = data.shape[0]
		outdat = []
		for i, col in enumerate(data.T):
			km = keymap[i]
			num_labels = len(km)
			spmat = sparse.lil_matrix((total_pts, num_labels))
			for j, val in enumerate(col):
				# Categories missing from the keymap are skipped
				if val in km:
					spmat[j, km[val]] = 1
			outdat.append(spmat)
		outdat = sparse.hstack(outdat).tocsr()
		return outdat, keymap

	# <codecell>

	# This loop essentially from Paul's starter code
	def cv_loop(X, y, model, N):
		"""
		Return the mean AUC of `model` over N random 80/20 splits of (X, y).

		Split number i is drawn with random_state = i * SEED.  For each split
		the model is refit on the training part and scored on the held-out
		part using the second column of predict_proba.
		"""
		mean_auc = 0.
		for i in range(N):
			X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
				X, y, test_size=.20,
				random_state=i*SEED)
			model.fit(X_train, y_train)
			preds = model.predict_proba(X_cv)[:, 1]
			auc = metrics.auc_score(y_cv, preds)
			mean_auc += auc
		return mean_auc/N

	# <codecell>

	# Paths of the training and test CSV files
	train = 'data/train.csv'
	test = 'data/test.csv'
	train_data = pd.read_csv(train)
	test_data = pd.read_csv(test)

	# Stack the train and test rows, keeping columns 1..-1 of each frame, i.e.
	# without the first and the last column (the original note names the last
	# column ROLE_CODE)
	all_data = np.vstack((train_data.ix[:,1:-1], test_data.ix[:,1:-1]))
	# Number of training rows; used below to split the stacked arrays apart again
	num_train = np.shape(train_data)[0]

	#print "train_data.shape:", train_data.shape
	#print train_data.ix[:1,]
	#print "all_data.shape:", all_data.shape
	#print all_data[:2,]

	print "Transforming data…"
	dp = group_data(all_data, degree=2) # hashed features, one per pair of columns
	dt = group_data(all_data, degree=3) # hashed features, one per triple of columns

	# Target vector: the ACTION column of the training frame
	y = array(train_data.ACTION)

	X = all_data[:num_train] # rows before num_train: the training part
	X_2 = dp[:num_train]
	X_3 = dt[:num_train]

	X_test = all_data[num_train:] # rows from num_train onward: the test part
	X_test_2 = dp[num_train:]
	X_test_3 = dt[num_train:]

	# Original columns followed by the pair and triple features, side by side
	X_train_all = np.hstack((X, X_2, X_3))
	X_test_all = np.hstack((X_test, X_test_2, X_test_3))

	#print X_train_all[1, :15]


	# <codecell>

	# Dump the combined training features to CSV.  The context manager makes
	# sure the file is flushed and closed, even if writing raises.
	with open('X_train_all.csv', 'w') as csvf:
		writer = csv.writer(csvf)
		writer.writerows(X_train_all)

	# <codecell>

	# Dump the combined test features to CSV.  The context manager makes sure
	# the file is flushed and closed, even if writing raises.
	with open('X_test_all.csv', 'w') as csvf:
		writer = csv.writer(csvf)
		writer.writerows(X_test_all)

	# <codecell>

	num_features = X_train_all.shape[1]
	print "num_features:", num_features

	model = linear_model.LogisticRegression()

	# <codecell>

	Xts = [OneHotEncoder(X_train_all[:,[i]])[0] for i in range(num_features)]

	# <codecell>

	print "Performing greedy feature selection..."
	score_hist = []
	N = 10
	#good_features = set([])
	good_features = set([0, 8, 9, 10, 12, 19, 34, 36, 37, 38, 42, 43, 47, 53, 60, 61, 63, 64, 67, 69, 71, 75, 81, 82, 85])
	# Greedy feature selection loop

	#ずっと続ける最低2回以上、最新のスコアが一個前よりも悪くなるまで続ける
	while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
	scores = []

	#good説明変数以外の変数を足して交差検定してみる
	for f in range(len(Xts)):
	if f not in good_features:
	feats = list(good_features) + [f] #新しく説明変数を追加
	Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
	score = cv_loop(Xt, y, model, N)
	scores.append((score, f))
	print "Feature: %i Mean AUC: %f" % (f, score) #i番目の変数を説明変数に加えたときのAUC
	good_features.add(sorted(scores)[-1][1]) #一番よかった変数をgood説明変数に追加
	score_hist.append(sorted(scores)[-1]) #よかったスコアと変数を記録
	print "Current features: %s" % sorted(list(good_features))
	print "-- -- -- -- --"

	# Remove last added feature from good_features
	good_features.remove(score_hist[-1][1])
	good_features = sorted(list(good_features))
	print "Selected features %s" % good_features