Random Rotation Forest implementation
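The script below builds a rotation forest, following the Packt tutorial linked in the header: the feature set is partitioned into random subsets, each subset is rotated with PCA, and a decision tree is trained on the rotated training data. The ensemble predicts by majority vote.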
# Based on
# https://www.packtpub.com/books/content/rotation-forest-classifier-ensemble-based-feature-extraction
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
import numpy as np
def get_data():
    """
    Make a sample classification dataset.
    Returns: feature matrix x, target vector y
    """
    no_features = 50
    redundant_features = int(0.1 * no_features)
    informative_features = int(0.6 * no_features)
    repeated_features = int(0.1 * no_features)
    x, y = make_classification(n_samples=500, n_features=no_features, flip_y=0.03,
                               n_informative=informative_features,
                               n_redundant=redundant_features,
                               n_repeated=repeated_features, random_state=7)
    return x, y
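# Note: with no_features = 50, get_data() produces 30 informative, 5 redundant,
# and 5 repeated features; the remaining 10 are pure noise, and flip_y=0.03
# assigns a random class to roughly 3% of the samples.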
def get_random_subset(iterable, k):
    """Shuffle the feature indices in place and chop them into subsets of
    size k (the last subset may be smaller)."""
    subsets = []
    np.random.shuffle(iterable)
    limit = int(np.ceil(len(iterable) / k))
    for _ in range(limit):
        subset = min(k, len(iterable))
        subsets.append(iterable[-subset:])
        del iterable[-subset:]
    return subsets
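# Example (results vary because of the in-place shuffle):
#   get_random_subset(list(range(10)), 3)
#   -> e.g. [[7, 2, 9], [4, 0, 1], [5, 8, 3], [6]]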
def build_rotationtree_model(x_train, y_train, d, k):
    """Build d trees, each fit on the training data rotated by its own matrix."""
    models = []
    r_matrices = []
    feature_subsets = []
    for i in range(d):
        # Sample 70% of the training data for fitting the PCAs; varying
        # random_state per tree keeps the rotations diverse across the ensemble
        x, _, _, _ = train_test_split(x_train, y_train, test_size=0.3, random_state=i)
        # Feature ids
        feature_index = list(range(x.shape[1]))
        # Partition the features into random subsets of size k
        random_k_subset = get_random_subset(feature_index, k)
        feature_subsets.append(random_k_subset)
        # Rotation matrix: one PCA loading block per feature subset,
        # zero everywhere else
        R_matrix = np.zeros((x.shape[1], x.shape[1]), dtype=float)
        for each_subset in random_k_subset:
            pca = PCA()
            x_subset = x[:, each_subset]
            pca.fit(x_subset)
            # Scatter the k x k loading block back to the original feature ids
            for ii in range(len(pca.components_)):
                for jj in range(len(pca.components_)):
                    R_matrix[each_subset[ii], each_subset[jj]] = pca.components_[ii, jj]
        # Rotate the full training set and fit a tree on it
        x_transformed = x_train.dot(R_matrix)
        model = DecisionTreeClassifier()
        model.fit(x_transformed, y_train)
        models.append(model)
        r_matrices.append(R_matrix)
    return models, r_matrices, feature_subsets
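# For example, with 4 features split into subsets [2, 0] and [3, 1], R_matrix
# carries one 2x2 PCA loading block over rows/columns {0, 2} and another over
# {1, 3}; every other entry stays zero, so x.dot(R_matrix) mixes features only
# within their own subset.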
def model_worth(models, r_matrices, x, y):
    """Score the ensemble: each tree votes on its own rotation of x, majority wins."""
    predicted_ys = []
    for i, model in enumerate(models):
        # Apply this tree's rotation before predicting
        x_mod = x.dot(r_matrices[i])
        predicted_y = model.predict(x_mod)
        predicted_ys.append(predicted_y)
    # Rows are models, columns are samples
    predicted_matrix = np.asarray(predicted_ys)
    final_prediction = []
    for i in range(len(y)):
        pred_from_all_models = np.ravel(predicted_matrix[:, i])
        # Majority vote over binary 0/1 labels: predict 1 when more than
        # half the trees predict 1
        non_zero_pred = np.nonzero(pred_from_all_models)[0]
        is_one = len(non_zero_pred) > len(models) / 2
        final_prediction.append(int(is_one))
    print(classification_report(y, final_prediction))
if __name__ == "__main__":
    x, y = get_data()
    # Divide the data into train, dev, and test sets
    x_train, x_test_all, y_train, y_test_all = train_test_split(x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(x_test_all, y_test_all, test_size=0.3, random_state=9)
    # Build a bag of 25 rotated trees with feature subsets of size 5
    models, r_matrices, features = build_rotationtree_model(x_train, y_train, 25, 5)
    # Report performance on the training and dev sets
    model_worth(models, r_matrices, x_train, y_train)
    model_worth(models, r_matrices, x_dev, y_dev)
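    # The held-out test split is never touched above; a final unbiased check
    # could be: model_worth(models, r_matrices, x_test, y_test)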