Random Rotation Forest implementation
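The script below builds a rotation forest, following the Packt tutorial linked in the header: the feature set is partitioned into random subsets, each subset is rotated with PCA, and a decision tree is trained on the rotated training data. The ensemble predicts by majority vote.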
# Based on
# https://www.packtpub.com/books/content/rotation-forest-classifier-ensemble-based-feature-extraction
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
import numpy as np
def get_data():
    """
    Make a sample classification dataset.
    Returns: feature matrix x, target vector y
    """
    no_features = 50
    redundant_features = int(0.1 * no_features)
    informative_features = int(0.6 * no_features)
    repeated_features = int(0.1 * no_features)
    x, y = make_classification(n_samples=500, n_features=no_features, flip_y=0.03,
                               n_informative=informative_features,
                               n_redundant=redundant_features,
                               n_repeated=repeated_features, random_state=7)
    return x, y
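# Note: with no_features = 50, get_data() produces 30 informative, 5 redundant,
# and 5 repeated features; the remaining 10 are pure noise, and flip_y=0.03
# assigns a random class to roughly 3% of the samples.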
def get_random_subset(iterable, k):
    """Shuffle the feature indices in place and chop them into subsets of
    size k (the last subset may be smaller)."""
    subsets = []
    np.random.shuffle(iterable)
    limit = int(np.ceil(len(iterable) / k))
    for _ in range(limit):
        subset = min(k, len(iterable))
        subsets.append(iterable[-subset:])
        del iterable[-subset:]
    return subsets
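# Example (results vary because of the in-place shuffle):
#   get_random_subset(list(range(10)), 3)
#   -> e.g. [[7, 2, 9], [4, 0, 1], [5, 8, 3], [6]]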
def build_rotationtree_model(x_train, y_train, d, k):
    """Build d trees, each fit on the training data rotated by its own matrix."""
    models = []
    r_matrices = []
    feature_subsets = []
    for i in range(d):
        # Sample 70% of the training data for fitting the PCAs; varying
        # random_state per tree keeps the rotations diverse across the ensemble
        x, _, _, _ = train_test_split(x_train, y_train, test_size=0.3, random_state=i)
        # Feature ids
        feature_index = list(range(x.shape[1]))
        # Partition the features into random subsets of size k
        random_k_subset = get_random_subset(feature_index, k)
        feature_subsets.append(random_k_subset)
        # Rotation matrix: one PCA loading block per feature subset,
        # zero everywhere else
        R_matrix = np.zeros((x.shape[1], x.shape[1]), dtype=float)
        for each_subset in random_k_subset:
            pca = PCA()
            x_subset = x[:, each_subset]
            pca.fit(x_subset)
            # Scatter the k x k loading block back to the original feature ids
            for ii in range(len(pca.components_)):
                for jj in range(len(pca.components_)):
                    R_matrix[each_subset[ii], each_subset[jj]] = pca.components_[ii, jj]
        # Rotate the full training set and fit a tree on it
        x_transformed = x_train.dot(R_matrix)
        model = DecisionTreeClassifier()
        model.fit(x_transformed, y_train)
        models.append(model)
        r_matrices.append(R_matrix)
    return models, r_matrices, feature_subsets
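# For example, with 4 features split into subsets [2, 0] and [3, 1], R_matrix
# carries one 2x2 PCA loading block over rows/columns {0, 2} and another over
# {1, 3}; every other entry stays zero, so x.dot(R_matrix) mixes features only
# within their own subset.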
def model_worth(models, r_matrices, x, y):
    """Score the ensemble: each tree votes on its own rotation of x, majority wins."""
    predicted_ys = []
    for i, model in enumerate(models):
        # Apply this tree's rotation before predicting
        x_mod = x.dot(r_matrices[i])
        predicted_y = model.predict(x_mod)
        predicted_ys.append(predicted_y)
    # Rows are models, columns are samples
    predicted_matrix = np.asarray(predicted_ys)
    final_prediction = []
    for i in range(len(y)):
        pred_from_all_models = np.ravel(predicted_matrix[:, i])
        # Majority vote over binary 0/1 labels: predict 1 when more than
        # half the trees predict 1
        non_zero_pred = np.nonzero(pred_from_all_models)[0]
        is_one = len(non_zero_pred) > len(models) / 2
        final_prediction.append(int(is_one))
    print(classification_report(y, final_prediction))
if __name__ == "__main__":
    x, y = get_data()
    # Divide the data into train, dev, and test sets
    x_train, x_test_all, y_train, y_test_all = train_test_split(x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(x_test_all, y_test_all, test_size=0.3, random_state=9)
    # Build a bag of 25 rotated trees with feature subsets of size 5
    models, r_matrices, features = build_rotationtree_model(x_train, y_train, 25, 5)
    # Report performance on the training and dev sets
    model_worth(models, r_matrices, x_train, y_train)
    model_worth(models, r_matrices, x_dev, y_dev)
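    # The held-out test split is never touched above; a final unbiased check
    # could be: model_worth(models, r_matrices, x_test, y_test)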