Skip to content

Instantly share code, notes, and snippets.

@BenjaminFraser
Last active April 12, 2020 13:45
Show Gist options
  • Save BenjaminFraser/7d54d99c88d2cf9a38a90a276c0f43e3 to your computer and use it in GitHub Desktop.
Save BenjaminFraser/7d54d99c88d2cf9a38a90a276c0f43e3 to your computer and use it in GitHub Desktop.
Python implementation of a random forest model using Numpy
class RandomForest():
    """ Python implementation of a random forest regressor.

    Builds an ensemble of DecisionTree regressors on random row subsets
    (optionally bootstrapped) and predicts by averaging tree outputs.
    """

    def __init__(self, x, y, num_trees, sample_size, feature_proportion=1.0,
                 min_leaf=5, bootstrap=False, random_seed=12):
        """ Fit the forest immediately on construction.

        Args:
            x (pd.DataFrame): training feature matrix.
            y (np.ndarray): training target values, aligned with x rows.
            num_trees (int): number of decision trees in the ensemble.
            sample_size (int): rows sampled for each individual tree.
            feature_proportion (float): fraction of features each tree may use.
            min_leaf (int): minimum number of samples per leaf node.
            bootstrap (bool): if True, sample rows with replacement and
                track out-of-bag (OOB) rows per tree.
            random_seed (int): seed for reproducible sampling.
        """
        np.random.seed(random_seed)
        self.x = x
        self.y = y
        self.num_trees = num_trees
        self.sample_size = sample_size
        self.feature_proportion = feature_proportion
        self.min_leaf = min_leaf
        self.bootstrap = bootstrap
        self.trees = [self.create_tree(bootstrap) for i in range(num_trees)]

    def create_tree(self, bootstrap=False):
        """ Form an individual decision tree on a random row sample. """
        # obtain a random sample of indices and identify oob samples
        idxs = np.random.permutation(self.y.shape[0])[:self.sample_size]
        oob_idxs = None
        # if bootstrap chosen get bootstrap sample and oob indexes
        if bootstrap:
            idxs, oob_idxs = self.bootstrap_samples(idxs)
        return DecisionTree(self.x.iloc[idxs], self.y[idxs],
                            feat_proportion=self.feature_proportion,
                            idxs=np.array(range(self.sample_size)),
                            oob_idxs=oob_idxs,
                            min_leaf=self.min_leaf)

    def predict(self, x):
        """ Return the mean of predictions across all trees. """
        # call predict function from each Tree class
        return np.mean([t.predict(x) for t in self.trees], axis=0)

    def oob_score(self):
        """ Calculate and return each tree OOB R2 score and the average
            OOB score across all decision trees.

        Raises:
            ValueError: if the forest was built with bootstrap=False,
                in which case no OOB rows were ever recorded.
        """
        # without bootstrapping tree.oob_idxs is None and the indexing
        # below fails obscurely - fail fast with a clear message instead
        if not self.bootstrap:
            raise ValueError("OOB scoring requires bootstrap=True "
                             "when constructing the forest.")
        tree_oob_scores = []
        # find oob score for each tree and append to results
        for tree in self.trees:
            # find current tree oob predictions and labels
            tree_oob_labels = self.y[tree.oob_idxs]
            tree_oob_preds = tree.predict(self.x.iloc[tree.oob_idxs].values)
            # calculate R2 score for predictions on current tree
            tree_oob_r2 = r2_score(tree_oob_labels, tree_oob_preds)
            # add R2 score for oob predictions from this tree
            tree_oob_scores.append(tree_oob_r2)
        tree_oob_scores = np.array(tree_oob_scores)
        # find average oob scores across all trees
        avg_oob_score = np.mean(tree_oob_scores)
        return tree_oob_scores, avg_oob_score

    def bootstrap_samples(self, idxs):
        """ Return bootstrapped sample indices and out-of-bag indices
            drawn from the passed idxs array. """
        # take sample (with replacement) of positions into idxs
        sample_idxs = np.random.randint(0, len(idxs), size=self.sample_size)
        bootstrap_idxs = idxs[sample_idxs]
        # positions never drawn are out-of-bag; np.setdiff1d always yields
        # an int array (the previous list-comprehension produced a float64
        # empty array when no rows were OOB, which breaks fancy indexing)
        # and avoids O(n^2) membership tests against an array
        oob_i = np.setdiff1d(np.arange(self.sample_size), sample_idxs)
        oob_idxs = idxs[oob_i]
        return bootstrap_idxs, oob_idxs

    def feature_importances(self):
        """ Find the feature importances by shuffling each feature
            and finding the drop in score relative to baseline.

        Returns:
            pd.DataFrame: one 'Importance' column indexed by feature name,
                sorted descending (relative drop in R2 when shuffled).
        """
        # find baseline r2 score - all features will compare against this
        baseline_score = r2_score(self.y, self.predict(self.x.values))
        # dictionary to store feature importances
        feat_importances = {}
        columns = self.x.columns
        # iterate through each column, shuffle and get new score
        for feat_column in columns:
            # shuffle only current column; assign the permuted copy back
            # into the frame - shuffling `.values` in place is lost when
            # `.values` is a copy (e.g. for mixed-dtype DataFrames)
            temp_df = self.x.copy()
            temp_df[feat_column] = np.random.permutation(
                temp_df[feat_column].values)
            # find new R2 score with shuffled feature
            shuffled_score = r2_score(self.y, self.predict(temp_df.values))
            # calculate how much score has changed - this represents importance
            feat_score = (baseline_score - shuffled_score) / baseline_score
            # add to importance dict
            feat_importances[feat_column] = feat_score
        importance_df = pd.DataFrame.from_dict(feat_importances,
                                               orient='index',
                                               columns=['Importance'])
        return importance_df.sort_values('Importance', ascending=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment