Skip to content

Instantly share code, notes, and snippets.

@BenjaminFraser
Last active April 12, 2020 13:45
Show Gist options
  • Save BenjaminFraser/7d54d99c88d2cf9a38a90a276c0f43e3 to your computer and use it in GitHub Desktop.
Save BenjaminFraser/7d54d99c88d2cf9a38a90a276c0f43e3 to your computer and use it in GitHub Desktop.
Python implementation of a random forest model using Numpy
class RandomForest():
    """ Python implementation of a random forest regressor.

    Builds an ensemble of DecisionTree regressors on random row subsets
    (optionally bootstrapped) and predicts by averaging tree outputs.
    """

    def __init__(self, x, y, num_trees, sample_size, feature_proportion=1.0,
                 min_leaf=5, bootstrap=False, random_seed=12):
        """ Fit the forest immediately on construction.

        Args:
            x (pd.DataFrame): training feature matrix.
            y (np.ndarray): training target values, aligned with x rows.
            num_trees (int): number of decision trees in the ensemble.
            sample_size (int): rows sampled for each individual tree.
            feature_proportion (float): fraction of features each tree may use.
            min_leaf (int): minimum number of samples per leaf node.
            bootstrap (bool): if True, sample rows with replacement and
                track out-of-bag (OOB) rows per tree.
            random_seed (int): seed for reproducible sampling.
        """
        np.random.seed(random_seed)
        self.x = x
        self.y = y
        self.num_trees = num_trees
        self.sample_size = sample_size
        self.feature_proportion = feature_proportion
        self.min_leaf = min_leaf
        self.bootstrap = bootstrap
        self.trees = [self.create_tree(bootstrap) for i in range(num_trees)]

    def create_tree(self, bootstrap=False):
        """ Form an individual decision tree on a random row sample. """
        # obtain a random sample of indices and identify oob samples
        idxs = np.random.permutation(self.y.shape[0])[:self.sample_size]
        oob_idxs = None
        # if bootstrap chosen get bootstrap sample and oob indexes
        if bootstrap:
            idxs, oob_idxs = self.bootstrap_samples(idxs)
        return DecisionTree(self.x.iloc[idxs], self.y[idxs],
                            feat_proportion=self.feature_proportion,
                            idxs=np.array(range(self.sample_size)),
                            oob_idxs=oob_idxs,
                            min_leaf=self.min_leaf)

    def predict(self, x):
        """ Return the mean of predictions across all trees. """
        # call predict function from each Tree class
        return np.mean([t.predict(x) for t in self.trees], axis=0)

    def oob_score(self):
        """ Calculate and return each tree OOB R2 score and the average
            OOB score across all decision trees.

        Raises:
            ValueError: if the forest was built with bootstrap=False,
                in which case no OOB rows were ever recorded.
        """
        # without bootstrapping tree.oob_idxs is None and the indexing
        # below fails obscurely - fail fast with a clear message instead
        if not self.bootstrap:
            raise ValueError("OOB scoring requires bootstrap=True "
                             "when constructing the forest.")
        tree_oob_scores = []
        # find oob score for each tree and append to results
        for tree in self.trees:
            # find current tree oob predictions and labels
            tree_oob_labels = self.y[tree.oob_idxs]
            tree_oob_preds = tree.predict(self.x.iloc[tree.oob_idxs].values)
            # calculate R2 score for predictions on current tree
            tree_oob_r2 = r2_score(tree_oob_labels, tree_oob_preds)
            # add R2 score for oob predictions from this tree
            tree_oob_scores.append(tree_oob_r2)
        tree_oob_scores = np.array(tree_oob_scores)
        # find average oob scores across all trees
        avg_oob_score = np.mean(tree_oob_scores)
        return tree_oob_scores, avg_oob_score

    def bootstrap_samples(self, idxs):
        """ Return bootstrapped sample indices and out-of-bag indices
            drawn from the passed idxs array. """
        # take sample (with replacement) of positions into idxs
        sample_idxs = np.random.randint(0, len(idxs), size=self.sample_size)
        bootstrap_idxs = idxs[sample_idxs]
        # positions never drawn are out-of-bag; np.setdiff1d always yields
        # an int array (the previous list-comprehension produced a float64
        # empty array when no rows were OOB, which breaks fancy indexing)
        # and avoids O(n^2) membership tests against an array
        oob_i = np.setdiff1d(np.arange(self.sample_size), sample_idxs)
        oob_idxs = idxs[oob_i]
        return bootstrap_idxs, oob_idxs

    def feature_importances(self):
        """ Find the feature importances by shuffling each feature
            and finding the drop in score relative to baseline.

        Returns:
            pd.DataFrame: one 'Importance' column indexed by feature name,
                sorted descending (relative drop in R2 when shuffled).
        """
        # find baseline r2 score - all features will compare against this
        baseline_score = r2_score(self.y, self.predict(self.x.values))
        # dictionary to store feature importances
        feat_importances = {}
        columns = self.x.columns
        # iterate through each column, shuffle and get new score
        for feat_column in columns:
            # shuffle only current column; assign the permuted copy back
            # into the frame - shuffling `.values` in place is lost when
            # `.values` is a copy (e.g. for mixed-dtype DataFrames)
            temp_df = self.x.copy()
            temp_df[feat_column] = np.random.permutation(
                temp_df[feat_column].values)
            # find new R2 score with shuffled feature
            shuffled_score = r2_score(self.y, self.predict(temp_df.values))
            # calculate how much score has changed - this represents importance
            feat_score = (baseline_score - shuffled_score) / baseline_score
            # add to importance dict
            feat_importances[feat_column] = feat_score
        importance_df = pd.DataFrame.from_dict(feat_importances,
                                               orient='index',
                                               columns=['Importance'])
        return importance_df.sort_values('Importance', ascending=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment