# ychennay/rf_proximity.py

## rf_proximity.py
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Number of trees in the forest; also the normaliser for the shared-leaf counts below.
B = 500
rf = RandomForestClassifier(n_estimators=B).fit(X, y)

# rf.apply(X) sends every data point down every fitted tree and reports the
# index of the terminal leaf it ends in: an N x B array (one row per data
# point, one column per tree).
final_positions = rf.apply(X)

# Proximity of two data points = fraction of trees in which they share a leaf.
# Approach adapted from:
# https://stackoverflow.com/questions/18703136/proximity-matrix-in-sklearn-ensemble-randomforestclassifier
n_samples = len(X)
proximity_matrix = np.zeros((n_samples, n_samples))  # N x N accumulator
for tree_idx in range(B):
    leaf_ids = final_positions[:, tree_idx]
    # Comparing the leaf ids as a column against themselves as a row gives the
    # N x N "same leaf in this tree?" mask.
    proximity_matrix += (leaf_ids[:, None] == leaf_ids[None, :]).astype(float)
proximity_matrix /= B  # shared-leaf counts -> fraction of trees

# Distance is the complement of proximity; both axes are labelled with the ad ids.
distance_matrix = pd.DataFrame(1 - proximity_matrix, index=ad_ids, columns=ad_ids)

# Flatten the N x N distance matrix into one row per (Ad 1, Ad 2) pair so the
# pairs can be ranked by distance.
distances_stacked = distance_matrix.stack().reset_index()
distances_stacked.columns = ['Ad 1', 'Ad 2', 'Distance']
# Drop only the self-pairs (we don't care that an ad is close to itself).
# Comparing the ids, rather than testing Distance > 0, keeps distinct ads that
# land in the same leaf in every tree -- those have distance 0 and are the
# closest pairs of all, so filtering on the distance would hide them.
distances_stacked = distances_stacked[
    distances_stacked['Ad 1'] != distances_stacked['Ad 2']
].sort_values(by=['Distance'])
# Take the ids (every column except 'Distance') of the two closest rows.
# NOTE(review): the stacked matrix holds each pair in both orders (A, B) and
# (B, A), so these two rows will often be the same pair twice -- confirm that
# a single distinct pair is what is wanted here.
closest_ad_ids = distances_stacked.head(2).values[:, :-1]
# Notebook-style display of the raw rows for those ads.
raw_df[raw_df["ad_id"].isin(closest_ad_ids.flatten())]
	# Random-forest proximity analysis: fit the forest, turn shared-leaf counts into pairwise distances, then list the closest ads.
from sklearn.ensemble import RandomForestClassifier

B = 500  # number of trees; also normalises the shared-leaf counts below
rf = RandomForestClassifier(n_estimators=B)
rf.fit(X, y)
# apply() runs each data point through every fitted tree and records the index
# of the terminal leaf node it ends in.
final_positions = rf.apply(X)  # N x 500 (N rows, one per data point; 500 columns, one per tree)
proximity_matrix = np.zeros((len(X), len(X)))  # proximity matrix is N x N
# Adapted from the implementation found here:
# https://stackoverflow.com/questions/18703136/proximity-matrix-in-sklearn-ensemble-randomforestclassifier
for tree_idx in range(B):
    # N x N mask: 1.0 where two data points fall in the same leaf of this tree.
    proximity_matrix += np.equal.outer(final_positions[:, tree_idx],
                                       final_positions[:, tree_idx]).astype(float)
# divide by the number of estimators so each entry is the fraction of trees
# in which the two data points share a leaf
proximity_matrix /= B
# convert to a distance matrix dataframe labelled by ad id on both axes
distance_matrix = pd.DataFrame(1 - proximity_matrix, columns=ad_ids, index=ad_ids)

# stack the matrix so we can see the closest ad ids to each other
distances_stacked = distance_matrix.stack().reset_index()
# set column names
distances_stacked.columns = ['Ad 1', 'Ad 2', 'Distance']
# Drop only the self-pairs (we don't care that an ad is close to itself).
# Comparing the ids, rather than testing Distance > 0, keeps distinct ads that
# land in the same leaf in every tree -- those have distance 0 and are the
# closest pairs of all.
distances_stacked = distances_stacked[
    distances_stacked['Ad 1'] != distances_stacked['Ad 2']
].sort_values(by=['Distance'])
# Take the ids (every column except 'Distance') of the two closest rows.
# NOTE(review): each pair is present in both orders (A, B) and (B, A), so these
# two rows will often be the same pair twice -- confirm that is intended.
closest_ad_ids = distances_stacked.head(2).values[:, :-1]
# Notebook-style display of the raw rows for those ads.
raw_df[raw_df["ad_id"].isin(closest_ad_ids.flatten())]