Last active
May 20, 2019 02:04
-
-
Save ychennay/5bceb4e814239222c0082839107043e7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
# Train a random forest with B trees.
# NOTE: X (features) and y (labels) are expected to already be defined by the
# surrounding session; they are not created in this snippet.
B = 500
rf = RandomForestClassifier(n_estimators=B)
rf.fit(X, y)

# apply() runs each data point through every tree of the forest and records the
# index of the terminal leaf it ends up in: one row per data point, one column
# per tree (N x B).
final_positions = rf.apply(X)
n_samples, n_trees = final_positions.shape

# Proximity of two data points = fraction of trees in which both land in the
# same terminal leaf. Implementation adapted from:
# https://stackoverflow.com/questions/18703136/proximity-matrix-in-sklearn-ensemble-randomforestclassifier
proximity_matrix = np.zeros((n_samples, n_samples))  # proximity matrix is N x N
for tree_leaves in final_positions.T:
    # Pairwise "same leaf?" table for this tree. The boolean result is added
    # directly into the float accumulator, so no per-tree float copy is made.
    proximity_matrix += np.equal.outer(tree_leaves, tree_leaves)

# divide by the number of trees to normalize (values end up in [0, 1])
proximity_matrix /= n_trees
# Convert proximity (1 = always share a leaf) into a distance (0 = always share
# a leaf), labelled by ad id on both axes.
# NOTE: ad_ids is expected to be defined by the surrounding session, in the same
# order as the rows used to build proximity_matrix.
distance_matrix = pd.DataFrame(1 - proximity_matrix, columns=ad_ids, index=ad_ids)

# List every unordered pair of ads exactly once by taking the strict upper
# triangle (k=1) of the symmetric distance matrix. Selecting by position:
#   * drops the diagonal (an ad compared with itself) without a "Distance > 0"
#     filter, which would also discard distinct ads that always share a leaf;
#   * avoids listing each pair twice as (A, B) and (B, A), which made the
#     "top 2" below come back as a single pair plus its mirror image.
row_idx, col_idx = np.triu_indices(len(distance_matrix), k=1)
ad_labels = distance_matrix.index.to_numpy()
distances_stacked = pd.DataFrame({
    'Ad 1': ad_labels[row_idx],
    'Ad 2': ad_labels[col_idx],
    'Distance': distance_matrix.to_numpy()[row_idx, col_idx],
})

# Closest pairs first; mergesort is stable, so ties keep a deterministic order.
distances_stacked = distances_stacked.sort_values(by='Distance', kind='mergesort')

# get the top few closest pairs of ads to each other (one row per pair)
closest_ad_ids = distances_stacked.head(2)[['Ad 1', 'Ad 2']].to_numpy()

# Show the raw records for every ad that appears in one of the closest pairs.
# NOTE: raw_df is expected to be defined by the surrounding session and to have
# an "ad_id" column.
raw_df[raw_df["ad_id"].isin(closest_ad_ids.flatten())]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment