This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| test_scores_before | test_scores_after |
|---|---|
| 0.682005 | 0.33194867 |
| 0.7028846 | 0.36208245 |
| 0.6047957 | 0.2348125 |
| 0.7025019 | 0.3615146 |
| 0.6364814 | 0.2720337 |
| 0.7082397 | 0.3700882 |
| 0.7867108 | 0.5009551 |
| 0.8309417 | 0.58647346 |
| 0.6733773 | 0.319992 |
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Plot two figures | |
plt.subplots(nrows=1, ncols=2, figsize=(20, 10)) | |
# LEFT PLOT | |
# Plot scatter plot of umap embeddings with clusterer labels as colors | |
x_plot, y_plot = selected_outlier_subset_embeddings_umap[:, 0], selected_outlier_subset_embeddings_umap[:, 1] | |
plt.subplot(1, 2, 1) | |
for i, topic in enumerate(np.unique(cluster_labels)): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
n = 7 | |
top_idx = np.argpartition(tf_idf, -n)[:, -n:] | |
vlist = list(vocab) | |
for c, _class in enumerate(classes.keys()): | |
topn_idx = top_idx[c, :] | |
topn_terms = [vlist[idx] for idx in topn_idx] | |
if _class != -1: | |
print(f"Topic class {_class}: {topn_terms}") | |
else: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###### Create documents from sentence pairs | |
# Get combined text from the selected outliers | |
# Joining the premise and hypothesis together | |
def join_sentence_pair(example): | |
docs = [] | |
for premise, hypothesis in zip(example["premise"], example["hypothesis"]): | |
docs.append(premise + " " + hypothesis) | |
example["docs"] = docs | |
return example |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
clusterer = hdbscan.HDBSCAN(min_cluster_size=6, min_samples=4) | |
clusterer.fit(selected_outlier_subset_embeddings_umap) | |
cluster_labels = clusterer.labels_ | |
clusterer.condensed_tree_.plot(select_clusters=True) | |
# plot each set of points in a different color | |
plt.figure(figsize=(10, 10)) | |
for i in np.unique(cluster_labels): | |
if i != -1: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get embeddings of selected outliers | |
selected_outlier_subset_embeddings = test_feature_embeddings[outlier_ids] | |
# Reduce dimensionality with UMAP | |
umap_fit = UMAP(n_components=2, n_neighbors=8, random_state=SEED) | |
selected_outlier_subset_embeddings_umap = umap_fit.fit_transform(selected_outlier_subset_embeddings) | |
# Set plot labels | |
mismatched_labels = {"nineeleven": 0, "facetoface": 1, "letters": 2, "oup": 3, "verbatim": 4} | |
matched_labels = {"fiction": 5, "government": 6, "slate": 7, "telephone": 8, "travel": 9} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get embeddings of test examples whose outlier scores are below the threshold | |
sorted_ids = test_outlier_scores.argsort() | |
outlier_scores = test_outlier_scores[sorted_ids] | |
outlier_ids = sorted_ids[outlier_scores < threshold] | |
selected_outlier_subset = test_data.select(outlier_ids) | |
selected_outlier_subset.to_pandas().tail(15) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Take the 2.5th percentile of the outlier scores in the training data as the threshold | |
threshold = np.percentile(test_outlier_scores, 2.5) | |
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5)) | |
plt_range = [min(train_outlier_scores.min(),test_outlier_scores.min()), \ | |
max(train_outlier_scores.max(),test_outlier_scores.max())] | |
axes[0].hist(train_outlier_scores, range=plt_range, bins=50) | |
axes[0].set(title='train_outlier_scores distribution', ylabel='Frequency') | |
axes[0].axvline(x=threshold, color='red', linewidth=2) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Visualize 15 least severe outliers in test data | |
bottom_outlier_idxs = (-test_outlier_scores).argsort()[:20] | |
bottom_outlier_subset = test_data.select(bottom_outlier_idxs) | |
bottom_outlier_subset.to_pandas() |
NewerOlder