Skip to content

Instantly share code, notes, and snippets.

@elisno
elisno / test_scores.csv
Last active January 15, 2024 17:19
OOD feature-score rescaling results
test_scores_before test_scores_after
0.682005 0.33194867
0.7028846 0.36208245
0.6047957 0.2348125
0.7025019 0.3615146
0.6364814 0.2720337
0.7082397 0.3700882
0.7867108 0.5009551
0.8309417 0.58647346
0.6733773 0.319992
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@elisno
elisno / 860497b2-7a68-4f2b-ac04-4ca9cf3dfd2f.py
Created September 30, 2022 16:02
understanding_outliers_in_text_data_with_transformers,_cleanlab,_and_topic_modeling14
# Plot two figures
plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
# LEFT PLOT
# Plot scatter plot of umap embeddings with clusterer labels as colors
x_plot, y_plot = selected_outlier_subset_embeddings_umap[:, 0], selected_outlier_subset_embeddings_umap[:, 1]
plt.subplot(1, 2, 1)
for i, topic in enumerate(np.unique(cluster_labels)):
@elisno
elisno / ec3b107a-2c3c-4aa2-b493-343ff2de8130.py
Created September 30, 2022 16:02
understanding_outliers_in_text_data_with_transformers,_cleanlab,_and_topic_modeling13
n = 7
top_idx = np.argpartition(tf_idf, -n)[:, -n:]
vlist = list(vocab)
for c, _class in enumerate(classes.keys()):
topn_idx = top_idx[c, :]
topn_terms = [vlist[idx] for idx in topn_idx]
if _class != -1:
print(f"Topic class {_class}: {topn_terms}")
else:
@elisno
elisno / e2bd6f74-3375-4bd5-99b9-d4d650abb8bd.py
Created September 30, 2022 16:02
understanding_outliers_in_text_data_with_transformers,_cleanlab,_and_topic_modeling12
###### Create documents from sentence pairs
# Get combined text from the selected outliers
# Joining the premise and hypothesis together
def join_sentence_pair(example):
    """Add a ``docs`` entry that joins each premise with its hypothesis.

    Args:
        example: Mapping with ``"premise"`` and ``"hypothesis"`` entries,
            each an iterable of strings that are paired up positionally.
            NOTE(review): presumably a batch passed by a batched
            ``Dataset.map`` call -- confirm against the caller.

    Returns:
        The same ``example`` object (mutated in place) with
        ``example["docs"]`` set to a list of ``"<premise> <hypothesis>"``
        strings, one per pair.
    """
    # zip() stops at the shorter of the two columns, as in the original loop.
    example["docs"] = [
        premise + " " + hypothesis
        for premise, hypothesis in zip(example["premise"], example["hypothesis"])
    ]
    return example
@elisno
elisno / bd70c935-47b9-4085-a7e8-543b892cd8fe.py
Created September 30, 2022 16:02
understanding_outliers_in_text_data_with_transformers,_cleanlab,_and_topic_modeling11
clusterer = hdbscan.HDBSCAN(min_cluster_size=6, min_samples=4)
clusterer.fit(selected_outlier_subset_embeddings_umap)
cluster_labels = clusterer.labels_
clusterer.condensed_tree_.plot(select_clusters=True)
# plot each set of points in a different color
plt.figure(figsize=(10, 10))
for i in np.unique(cluster_labels):
if i != -1:
@elisno
elisno / cd5acf45-bd48-484c-aa07-2420fc037cb1.py
Created September 30, 2022 16:02
understanding_outliers_in_text_data_with_transformers,_cleanlab,_and_topic_modeling10
# Get embeddings of selected outliers
# (rows of test_feature_embeddings indexed by outlier_ids, which is defined elsewhere)
selected_outlier_subset_embeddings = test_feature_embeddings[outlier_ids]
# Reduce dimensionality with UMAP
# Project down to 2 components; random_state is pinned to the SEED constant defined elsewhere.
umap_fit = UMAP(n_components=2, n_neighbors=8, random_state=SEED)
selected_outlier_subset_embeddings_umap = umap_fit.fit_transform(selected_outlier_subset_embeddings)
# Set plot labels
# Name -> integer id lookups: ids 0-4 for the "mismatched" names, 5-9 for the "matched" names.
# NOTE(review): presumably these are the dataset's genre names -- confirm against the data.
mismatched_labels = {"nineeleven": 0, "facetoface": 1, "letters": 2, "oup": 3, "verbatim": 4}
matched_labels = {"fiction": 5, "government": 6, "slate": 7, "telephone": 8, "travel": 9}
@elisno
elisno / 3c225fb6-d82d-4689-b5b6-3e2f2e8c3c5d.py
Created September 30, 2022 16:02
understanding_outliers_in_text_data_with_transformers,_cleanlab,_and_topic_modeling9
# Select the test examples whose outlier scores are below the threshold.
# (This cell picks rows of test_data; it does not look up embeddings.)
# Indices that order test_outlier_scores (assumes a NumPy array, so ascending -- TODO confirm)
sorted_ids = test_outlier_scores.argsort()
# Scores rearranged into that same order
outlier_scores = test_outlier_scores[sorted_ids]
# Keep only the indices whose score is strictly below the threshold (defined elsewhere)
outlier_ids = sorted_ids[outlier_scores < threshold]
selected_outlier_subset = test_data.select(outlier_ids)
# Show the last 15 rows of the selection; with ascending order these are the
# selected examples whose scores sit closest to the threshold.
selected_outlier_subset.to_pandas().tail(15)
@elisno
elisno / 5c7fd272-e25b-495e-8da8-af4599a39855.py
Created September 30, 2022 16:02
understanding_outliers_in_text_data_with_transformers,_cleanlab,_and_topic_modeling8
# Take the 2.5th percentile of the outlier scores as the threshold.
# NOTE(review): the original comment said the percentile is taken over the
# *training* data, but the code below uses test_outlier_scores -- confirm
# which distribution is intended before relying on this threshold.
threshold = np.percentile(test_outlier_scores, 2.5)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
# Shared [min, max] range across train and test scores, passed as the histogram range.
plt_range = [min(train_outlier_scores.min(),test_outlier_scores.min()), \
max(train_outlier_scores.max(),test_outlier_scores.max())]
# Left panel: training-score histogram with the threshold drawn as a red vertical line.
axes[0].hist(train_outlier_scores, range=plt_range, bins=50)
axes[0].set(title='train_outlier_scores distribution', ylabel='Frequency')
axes[0].axvline(x=threshold, color='red', linewidth=2)
@elisno
elisno / dfe14ba7-028d-405d-90a7-31fb526d3aa4.py
Created September 30, 2022 16:02
understanding_outliers_in_text_data_with_transformers,_cleanlab,_and_topic_modeling7
# Visualize the 20 least severe outliers in test data
# NOTE(review): the original comment said 15, but the slice below keeps 20 -- confirm the intended count.
# Sorting the negated scores puts the highest outlier scores first.
bottom_outlier_idxs = (-test_outlier_scores).argsort()[:20]
bottom_outlier_subset = test_data.select(bottom_outlier_idxs)
# Last expression of the cell: the selection as a pandas DataFrame.
bottom_outlier_subset.to_pandas()