Elías Snorrason elisno

## test_scores.csv

          
            test_scores_before
            test_scores_after

            
              0.682005
              0.33194867

            
              0.7028846
              0.36208245

            
              0.6047957
              0.2348125

            
              0.7025019
              0.3615146

            
              0.6364814
              0.2720337

            
              0.7082397
              0.3700882

            
              0.7867108
              0.5009551

            
              0.8309417
              0.58647346

            
              0.6733773
              0.319992

## notebooks---datalab.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                elisno
                / notebooks---datalab.ipynb
            
            
              Created
              March 7, 2023 17:16
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## 860497b2-7a68-4f2b-ac04-4ca9cf3dfd2f.py
# Plot two figures
plt.subplots(nrows=1, ncols=2, figsize=(20, 10))


# LEFT PLOT
# Plot scatter plot of umap embeddings with clusterer labels as colors

x_plot, y_plot = selected_outlier_subset_embeddings_umap[:, 0], selected_outlier_subset_embeddings_umap[:, 1]
plt.subplot(1, 2, 1)
for i, topic in enumerate(np.unique(cluster_labels)):

## ec3b107a-2c3c-4aa2-b493-343ff2de8130.py
n = 7

top_idx = np.argpartition(tf_idf, -n)[:, -n:]
vlist = list(vocab)
for c, _class in enumerate(classes.keys()):
    topn_idx = top_idx[c, :]
    topn_terms = [vlist[idx] for idx in topn_idx]
    if _class != -1:
        print(f"Topic class {_class}: {topn_terms}")
    else:

## e2bd6f74-3375-4bd5-99b9-d4d650abb8bd.py
###### Create documents from sentence pairs

# Get combined text from the selected outliers
# Joining the premise and hypothesis together
def join_sentence_pair(example):
    """Attach a ``"docs"`` entry holding each premise joined to its hypothesis.

    ``example["premise"]`` and ``example["hypothesis"]`` are walked in
    lockstep and every pair is concatenated with a single space in between.
    The resulting list is stored under ``example["docs"]``; the same
    ``example`` object is modified in place and returned.
    """
    example["docs"] = [
        first + " " + second
        for first, second in zip(example["premise"], example["hypothesis"])
    ]
    return example

## bd70c935-47b9-4085-a7e8-543b892cd8fe.py
clusterer = hdbscan.HDBSCAN(min_cluster_size=6, min_samples=4)
clusterer.fit(selected_outlier_subset_embeddings_umap)
cluster_labels = clusterer.labels_

clusterer.condensed_tree_.plot(select_clusters=True)

# plot each set of points in a different color
plt.figure(figsize=(10, 10))
for i in np.unique(cluster_labels):
    if i != -1:

## cd5acf45-bd48-484c-aa07-2420fc037cb1.py
# Pull out the feature embeddings that belong to the selected outliers
selected_outlier_subset_embeddings = test_feature_embeddings[outlier_ids]

# Project those embeddings down to two UMAP components
umap_fit = UMAP(n_components=2, n_neighbors=8, random_state=SEED)
selected_outlier_subset_embeddings_umap = umap_fit.fit_transform(
    selected_outlier_subset_embeddings
)

# Integer ids used as plot labels: 0-4 for the mismatched names,
# continuing with 5-9 for the matched names
mismatched_labels = {
    name: idx
    for idx, name in enumerate(
        ["nineeleven", "facetoface", "letters", "oup", "verbatim"]
    )
}
matched_labels = {
    name: idx
    for idx, name in enumerate(
        ["fiction", "government", "slate", "telephone", "travel"],
        start=len(mismatched_labels),
    )
}

## 3c225fb6-d82d-4689-b5b6-3e2f2e8c3c5d.py
# Select the test examples whose outlier scores are below the threshold

# Sort the test indices by outlier score and reorder the scores to match
sorted_ids = test_outlier_scores.argsort()
outlier_scores = test_outlier_scores[sorted_ids]

# Keep only the indices whose score is strictly under the threshold
is_below_threshold = outlier_scores < threshold
outlier_ids = sorted_ids[is_below_threshold]

# Build that subset and display its last 15 rows
selected_outlier_subset = test_data.select(outlier_ids)
selected_outlier_subset.to_pandas().tail(15)


## 5c7fd272-e25b-495e-8da8-af4599a39855.py
# Take the 2.5th percentile of the outlier scores in the training data as the
# threshold. NOTE: this previously read `test_outlier_scores`, which
# contradicted the stated intent; the cut-off now comes from the training
# scores.
threshold = np.percentile(train_outlier_scores, 2.5)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# Histogram range spanning both the train and the test scores
plt_range = [
    min(train_outlier_scores.min(), test_outlier_scores.min()),
    max(train_outlier_scores.max(), test_outlier_scores.max()),
]

# Left panel: training score distribution with the threshold marked in red
axes[0].hist(train_outlier_scores, range=plt_range, bins=50)
axes[0].set(title='train_outlier_scores distribution', ylabel='Frequency')
axes[0].axvline(x=threshold, color='red', linewidth=2)

## dfe14ba7-028d-405d-90a7-31fb526d3aa4.py
# Visualize the 20 least severe outliers in the test data, i.e. the examples
# with the highest outlier scores (negating the scores makes argsort list the
# largest ones first). The earlier comment said 15, but the slice takes 20.
num_least_severe = 20
bottom_outlier_idxs = (-test_outlier_scores).argsort()[:num_least_severe]
bottom_outlier_subset = test_data.select(bottom_outlier_idxs)
bottom_outlier_subset.to_pandas()
	test_scores_before	test_scores_after
	0.682005	0.33194867
	0.7028846	0.36208245
	0.6047957	0.2348125
	0.7025019	0.3615146
	0.6364814	0.2720337
	0.7082397	0.3700882
	0.7867108	0.5009551
	0.8309417	0.58647346
	0.6733773	0.319992
	# Plot two figures
	plt.subplots(nrows=1, ncols=2, figsize=(20, 10))


	# LEFT PLOT
	# Plot scatter plot of umap embeddings with clusterer labels as colors

	x_plot, y_plot = selected_outlier_subset_embeddings_umap[:, 0], selected_outlier_subset_embeddings_umap[:, 1]
	plt.subplot(1, 2, 1)
	for i, topic in enumerate(np.unique(cluster_labels)):
	n = 7

	top_idx = np.argpartition(tf_idf, -n)[:, -n:]
	vlist = list(vocab)
	for c, _class in enumerate(classes.keys()):
	topn_idx = top_idx[c, :]
	topn_terms = [vlist[idx] for idx in topn_idx]
	if _class != -1:
	print(f"Topic class {_class}: {topn_terms}")
	else:
	###### Create documents from sentence pairs

	# Get combined text from the selected outliers
	# Joining the premise and hypothesis together
	def join_sentence_pair(example):
		"""Attach a ``"docs"`` entry holding each premise joined to its hypothesis.

		``example["premise"]`` and ``example["hypothesis"]`` are walked in
		lockstep and every pair is concatenated with a single space in
		between. The list is stored under ``example["docs"]``; the same
		``example`` object is modified in place and returned.
		"""
		docs = []
		for premise, hypothesis in zip(example["premise"], example["hypothesis"]):
			docs.append(premise + " " + hypothesis)
		example["docs"] = docs
		return example
	clusterer = hdbscan.HDBSCAN(min_cluster_size=6, min_samples=4)
	clusterer.fit(selected_outlier_subset_embeddings_umap)
	cluster_labels = clusterer.labels_

	clusterer.condensed_tree_.plot(select_clusters=True)

	# plot each set of points in a different color
	plt.figure(figsize=(10, 10))
	for i in np.unique(cluster_labels):
	if i != -1:
	# Pull out the feature embeddings that belong to the selected outliers
	selected_outlier_subset_embeddings = test_feature_embeddings[outlier_ids]

	# Project those embeddings down to two UMAP components
	umap_fit = UMAP(n_components=2, n_neighbors=8, random_state=SEED)
	selected_outlier_subset_embeddings_umap = umap_fit.fit_transform(
		selected_outlier_subset_embeddings
	)

	# Integer ids used as plot labels: 0-4 for the mismatched names,
	# continuing with 5-9 for the matched names
	mismatched_labels = {
		name: idx
		for idx, name in enumerate(
			["nineeleven", "facetoface", "letters", "oup", "verbatim"]
		)
	}
	matched_labels = {
		name: idx
		for idx, name in enumerate(
			["fiction", "government", "slate", "telephone", "travel"],
			start=len(mismatched_labels),
		)
	}
	# Select the test examples whose outlier scores are below the threshold

	# Sort the test indices by outlier score and reorder the scores to match
	sorted_ids = test_outlier_scores.argsort()
	outlier_scores = test_outlier_scores[sorted_ids]

	# Keep only the indices whose score is strictly under the threshold
	is_below_threshold = outlier_scores < threshold
	outlier_ids = sorted_ids[is_below_threshold]

	# Build that subset and display its last 15 rows
	selected_outlier_subset = test_data.select(outlier_ids)
	selected_outlier_subset.to_pandas().tail(15)
	# Take the 2.5th percentile of the outlier scores in the training data as
	# the threshold. NOTE: this previously read `test_outlier_scores`, which
	# contradicted the stated intent; the cut-off now comes from the training
	# scores.
	threshold = np.percentile(train_outlier_scores, 2.5)

	fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

	# Histogram range spanning both the train and the test scores
	plt_range = [
		min(train_outlier_scores.min(), test_outlier_scores.min()),
		max(train_outlier_scores.max(), test_outlier_scores.max()),
	]

	# Left panel: training score distribution with the threshold marked in red
	axes[0].hist(train_outlier_scores, range=plt_range, bins=50)
	axes[0].set(title='train_outlier_scores distribution', ylabel='Frequency')
	axes[0].axvline(x=threshold, color='red', linewidth=2)
	# Visualize the 20 least severe outliers in the test data, i.e. the
	# examples with the highest outlier scores (negating the scores makes
	# argsort list the largest ones first). The earlier comment said 15, but
	# the slice takes 20.
	num_least_severe = 20
	bottom_outlier_idxs = (-test_outlier_scores).argsort()[:num_least_severe]
	bottom_outlier_subset = test_data.select(bottom_outlier_idxs)
	bottom_outlier_subset.to_pandas()