@thistleknot
Last active December 5, 2023 07:18
tensorboard with ema embeddings
#!/usr/bin/env python
# coding: utf-8
import lade
from transformers import AutoTokenizer, AutoModel
import torch
lade.augment_all()
lade.config_lade(LEVEL=5, WINDOW_SIZE=7, GUESS_SET_SIZE=7, DEBUG=0)

import os
import random
import re
import warnings
from itertools import product, combinations

import numpy as np
import pandas as pd
import dask
from dask.distributed import Client, as_completed
from tqdm import tqdm

from datasets import load_dataset
from transformers import BitsAndBytesConfig, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

from torch.utils.tensorboard import SummaryWriter
import tensorflow as tf
from tensorboard.plugins import projector

import umap
from sklearn.model_selection import KFold
from sklearn.metrics import silhouette_score, pairwise_distances, pairwise_distances_argmin_min
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display

import nltk
#nltk.download('averaged_perceptron_tagger')
sample_size = 1000
batch_size = 500
sequence_length = 128
stride_length = 64
def filter_out_pos_tags_stopwords_contractions(texts, disallowed_postags=['IN', 'DT', 'CC', 'TO']):
    # Relies on the module-level `stop_words` list defined further below (before this function is called).
    additional_words_to_remove = ['we', 'be', 'you']
    filtered_texts = []
    for text in texts:
        doc = nltk.pos_tag(nltk.word_tokenize(text))
        filtered_words = []
        for word, tag in doc:
            if tag not in disallowed_postags and word.lower() not in stop_words and "'" not in word and word.lower() not in additional_words_to_remove:
                filtered_words.append(word)
        filtered_texts.append(" ".join(filtered_words))
    return filtered_texts
#improved version that does batching
def calculate_sentence_statistics_batched(model, tokenizer, texts, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Map each text to its strides
    text_to_strides = {}
    all_strides = []
    for text in texts:
        tokens = tokenizer.encode(text)
        text_strides = []
        for i in range(0, len(tokens), stride_length):
            sequence_tokens = tokens[i:i + sequence_length]
            if len(sequence_tokens) < sequence_length:
                sequence_tokens += [tokenizer.pad_token_id] * (sequence_length - len(sequence_tokens))
            stride = tokenizer.decode(sequence_tokens)
            text_strides.append(stride)
            all_strides.append(stride)
        text_to_strides[text] = text_strides
    # Process strides in batches and collect embeddings
    embeddings = []
    for i in tqdm(range(0, len(all_strides), batch_size), position=0, leave=True):
        batch_strides = all_strides[i:i + batch_size]
        encoded_input = tokenizer(batch_strides, padding=True, truncation=True, max_length=sequence_length, return_tensors='pt').to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)
        batch_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).cpu().numpy()
        attention_masks = encoded_input['attention_mask'].cpu().numpy()
        embeddings.extend(zip(batch_embeddings, attention_masks))
    # Associate embeddings back to texts: collapse each text's stride embeddings into a single
    # vector with an exponential moving average (alpha = 2 / (N + 1)), down-weighting the final,
    # partially padded stride by its fraction of real tokens.
    offset = 0
    sentence_embeddings_dict = {}
    for text, strides in text_to_strides.items():
        stride_embeddings_with_masks = embeddings[offset:offset + len(strides)]
        offset += len(strides)
        if stride_embeddings_with_masks:
            ema_embedding, _ = stride_embeddings_with_masks[0]
            alpha = 2.0 / (len(stride_embeddings_with_masks) + 1)
            for i, (embedding, mask) in enumerate(stride_embeddings_with_masks[1:]):
                if i == len(stride_embeddings_with_masks) - 2:  # Check if it's the last stride
                    weight = mask.sum() / sequence_length
                    adjusted_alpha = alpha * weight
                else:
                    adjusted_alpha = alpha
                ema_embedding = adjusted_alpha * embedding + (1 - adjusted_alpha) * ema_embedding
            sentence_embeddings_dict[text] = ema_embedding
    return sentence_embeddings_dict
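# Illustrative sketch (not part of the original pipeline): how the EMA above collapses
# per-stride embeddings into one vector. The toy vectors and the `toy_*` names are
# hypothetical and exist only to show the weighting rule.
toy_strides = [np.ones(4) * v for v in (1.0, 2.0, 3.0)]  # stand-ins for stride embeddings
toy_alpha = 2.0 / (len(toy_strides) + 1)                 # same alpha = 2/(N+1) rule as above
toy_ema = toy_strides[0]
for toy_emb in toy_strides[1:]:
    # Each newer stride contributes with weight alpha; earlier strides decay by (1 - alpha)
    toy_ema = toy_alpha * toy_emb + (1 - toy_alpha) * toy_ema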
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask
def apply_umap(embeddings, n_neighbors, min_dist):
    reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist)
    reduced_embeddings = reducer.fit_transform(embeddings)
    return reduced_embeddings
@dask.delayed
def process_single_combination(embeddings, params):
    n_neighbors, min_dist, n_clusters = params
    reduced_embeddings = apply_umap(embeddings, n_neighbors, min_dist)
    #bss_tss_ratio = calculate_bss_tss_ratio(reduced_embeddings, n_clusters)
    bss_tss_ratio = calculate_bss_tss_ratio_approx(reduced_embeddings, n_clusters)
    return bss_tss_ratio, params
def calculate_bss_tss_ratio_approx(embeddings, n_clusters, batch_size=batch_size):
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, random_state=42, init='k-means++', n_init='auto')
    labels = kmeans.fit_predict(embeddings)
    # Calculate the Total Sum of Squares
    centroid = np.mean(embeddings, axis=0)
    tss = np.sum((embeddings - centroid) ** 2)
    # Calculate the Between Cluster Sum of Squares
    bss = sum(np.sum((embeddings[labels == i] - kmeans.cluster_centers_[i]) ** 2) for i in range(n_clusters))
    # Ratio of BSS to TSS
    bss_tss_ratio = bss / tss if tss > 0 else 0
    return bss_tss_ratio
def cross_validation(embeddings, parameter_combinations, outer_k=5):
    warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.cluster._kmeans")
    outer_cv = KFold(n_splits=outer_k, shuffle=True, random_state=42)
    best_score = -1
    best_params = None
    delayed_results = []
    for train_idx, _ in outer_cv.split(embeddings):
        train_embeddings = [embeddings[i] for i in train_idx]
        for params in parameter_combinations:
            # Create a delayed task for each parameter combination
            delayed_result = process_single_combination(train_embeddings, params)
            delayed_results.append(delayed_result)
    # Compute all tasks in parallel
    results = dask.compute(*delayed_results)
    # Find the best parameter combination
    for score, params in results:
        if score > best_score:
            best_score = score
            best_params = params
    return best_params, best_score
def find_representative_labels(kmeans_model, labels, embeddings):
    warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.cluster._kmeans")
    centroids = kmeans_model.cluster_centers_
    # Find the closest point to each centroid
    closest, _ = pairwise_distances_argmin_min(centroids, embeddings)
    # Map these points to their corresponding labels
    representative_labels = [labels[idx] for idx in closest]
    return representative_labels
# Function to update the scatterplot based on the selected cluster
def update_scatterplot(selected_cluster):
    plt.figure(figsize=(8, 6))
    plt.scatter(cluster_df.iloc[:, 0], cluster_df.iloc[:, 1], c=cluster_df['Cluster'], cmap='viridis')
    plt.xlabel('Column 0')
    plt.ylabel('Column 1')
    plt.colorbar(label='Cluster')
    plt.title(f'Scatterplot with Cluster {selected_cluster} Highlighted', fontsize=14)
    # Highlight the selected cluster
    selected_cluster_indices = cluster_df.loc[cluster_df['Cluster'] == selected_cluster]
    plt.scatter(selected_cluster_indices.iloc[:, 0], selected_cluster_indices.iloc[:, 1], c='red', label=f'Cluster {selected_cluster}')
    plt.legend()
    plt.show()
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
dataset = load_dataset("Abirate/english_quotes")
quotes = [item['quote'] for item in dataset['train']]
sentence_statistics = calculate_sentence_statistics_batched(model, tokenizer, quotes)
embeddings = [sentence_statistics[k] for k in list(sentence_statistics.keys())]
# Convert to tensor
embeddings_tensor = torch.tensor(embeddings)
# Metadata labels for each sentence
metadata_labels = list(sentence_statistics.keys())
# TensorBoard Logging
# Ensure you're using the reshaped tensor
reshaped_embeddings = embeddings_tensor.squeeze(1)
# If adding embeddings multiple times, use a unique global_step each time
# For example:
writer = SummaryWriter('./')
global_step = 0 # Increment this appropriately in your actual code
# Use the reshaped tensor in the call
writer.add_embedding(reshaped_embeddings, metadata=metadata_labels, global_step=global_step)
# Close the writer when done
writer.close()
#run in shell
#!tensorboard --logdir=./ --host 0.0.0.0
labels_embeddings = dict(zip(metadata_labels, embeddings_tensor))
random_sample = dict(random.sample(list(labels_embeddings.items()), sample_size))
labels_embeddings_sample = [[key, random_sample[key]] for key in random_sample.keys()]
dict_labels_embeddings_sample = {}
for e in labels_embeddings_sample:
    # Use the update method to add key-value pairs to the new dictionary
    dict_labels_embeddings_sample.update({e[0]: e[1]})
embeddings_sample = [dict_labels_embeddings_sample[k] for k in list(dict_labels_embeddings_sample.keys())]
labels_sample = [k for k in list(dict_labels_embeddings_sample.keys())]
# Assuming embeddings_sample is a list of PyTorch tensors
embeddings_sample_numpy = [e.numpy() for e in embeddings_sample]
# Now convert the list of NumPy arrays to a single NumPy array
embeddings_array = np.array(embeddings_sample_numpy)
# Define the hyperparameter space
if(False):
    n_neighbors_values = [5, 8, 13, 21, 34, 55, 89]
    min_dist_values = [.05, .08, .13, .21, .34, .55, .89]
    n_clusters_values = [5, 8, 13, 21, 34, 55, 89]
else:
    n_neighbors_values = [8, 13, 21, 34, 55]
    min_dist_values = [.13, .21, .34, .55, .89]
    n_clusters_values = [13, 21, 34, 55, 89]
# Generate the parameter grid
parameter_grid = list(product(n_neighbors_values, min_dist_values, n_clusters_values))
# Randomly sample 125 combinations (the full 5 x 5 x 5 grid, in random order)
sampled_combinations = random.sample(parameter_grid, 125)
# Initialize a Dask client
client = Client(n_workers=8)
# Run cross-validation over the sampled UMAP/KMeans parameter combinations
best_params, best_score = cross_validation(embeddings_array, sampled_combinations)
client.close()
# In[261]:
print(best_params)
# Function to filter out specific POS tags and stop words
def filter_out_pos_tags_and_stopwords(texts, disallowed_postags=['IN', 'CD', 'DT', 'EX', 'CC', 'TO', 'WP', 'RP', 'PRP$', 'PRP']):
    filtered_texts = []
    for text in texts:
        doc = nltk.pos_tag(nltk.word_tokenize(text))
        filtered_words = []
        for word, tag in doc:
            if tag not in disallowed_postags and word.lower() not in stop_words:
                filtered_words.append(word)
        filtered_texts.append(" ".join(filtered_words))
    return filtered_texts
# Define a function to extract a single text span for each representative label
def extract_single_text_span(representative_labels, top_words, span_length=20):
    extracted_spans = []  # To store the extracted text spans
    for i, top_word_list in enumerate(top_words):
        label = representative_labels[i]  # Get the representative label
        # Initialize variables to store the positions of top words
        word_positions = {}
        # Find the position of each top word in the representative label
        for top_word in top_word_list:
            position = label.find(top_word)
            if position != -1:
                word_positions[top_word] = position
        # Check if any top words were found in the label
        if word_positions:
            # Get the start and end positions to create the span
            start = min(word_positions.values()) - span_length // 2
            end = max(word_positions.values()) + len(max(word_positions, key=len)) + span_length // 2
            # Ensure the start and end positions are within bounds
            start = max(0, start)
            end = min(len(label), end)
            # Extract the text span
            span = label[start:end]
            extracted_spans.append(span)
    return extracted_spans
def extract_center_text(representative_labels, top_words, span_length=30):
    extracted_centers = []  # To store the extracted center text
    for i, top_word_list in enumerate(top_words):
        label = representative_labels[i]  # Get the representative label
        # Initialize variables to store the positions of top words
        word_positions = {}
        # Find the position of each top word in the representative label
        for top_word in top_word_list:
            position = label.find(top_word)
            if position != -1:
                word_positions[top_word] = position
        # Check if any top words were found in the label
        if word_positions:
            # Get the center position
            center_position = sum(word_positions.values()) // len(word_positions)
            # Get the start and end positions to create the span
            start = max(0, center_position - span_length // 2)
            end = min(len(label), center_position + span_length // 2)
            # Extract the center text
            center_text = label[start:end]
            extracted_centers.append(center_text)
    return extracted_centers
def extract_top_terms_per_cluster(df, num_terms=7):
    """
    Extract top terms per cluster using TF-IDF.
    :param df: DataFrame with 'sentence' and 'cluster' columns
    :param num_terms: Number of top terms to extract for each cluster
    :return: Dictionary with clusters as keys and top terms as values
    """
    # Initialize the TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['sentence'])
    feature_names = vectorizer.get_feature_names_out()
    top_terms_per_cluster = {}
    for cluster in df['cluster'].unique():
        # Subset by partition (cluster)
        cluster_rows = np.where(df['cluster'] == cluster)[0]
        # Aggregate TF-IDF scores for each term within the cluster
        aggregated_tfidf = tfidf_matrix[cluster_rows].sum(axis=0).A1
        top_indices = aggregated_tfidf.argsort()[-num_terms:][::-1]
        top_terms = [feature_names[i] for i in top_indices]
        top_terms_per_cluster[cluster] = ' '.join(top_terms)
    return top_terms_per_cluster
# In[262]:
reducer = umap.UMAP(n_neighbors=best_params[0], min_dist=best_params[1])
reducer.fit(np.array(reshaped_embeddings))
umap_embeddings = reducer.transform(np.array(reshaped_embeddings))
labels_umap_embeddings = dict(zip(list(labels_embeddings.keys()),umap_embeddings))
kmeans_model = KMeans(n_clusters=best_params[2], random_state=42)
predicted_clusters = kmeans_model.fit_predict(umap_embeddings)
# Assume kmeans_model is your trained KMeans model
representative_labels = find_representative_labels(kmeans_model, list(labels_umap_embeddings.keys()), embeddings = np.array(list(labels_umap_embeddings.values())))
# Create a mapping from the original labels to the UMAP embeddings
labels_umap_embeddings_full = dict(zip(labels_embeddings.keys(), umap_embeddings))
# Combine labels for each centroid into a single string
# Step 2: Apply TF-IDF Vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(representative_labels)
# Adding sklearn's default list of stopwords to your vectorizer
stop_words = list(text.ENGLISH_STOP_WORDS)
# Filter the text
filtered_labels = filter_out_pos_tags_stopwords_contractions(representative_labels)
top_terms = []
if(True):
    # Initialize the TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    tfidf_matrix = vectorizer.fit_transform(filtered_labels)
    top_terms = []
    feature_names = vectorizer.get_feature_names_out()
    for i in range(0, len(representative_labels)):
        items = list(set(representative_labels[i].split()))
        d = pd.DataFrame(tfidf_matrix.getrow(i).toarray(), columns=feature_names)
        selected = d[d != 0.0].dropna(axis=1).T.reset_index().sort_values(by=0, ascending=False).head(7)['index'].values
        # Keep only the top TF-IDF terms that appear as whole words in the representative sentence
        matches = []
        for s in selected:
            for item in items:
                if re.match(r"\b" + s.lower() + r"\b", item.lower()):
                    matches.append(s)
        string = list(dict.fromkeys(matches))
        top_terms.append(' '.join(string))
# Call the function to extract single text spans
#single_text_spans = extract_single_text_span(representative_labels, top_terms)
# Call the function to extract the center 15 characters for each representative label
#center_text_spans = extract_center_text(representative_labels, top_terms)
# Now representative_labels_full holds the labels that are representative for each cluster
# Plotting clusters
plt.figure(figsize=(10, 6))
scatter = plt.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], c=predicted_clusters, cmap='viridis', marker='o', edgecolor='k', s=50, alpha=0.6)
# Plotting centroids
centroids = kmeans_model.cluster_centers_
cluster_representative_dict = dict(zip(np.unique(predicted_clusters),representative_labels))
plt.scatter(centroids[:, 0], centroids[:, 1], c='black', s=300, marker='x')
# Annotate Centroids in the plot
for i, centroid in enumerate(centroids):
    label = top_terms[i]
    plt.annotate(label, (centroid[0], centroid[1]), textcoords="offset points", xytext=(0, 10), ha='center', color='red')
# Title and labels
plt.title('2D visualization of KMeans clustering with representative labels')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
# Creating a legend
plt.legend(*scatter.legend_elements(), title="Clusters")
plt.show()
# Calculate the overall mean of the data
overall_mean = np.mean(umap_embeddings, axis=0)
# Calculate Total Sum of Squares (TSS)
tss = np.sum((umap_embeddings - overall_mean) ** 2)
# Calculate Between Sum of Squares (BSS)
bss = sum(np.sum((umap_embeddings[predicted_clusters == i] - centroids[i]) ** 2) for i in range(best_params[2]))
print("Total Sum of Squares (TSS):", tss)
print("Between Sum of Squares (BSS):", bss)
centroids = kmeans_model.cluster_centers_
dict_centroids = dict(zip(np.unique(predicted_clusters),centroids))
dict_terms = dict(zip(np.unique(predicted_clusters),top_terms))
cluster_df = pd.concat([
    pd.DataFrame(list(labels_umap_embeddings_full.keys())),
    pd.DataFrame(list(labels_umap_embeddings_full.values()), columns=['x', 'y']),
    pd.DataFrame(predicted_clusters),
    pd.DataFrame([cluster_representative_dict[c] for c in predicted_clusters]),
    pd.DataFrame([dict_terms[c] for c in predicted_clusters]),
    pd.DataFrame([dict_centroids[c] for c in predicted_clusters], columns=['C x', 'C y']),
    # Per-point offset from its cluster centroid
    (pd.DataFrame(list(labels_umap_embeddings_full.values()), columns=['x', 'y'])
     - pd.DataFrame([dict_centroids[c] for c in predicted_clusters], columns=['C x', 'C y']).rename(columns={'C x': 'x', 'C y': 'y'})
     ).rename(columns={'x': 'D x', 'y': 'D y'})
], axis=1)
cluster_df.columns = ['sentence', 'x', 'y', 'cluster', 'cluster rep', 'tfidf', 'C x', 'C y', 'D x', 'D y']
cluster_df['delta center'] = np.sqrt((cluster_df['D x'].multiply(cluster_df['D x']) + cluster_df['D y'].multiply(cluster_df['D y'])) / 2)
print(cluster_df.groupby('cluster')['sentence'].count().sort_values(ascending=False))
cluster_df = cluster_df.sort_values(by=['cluster', 'delta center'])
cluster_df.to_csv('clustered_sentences.csv')
# In[264]:
from IPython.display import display, clear_output
# Create widgets
cluster_dropdown = widgets.Dropdown(
    options=[(str(cluster), cluster) for cluster in np.sort(cluster_df['cluster'].unique())],
    description='Select Cluster:',
)
head_dropdown = widgets.Dropdown(
    value=5,
    options=[1, 2, 3, 5, 8, 13],
    description='Select Head filter:',
)
num_terms_dropdown = widgets.Dropdown(
    value=5,
    options=[1, 2, 3, 5, 8, 13],
    description='Select terms:',
)
delta_center_slider = widgets.Dropdown(
    value=.5,
    options=[.1, .2, .3, .5, .8, 1.3],
    description='Max Distance:',
)
output = widgets.Output()
def update_scatterplot(selected_cluster, head_filter, max_distance, num_terms):
    def extract_top_terms_per_cluster(df, num_terms=5):
        """
        Extract top terms per cluster using TF-IDF.
        :param df: DataFrame with 'sentence' and 'cluster' columns
        :param num_terms: Number of top terms to extract for each cluster
        :return: Dictionary with clusters as keys and top terms as values
        """
        # Initialize the TF-IDF Vectorizer
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(df['sentence'])
        feature_names = vectorizer.get_feature_names_out()
        top_terms_per_cluster = {}
        for cluster in df['cluster'].unique():
            # Subset by partition (cluster)
            cluster_rows = np.where(df['cluster'] == cluster)[0]
            # Aggregate TF-IDF scores for each term within the cluster
            aggregated_tfidf = tfidf_matrix[cluster_rows].sum(axis=0).A1
            top_indices = aggregated_tfidf.argsort()[-num_terms:][::-1]
            top_terms = [feature_names[i] for i in top_indices]
            top_terms_per_cluster[cluster] = ' '.join(top_terms)
        return top_terms_per_cluster

    with output:
        clear_output(wait=True)
        # Filtering based on the slider value
        filtered_df = cluster_df[cluster_df['delta center'] <= max_distance]
        # Plotting all clusters with filtered points
        plt.figure(figsize=(10, 6))
        plt.scatter(filtered_df['x'], filtered_df['y'], c=filtered_df['cluster'], cmap='viridis', marker='o', edgecolor='k', s=50, alpha=0.6)
        # Plotting centroids
        centroids = kmeans_model.cluster_centers_
        plt.scatter(centroids[:, 0], centroids[:, 1], c='black', s=300, marker='x')
        # Recompute top terms from the points closest to each centroid
        #filtered_df = cluster_df.query('`delta center` < .50').sort_values(by=['cluster', 'delta center']).groupby('cluster')
        top_terms_per_cluster = extract_top_terms_per_cluster(filtered_df.groupby('cluster').head(head_filter), num_terms=num_terms)
        # Annotate Centroids (keyed by cluster id so labels stay aligned with their centroids)
        for i, centroid in enumerate(centroids):
            label = top_terms_per_cluster.get(i, '')
            plt.annotate(label, (centroid[0], centroid[1]), textcoords="offset points", xytext=(0, 10), ha='center', color='black')
        # Highlighting the selected cluster
        selected_cluster_df = filtered_df[filtered_df['cluster'] == selected_cluster]
        plt.scatter(selected_cluster_df['x'], selected_cluster_df['y'], c='white', s=100, marker='x')
        # Title and labels
        plt.title('2D Visualization of KMeans Clustering with Representative Labels')
        plt.xlabel('UMAP Dimension 1')
        plt.ylabel('UMAP Dimension 2')
        plt.show()
        # Display the filtered and sorted DataFrame
        display_df = filtered_df.groupby('cluster').head(head_filter).sort_values(by=['cluster', 'delta center'])
        display(display_df)
# Interactive widget
interactive_plot = widgets.interactive(update_scatterplot, selected_cluster=cluster_dropdown, head_filter=head_dropdown, num_terms=num_terms_dropdown, max_distance=delta_center_slider)
# Display the interactive components
display(interactive_plot, output)
# In[258]:
# Step 1: Export embeddings and metadata for TensorBoard
# Embeddings: reshaped_embeddings_full_array
# Metadata: sentences and cluster IDs
# Prepare the metadata file with headers, including the "Representative" column
metadata_df = pd.DataFrame({
    'Sentence': cluster_df['sentence'].str.replace('\t', ' '),
    'Cluster': cluster_df['cluster'],
    'Representative': cluster_df['cluster rep']
})
#modify projector_config.pbtxt
metadata_file = "/data/sub-sentence-encoder/00000/default/metadata.tsv"
metadata_df.to_csv(metadata_file, sep='\t', index=False, header=True)
# Save the embeddings to a TSV file
embeddings_file = "/data/sub-sentence-encoder/00000/default/embeddings.tsv"
with open(embeddings_file, 'w') as f:
    for emb in embeddings:
        f.write('\t'.join(map(str, emb)) + '\n')
embeddings_file = "/data/sub-sentence-encoder/00000/default/umap_embeddings.tsv"
with open(embeddings_file, 'w') as f:
    for emb in umap_embeddings:
        f.write('\t'.join(map(str, emb)) + '\n')
# Step 2: Set up TensorBoard projector (this step is usually run in the same directory as the TSV files)
# Create a log directory for TensorBoard
log_dir = "/data/sub-sentence-encoder/logs"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
# Create a projector config
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = 'umap_embeddings'
embedding.metadata_path = metadata_file
# Save the projector config
projector.visualize_embeddings(log_dir, config)
metadata = pd.read_csv("/data/sub-sentence-encoder/00000/default/metadata.tsv",delimiter='\t')
cluster_df.columns
try:
    cluster_df.drop(columns=0, inplace=True)
except:
    pass
cluster_df = pd.concat([cluster_df.sort_index(), pd.DataFrame([e] for e in embeddings)], axis=1)
representative_labels_full = dict(zip(np.unique(predicted_clusters), representative_labels))
# Select representative embeddings
representative_embeddings = cluster_df.query('sentence in @representative_labels')[[0,'sentence','cluster']]
#representative_embeddings = cluster_df.query('sentence in @representative_labels')[['x','y','sentence',0]]
#list_of_2d_arrays = [np.array([row['x'], row['y']]) for index, row in filtered_cluster_df.iterrows()]
combined_embeddings = []
for (emb1, emb2) in combinations(representative_embeddings[0], 2):
    combined_embeddings.append(emb1 + emb2)  # Simple addition, modify as needed
# Perform vector arithmetic (e.g., pairwise addition)
# Convert PyTorch tensors to NumPy arrays and ensure uniform shape
combined_embeddings_np = [emb.numpy() if isinstance(emb, torch.Tensor) else emb for emb in combined_embeddings]
# Check if all embeddings have the same shape
embedding_shapes = [emb.shape for emb in combined_embeddings_np]
if len(set(embedding_shapes)) != 1:
    raise ValueError("Not all embeddings have the same shape.")
# Now transform with UMAP
umap_combined_embeddings = reducer.transform(np.array(combined_embeddings_np))
predicted_clusters_for_combined = kmeans_model.predict(umap_combined_embeddings)
# Find the closest representatives for each combined embedding, along with distances
closest_representatives_for_combined = []
distances_to_centroid = []  # Store distances here
for idx, emb in enumerate(umap_combined_embeddings):
    cluster = predicted_clusters_for_combined[idx]  # Get the cluster of the combined embedding
    centroid = kmeans_model.cluster_centers_[cluster]
    distance = pairwise_distances([centroid], [emb])[0][0]  # Calculate distance to centroid
    representative_label = representative_labels_full[cluster]
    closest_representatives_for_combined.append(representative_label)
    distances_to_centroid.append(distance)  # Append the calculated distance
for label, distance in zip(closest_representatives_for_combined, distances_to_centroid):
    print(f"Representative Sentence: {label}, Distance from centroid: {distance}")
# In[254]:
print(np.unique([*np.unique(closest_representatives_for_combined), *np.unique(cluster_df['cluster rep'])]))
# In[260]:
#cat projector_config.pbtxt
"""
embeddings {
  tensor_name: "default:00000"
  metadata_path: "00000/default/metadata.tsv"
  tensor_path: "00000/default/tensors.tsv"
}
embeddings {
  tensor_name: "default:00001"
  metadata_path: "00000/default/metadata.tsv"
  tensor_path: "00000/default/embeddings.tsv"
}
"""
#tensorboard.sh
"""
tensorboard --logdir=./ --host 0.0.0.0
"""
@thistleknot commented Nov 26, 2023

Select points in the projector, edit the label of the selection, and download the state.
Reload TensorBoard, make another selection, don't label it, and download again.

Compare the two downloads side by side in Excel, filtering one of the columns for the label you set (a rough pandas sketch of this comparison step follows below).

Use this as an example of the kind of inferences you can get from a RAG + LLM setup.

tensorflow/tensorboard#820 (comment)
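
A minimal sketch of that comparison step, assuming the two projector downloads are TSV files; the file names (labeled_selection.tsv, unlabeled_selection.tsv) and the 'Label' column are hypothetical, while the 'Sentence' column matches the metadata written by the script above:

import pandas as pd

# Hypothetical file names for the two projector downloads described above
labeled = pd.read_csv('labeled_selection.tsv', sep='\t')
unlabeled = pd.read_csv('unlabeled_selection.tsv', sep='\t')

# Filter the labeled download for the label you set, then line the two up side by side
flagged = labeled[labeled['Label'] == 'my_label']
side_by_side = flagged.merge(unlabeled, on='Sentence', how='left', suffixes=('_labeled', '_unlabeled'))
print(side_by_side.head())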

@thistleknot (Author)

[image]