TensorBoard with EMA embeddings
#!/usr/bin/env python
# coding: utf-8

import lade
import torch
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig, AutoModelForCausalLM

lade.augment_all()
lade.config_lade(LEVEL=5, WINDOW_SIZE=7, GUESS_SET_SIZE=7, DEBUG=0)

from datasets import load_dataset
from torch.utils.tensorboard import SummaryWriter
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np
import warnings
from sklearn.model_selection import KFold
from sklearn.metrics import silhouette_score, pairwise_distances_argmin_min, pairwise_distances
from sklearn.cluster import KMeans, MiniBatchKMeans
import umap
from itertools import product, combinations
import random
import dask
from dask.distributed import Client, as_completed
import os
import pandas as pd
import tensorflow as tf
from tensorboard.plugins import projector
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display
import nltk
#nltk.download('averaged_perceptron_tagger')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
import re

sample_size = 1000
batch_size = 500
sequence_length = 128
stride_length = 64
# Filter out disallowed POS tags, stopwords, and contractions.
# Relies on the module-level `stop_words` list defined further below.
def filter_out_pos_tags_stopwords_contractions(texts, disallowed_postags=['IN', 'DT', 'CC', 'TO']):
    additional_words_to_remove = ['we', 'be', 'you']
    filtered_texts = []
    for text in texts:
        doc = nltk.pos_tag(nltk.word_tokenize(text))
        filtered_words = []
        for word, tag in doc:
            if (tag not in disallowed_postags
                    and word.lower() not in stop_words
                    and "'" not in word
                    and word.lower() not in additional_words_to_remove):
                filtered_words.append(word)
        filtered_texts.append(" ".join(filtered_words))
    return filtered_texts
# Improved version that does batching.
def calculate_sentence_statistics_batched(model, tokenizer, texts, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Map each text to its strides
    text_to_strides = {}
    all_strides = []
    for text in texts:
        tokens = tokenizer.encode(text)
        text_strides = []
        for i in range(0, len(tokens), stride_length):
            sequence_tokens = tokens[i:i + sequence_length]
            if len(sequence_tokens) < sequence_length:
                sequence_tokens += [tokenizer.pad_token_id] * (sequence_length - len(sequence_tokens))
            stride = tokenizer.decode(sequence_tokens)
            text_strides.append(stride)
            all_strides.append(stride)
        text_to_strides[text] = text_strides
    # Process strides in batches and collect embeddings
    embeddings = []
    for i in tqdm(range(0, len(all_strides), batch_size), position=0, leave=True):
        batch_strides = all_strides[i:i + batch_size]
        encoded_input = tokenizer(batch_strides, padding=True, truncation=True,
                                  max_length=sequence_length, return_tensors='pt').to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)
        batch_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).cpu().numpy()
        attention_masks = encoded_input['attention_mask'].cpu().numpy()
        embeddings.extend(zip(batch_embeddings, attention_masks))
    # Associate embeddings back to texts via an exponential moving average (EMA)
    offset = 0
    sentence_embeddings_dict = {}
    for text, strides in text_to_strides.items():
        stride_embeddings_with_masks = embeddings[offset:offset + len(strides)]
        offset += len(strides)
        if stride_embeddings_with_masks:
            ema_embedding, _ = stride_embeddings_with_masks[0]
            alpha = 2.0 / (len(stride_embeddings_with_masks) + 1)
            for i, (embedding, mask) in enumerate(stride_embeddings_with_masks[1:]):
                # The enumeration starts at the second stride, so index
                # len(...) - 2 is the final stride; down-weight it by its
                # fraction of non-padding tokens.
                if i == len(stride_embeddings_with_masks) - 2:
                    weight = mask.sum() / sequence_length
                    adjusted_alpha = alpha * weight
                else:
                    adjusted_alpha = alpha
                ema_embedding = adjusted_alpha * embedding + (1 - adjusted_alpha) * ema_embedding
            sentence_embeddings_dict[text] = ema_embedding
    return sentence_embeddings_dict
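# Illustrative EMA arithmetic on hypothetical 1-D "embeddings" (not from the dataset):
# with 3 strides, alpha = 2 / (3 + 1) = 0.5, and the final stride's alpha is further
# scaled by its fraction of non-padding tokens (mask.sum() / sequence_length).
#   e1, e2, e3 = 1.0, 2.0, 4.0
#   ema = e1                                      # seed with the first stride
#   ema = 0.5 * e2 + 0.5 * ema                    # -> 1.5
#   ema = (0.5 * 0.75) * e3 + (1 - 0.375) * ema   # last stride 75% real tokens -> 2.4375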
# Mean Pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask
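# A minimal sanity check for mean_pooling (illustrative; the demo names are hypothetical).
# With one of two tokens masked out, the pooled vector equals the unmasked token:
#   demo_tokens = torch.tensor([[[1.0, 2.0], [9.0, 9.0]]])  # (batch=1, seq=2, dim=2)
#   demo_mask = torch.tensor([[1, 0]])                      # second token is padding
#   mean_pooling((demo_tokens,), demo_mask)                 # -> tensor([[1., 2.]])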
def apply_umap(embeddings, n_neighbors, min_dist):
    reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist)
    reduced_embeddings = reducer.fit_transform(embeddings)
    return reduced_embeddings
@dask.delayed
def process_single_combination(embeddings, params):
    n_neighbors, min_dist, n_clusters = params
    reduced_embeddings = apply_umap(embeddings, n_neighbors, min_dist)
    #bss_tss_ratio = calculate_bss_tss_ratio(reduced_embeddings, n_clusters)
    bss_tss_ratio = calculate_bss_tss_ratio_approx(reduced_embeddings, n_clusters)
    return bss_tss_ratio, params
def calculate_bss_tss_ratio_approx(embeddings, n_clusters, batch_size=batch_size):
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size,
                             random_state=42, init='k-means++', n_init='auto')
    labels = kmeans.fit_predict(embeddings)
    # Total Sum of Squares: spread of all points around the overall centroid
    centroid = np.mean(embeddings, axis=0)
    tss = np.sum((embeddings - centroid) ** 2)
    # Within-cluster Sum of Squares: spread of points around their own centroid
    wss = sum(np.sum((embeddings[labels == i] - kmeans.cluster_centers_[i]) ** 2)
              for i in range(n_clusters))
    # Between-cluster Sum of Squares via the identity TSS = BSS + WSS
    bss = tss - wss
    # Ratio of BSS to TSS (higher means better-separated clusters)
    bss_tss_ratio = bss / tss if tss > 0 else 0
    return bss_tss_ratio
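# Sanity check of the TSS = BSS + WSS identity on toy 1-D data (hypothetical values):
# points {0, 2} and {10, 12} with overall mean 6 give TSS = 36+16+16+36 = 104 and
# WSS = 1+1+1+1 = 4, hence BSS = 100 and bss_tss_ratio ~ 0.96; nearly all of the
# variance is between the clusters, which is what the grid search rewards.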
def cross_validation(embeddings, parameter_combinations, outer_k=5):
    warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.cluster._kmeans")
    outer_cv = KFold(n_splits=outer_k, shuffle=True, random_state=42)
    best_score = -1
    best_params = None
    delayed_results = []
    for train_idx, _ in outer_cv.split(embeddings):
        train_embeddings = [embeddings[i] for i in train_idx]
        for params in parameter_combinations:
            # Create a delayed task for each parameter combination
            delayed_result = process_single_combination(train_embeddings, params)
            delayed_results.append(delayed_result)
    # Compute all tasks in parallel
    results = dask.compute(*delayed_results)
    # Find the best parameter combination
    for score, params in results:
        if score > best_score:
            best_score = score
            best_params = params
    return best_params, best_score
def find_representative_labels(kmeans_model, labels, embeddings):
    warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.cluster._kmeans")
    centroids = kmeans_model.cluster_centers_
    # Find the closest point to each centroid
    closest, _ = pairwise_distances_argmin_min(centroids, embeddings)
    # Map these points to their corresponding labels
    representative_labels = [labels[idx] for idx in closest]
    return representative_labels
# Function to update the scatterplot based on the selected cluster
# (an early, non-interactive version; superseded by the widget version below)
def update_scatterplot(selected_cluster):
    plt.figure(figsize=(8, 6))
    plt.scatter(cluster_df.iloc[:, 0], cluster_df.iloc[:, 1], c=cluster_df['Cluster'], cmap='viridis')
    plt.xlabel('Column 0')
    plt.ylabel('Column 1')
    plt.colorbar(label='Cluster')
    plt.title(f'Scatterplot with Cluster {selected_cluster} Highlighted', fontsize=14)
    # Highlight the selected cluster
    selected_cluster_indices = cluster_df.loc[cluster_df['Cluster'] == selected_cluster]
    plt.scatter(selected_cluster_indices.iloc[:, 0], selected_cluster_indices.iloc[:, 1],
                c='red', label=f'Cluster {selected_cluster}')
    plt.legend()
    plt.show()
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

dataset = load_dataset("Abirate/english_quotes")
quotes = [item['quote'] for item in dataset['train']]

sentence_statistics = calculate_sentence_statistics_batched(model, tokenizer, quotes)
embeddings = list(sentence_statistics.values())

# Convert to tensor
embeddings_tensor = torch.tensor(embeddings)

# Metadata labels for each sentence
metadata_labels = list(sentence_statistics.keys())

# TensorBoard logging: make sure the tensor is 2-D (one row per sentence)
reshaped_embeddings = embeddings_tensor.squeeze(1)

# If adding embeddings multiple times, use a unique global_step each time
writer = SummaryWriter('./')
global_step = 0  # Increment this appropriately when logging repeatedly
writer.add_embedding(reshaped_embeddings, metadata=metadata_labels, global_step=global_step)
# Close the writer when done
writer.close()

# Run in a shell:
#   tensorboard --logdir=./ --host 0.0.0.0

labels_embeddings = dict(zip(metadata_labels, embeddings_tensor))
random_sample = dict(random.sample(list(labels_embeddings.items()), sample_size))
labels_sample = list(random_sample.keys())
embeddings_sample = [random_sample[k] for k in labels_sample]

# Convert the list of PyTorch tensors to a single NumPy array
embeddings_sample_numpy = [e.numpy() for e in embeddings_sample]
embeddings_array = np.array(embeddings_sample_numpy)
# Define the hyperparameter space (flip the flag for the wider sweep)
if False:
    n_neighbors_values = [5, 8, 13, 21, 34, 55, 89]
    min_dist_values = [.05, .08, .13, .21, .34, .55, .89]
    n_clusters_values = [5, 8, 13, 21, 34, 55, 89]
else:
    n_neighbors_values = [8, 13, 21, 34, 55]
    min_dist_values = [.13, .21, .34, .55, .89]
    n_clusters_values = [13, 21, 34, 55, 89]

# Generate the parameter grid
parameter_grid = list(product(n_neighbors_values, min_dist_values, n_clusters_values))

# Randomly sample 125 combinations (here that is the entire 5x5x5 grid, shuffled)
sampled_combinations = random.sample(parameter_grid, 125)

# Initialize a Dask client
client = Client(n_workers=8)

# Cross-validate the UMAP/KMeans hyperparameters on the sampled embeddings
best_params, best_score = cross_validation(embeddings_array, sampled_combinations)
client.close()
# In[261]:

best_params

# Function to filter out specific POS tags and stop words
def filter_out_pos_tags_and_stopwords(texts, disallowed_postags=['IN', 'CD', 'DT', 'EX', 'CC', 'TO', 'WP', 'RP', 'PRP$', 'PRP']):
    filtered_texts = []
    for text in texts:
        doc = nltk.pos_tag(nltk.word_tokenize(text))
        filtered_words = []
        for word, tag in doc:
            if tag not in disallowed_postags and word.lower() not in stop_words:
                filtered_words.append(word)
        filtered_texts.append(" ".join(filtered_words))
    return filtered_texts
# Define a function to extract a single text span for each representative label
def extract_single_text_span(representative_labels, top_words, span_length=20):
    extracted_spans = []  # To store the extracted text spans
    for i, top_word_list in enumerate(top_words):
        label = representative_labels[i]  # Get the representative label
        # Find the position of each top word in the representative label
        word_positions = {}
        for top_word in top_word_list:
            position = label.find(top_word)
            if position != -1:
                word_positions[top_word] = position
        # Check if any top words were found in the label
        if word_positions:
            # Get the start and end positions to create the span
            start = min(word_positions.values()) - span_length // 2
            end = max(word_positions.values()) + len(max(word_positions, key=len)) + span_length // 2
            # Ensure the start and end positions are within bounds
            start = max(0, start)
            end = min(len(label), end)
            # Extract the text span
            span = label[start:end]
            extracted_spans.append(span)
    return extracted_spans
def extract_center_text(representative_labels, top_words, span_length=30):
    extracted_centers = []  # To store the extracted center text
    for i, top_word_list in enumerate(top_words):
        label = representative_labels[i]  # Get the representative label
        # Find the position of each top word in the representative label
        word_positions = {}
        for top_word in top_word_list:
            position = label.find(top_word)
            if position != -1:
                word_positions[top_word] = position
        # Check if any top words were found in the label
        if word_positions:
            # Get the center position
            center_position = sum(word_positions.values()) // len(word_positions)
            # Get the start and end positions to create the span
            start = max(0, center_position - span_length // 2)
            end = min(len(label), center_position + span_length // 2)
            # Extract the center text
            center_text = label[start:end]
            extracted_centers.append(center_text)
    return extracted_centers
def extract_top_terms_per_cluster(df, num_terms=7):
    """
    Extract top terms per cluster using TF-IDF.
    :param df: DataFrame with 'sentence' and 'cluster' columns
    :param num_terms: Number of top terms to extract for each cluster
    :return: Dictionary with clusters as keys and top terms as values
    """
    # Initialize the TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['sentence'])
    feature_names = vectorizer.get_feature_names_out()
    top_terms_per_cluster = {}
    for cluster in df['cluster'].unique():
        # Subset by partition (cluster)
        cluster_rows = np.where(df['cluster'] == cluster)[0]
        # Aggregate TF-IDF scores for each term within the cluster
        aggregated_tfidf = tfidf_matrix[cluster_rows].sum(axis=0).A1
        top_indices = aggregated_tfidf.argsort()[-num_terms:][::-1]
        top_terms = [feature_names[i] for i in top_indices]
        top_terms_per_cluster[cluster] = ' '.join(top_terms)
    return top_terms_per_cluster
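# Hypothetical usage sketch for extract_top_terms_per_cluster (demo names only):
#   demo_df = pd.DataFrame({
#       'sentence': ['love conquers all', 'love is patient', 'time heals wounds'],
#       'cluster': [0, 0, 1],
#   })
#   extract_top_terms_per_cluster(demo_df, num_terms=2)
#   # -> dict mapping each cluster id to a space-joined string of its top TF-IDF terms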
# In[262]:

reducer = umap.UMAP(n_neighbors=best_params[0], min_dist=best_params[1])
reducer.fit(np.array(reshaped_embeddings))
umap_embeddings = reducer.transform(np.array(reshaped_embeddings))
labels_umap_embeddings = dict(zip(list(labels_embeddings.keys()), umap_embeddings))

kmeans_model = KMeans(n_clusters=best_params[2], random_state=42)
predicted_clusters = kmeans_model.fit_predict(umap_embeddings)

# Find the sentence closest to each centroid of the trained KMeans model
representative_labels = find_representative_labels(
    kmeans_model,
    list(labels_umap_embeddings.keys()),
    embeddings=np.array(list(labels_umap_embeddings.values())))

# Create a mapping from the original labels to the UMAP embeddings
labels_umap_embeddings_full = dict(zip(labels_embeddings.keys(), umap_embeddings))

# sklearn's default list of English stopwords
stop_words = list(text.ENGLISH_STOP_WORDS)

# Filter the representative labels
filtered_labels = filter_out_pos_tags_stopwords_contractions(representative_labels)
top_terms = []
if True:
    # TF-IDF over the filtered representative labels
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    tfidf_matrix = vectorizer.fit_transform(filtered_labels)
    feature_names = vectorizer.get_feature_names_out()
    for i in range(0, len(representative_labels)):
        items = list(set(representative_labels[i].split()))
        d = pd.DataFrame(tfidf_matrix.getrow(i).toarray(), columns=feature_names)
        selected = (d[d != 0.0].dropna(axis=1).T.reset_index()
                    .sort_values(by=0, ascending=False).head(7)['index'].values)
        matches = []
        for s in selected:
            # distinct loop variable so the outer index `i` is not shadowed
            for item in items:
                if re.match(r"\b" + s.lower() + r"\b", item.lower()):
                    matches.append(s)
        string = list(dict.fromkeys(matches))
        top_terms.append(' '.join(string))
# Call the function to extract single text spans
#single_text_spans = extract_single_text_span(representative_labels, top_terms)
# Call the function to extract the center characters for each representative label
#center_text_spans = extract_center_text(representative_labels, top_terms)

# Plotting clusters
plt.figure(figsize=(10, 6))
scatter = plt.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1],
                      c=predicted_clusters, cmap='viridis', marker='o',
                      edgecolor='k', s=50, alpha=0.6)

# Plotting centroids
centroids = kmeans_model.cluster_centers_
cluster_representative_dict = dict(zip(np.unique(predicted_clusters), representative_labels))
plt.scatter(centroids[:, 0], centroids[:, 1], c='black', s=300, marker='x')

# Annotate centroids in the plot with their top TF-IDF terms
for i, centroid in enumerate(centroids):
    label = top_terms[i]
    plt.annotate(label, (centroid[0], centroid[1]), textcoords="offset points",
                 xytext=(0, 10), ha='center', color='red')

# Title and labels
plt.title('2D visualization of KMeans clustering with representative labels')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')

# Creating a legend keyed by cluster id
plt.legend(*scatter.legend_elements(), title="Clusters")
plt.show()
# Calculate the overall mean of the data
overall_mean = np.mean(umap_embeddings, axis=0)
# Calculate Total Sum of Squares (TSS)
tss = np.sum((umap_embeddings - overall_mean) ** 2)
# Calculate Within-cluster Sum of Squares (WSS), then BSS via TSS = BSS + WSS
wss = sum(np.sum((umap_embeddings[predicted_clusters == i] - centroids[i]) ** 2)
          for i in range(best_params[2]))
bss = tss - wss
print("Total Sum of Squares (TSS):", tss)
print("Between Sum of Squares (BSS):", bss)
centroids = kmeans_model.cluster_centers_
dict_centroids = dict(zip(np.unique(predicted_clusters), centroids))
dict_terms = dict(zip(np.unique(predicted_clusters), top_terms))

cluster_df = pd.concat([
    pd.DataFrame(list(labels_umap_embeddings_full.keys())),
    pd.DataFrame(list(labels_umap_embeddings_full.values()), columns=['x', 'y']),
    pd.DataFrame(predicted_clusters),
    pd.DataFrame([cluster_representative_dict[c] for c in predicted_clusters]),
    pd.DataFrame([dict_terms[c] for c in predicted_clusters]),
    pd.DataFrame([dict_centroids[c] for c in predicted_clusters], columns=['C x', 'C y']),
    (pd.DataFrame(list(labels_umap_embeddings_full.values()), columns=['x', 'y'])
     - pd.DataFrame([dict_centroids[c] for c in predicted_clusters],
                    columns=['C x', 'C y']).rename(columns={'C x': 'x', 'C y': 'y'})
     ).rename(columns={'x': 'D x', 'y': 'D y'})
], axis=1)
cluster_df.columns = ['sentence', 'x', 'y', 'cluster', 'cluster rep', 'tfidf',
                      'C x', 'C y', 'D x', 'D y']
# RMS of the per-axis deltas from the assigned centroid
cluster_df['delta center'] = np.sqrt(
    (cluster_df['D x'].multiply(cluster_df['D x'])
     + cluster_df['D y'].multiply(cluster_df['D y'])) / 2)

print(cluster_df.groupby('cluster')['sentence'].count().sort_values(ascending=False))
cluster_df = cluster_df.sort_values(by=['cluster', 'delta center'])
cluster_df.to_csv('clustered_sentences.csv')
# In[264]:

from IPython.display import display, clear_output

# Create widgets
cluster_dropdown = widgets.Dropdown(
    options=[(str(cluster), cluster) for cluster in np.sort(cluster_df['cluster'].unique())],
    description='Select Cluster:',
)
head_dropdown = widgets.Dropdown(
    value=5,
    options=[1, 2, 3, 5, 8, 13],
    description='Select Head filter:',
)
num_terms_dropdown = widgets.Dropdown(
    value=5,
    options=[1, 2, 3, 5, 8, 13],
    description='Select terms:',
)
delta_center_slider = widgets.Dropdown(
    value=.5,
    options=[.1, .2, .3, .5, .8, 1.3],
    description='Max Distance:',
)
output = widgets.Output()
def update_scatterplot(selected_cluster, head_filter, max_distance, num_terms):
    with output:
        clear_output(wait=True)
        # Keep only points within max_distance of their assigned centroid
        filtered_df = cluster_df[cluster_df['delta center'] <= max_distance]
        # Plot all clusters with the filtered points
        plt.figure(figsize=(10, 6))
        plt.scatter(filtered_df['x'], filtered_df['y'], c=filtered_df['cluster'],
                    cmap='viridis', marker='o', edgecolor='k', s=50, alpha=0.6)
        # Plot centroids
        centroids = kmeans_model.cluster_centers_
        plt.scatter(centroids[:, 0], centroids[:, 1], c='black', s=300, marker='x')
        # Recompute top terms from the head_filter sentences closest to each centroid
        # (reuses the module-level extract_top_terms_per_cluster defined above)
        top_terms = extract_top_terms_per_cluster(
            filtered_df.groupby('cluster').head(head_filter), num_terms=num_terms)
        top_terms = list(top_terms.values())
        # Annotate centroids
        for i, centroid in enumerate(centroids):
            label = top_terms[i]
            plt.annotate(label, (centroid[0], centroid[1]), textcoords="offset points",
                         xytext=(0, 10), ha='center', color='black')
        # Highlight the selected cluster
        selected_cluster_df = filtered_df[filtered_df['cluster'] == selected_cluster]
        plt.scatter(selected_cluster_df['x'], selected_cluster_df['y'], c='white', s=100, marker='x')
        # Title and labels
        plt.title('2D Visualization of KMeans Clustering with Representative Labels')
        plt.xlabel('UMAP Dimension 1')
        plt.ylabel('UMAP Dimension 2')
        plt.show()
        # Display the filtered and sorted DataFrame
        display_df = (filtered_df.groupby('cluster').head(head_filter)
                      .sort_values(by=['cluster', 'delta center']))
        display(display_df)

# Interactive widget
interactive_plot = widgets.interactive(update_scatterplot, selected_cluster=cluster_dropdown,
                                       head_filter=head_dropdown, num_terms=num_terms_dropdown,
                                       max_distance=delta_center_slider)
# Display the interactive components
display(interactive_plot, output)
# In[258]:

# Step 1: Export embeddings and metadata for TensorBoard
#   Embeddings: the pooled sentence embeddings and their UMAP projections
#   Metadata: sentences, cluster IDs, and cluster representatives

# Prepare the metadata file with headers, including the "Representative" column
metadata_df = pd.DataFrame({
    'Sentence': cluster_df['sentence'].str.replace('\t', ' '),
    'Cluster': cluster_df['cluster'],
    'Representative': cluster_df['cluster rep']
})

# Modify projector_config.pbtxt to match (a sample config appears near the end of this file)
metadata_file = "/data/sub-sentence-encoder/00000/default/metadata.tsv"
metadata_df.to_csv(metadata_file, sep='\t', index=False, header=True)

# Save the raw embeddings to a TSV file
embeddings_file = "/data/sub-sentence-encoder/00000/default/embeddings.tsv"
with open(embeddings_file, 'w') as f:
    for emb in embeddings:
        f.write('\t'.join(map(str, emb)) + '\n')

# Save the UMAP embeddings to a TSV file
embeddings_file = "/data/sub-sentence-encoder/00000/default/umap_embeddings.tsv"
with open(embeddings_file, 'w') as f:
    for emb in umap_embeddings:
        f.write('\t'.join(map(str, emb)) + '\n')

# Step 2: Set up the TensorBoard projector (usually run in the same directory as the TSV files)
# Create a log directory for TensorBoard
log_dir = "/data/sub-sentence-encoder/logs"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Create a projector config
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = 'umap_embeddings'
embedding.metadata_path = metadata_file
# Save the projector config
projector.visualize_embeddings(log_dir, config)
metadata = pd.read_csv("/data/sub-sentence-encoder/00000/default/metadata.tsv", delimiter='\t')

cluster_df.columns
# Drop a pre-existing embedding column (named 0) so this cell can be re-run
try:
    cluster_df.drop(columns=0, inplace=True)
except KeyError:
    pass
# Attach each sentence's raw embedding as an object column named 0
cluster_df = pd.concat([cluster_df.sort_index(), pd.DataFrame([e] for e in embeddings)], axis=1)
representative_labels_full = dict(zip(np.unique(predicted_clusters), representative_labels))

# Select representative embeddings
representative_embeddings = cluster_df.query('sentence in @representative_labels')[[0, 'sentence', 'cluster']]

# Perform vector arithmetic (pairwise addition) over the representative embeddings
combined_embeddings = []
for (emb1, emb2) in combinations(representative_embeddings[0], 2):
    combined_embeddings.append(emb1 + emb2)  # Simple addition, modify as needed

# Convert PyTorch tensors to NumPy arrays and ensure uniform shape
combined_embeddings_np = [emb.numpy() if isinstance(emb, torch.Tensor) else emb
                          for emb in combined_embeddings]

# Check that all embeddings have the same shape
embedding_shapes = [emb.shape for emb in combined_embeddings_np]
if len(set(embedding_shapes)) != 1:
    raise ValueError("Not all embeddings have the same shape.")

# Project the combined vectors with the fitted UMAP reducer and assign clusters
umap_combined_embeddings = reducer.transform(np.array(combined_embeddings_np))
predicted_clusters_for_combined = kmeans_model.predict(umap_combined_embeddings)

# Find the representative for each combined embedding, along with its distance to the centroid
closest_representatives_for_combined = []
distances_to_centroid = []
for idx, emb in enumerate(umap_combined_embeddings):
    cluster = predicted_clusters_for_combined[idx]  # Cluster of the combined embedding
    centroid = kmeans_model.cluster_centers_[cluster]
    distance = pairwise_distances([centroid], [emb])[0][0]  # Distance to centroid
    representative_label = representative_labels_full[cluster]
    closest_representatives_for_combined.append(representative_label)
    distances_to_centroid.append(distance)

for label, distance in zip(closest_representatives_for_combined, distances_to_centroid):
    print(f"Representative Sentence: {label}, Distance from centroid: {distance}")
# In[255]:

# pwd  # IPython magic: prints the current working directory in a notebook

# In[254]:

np.unique([*np.unique(closest_representatives_for_combined), *np.unique(cluster_df['cluster rep'])])

# In[260]:

#cat projector_config.pbtxt
""" | |
embeddings { | |
tensor_name: "default:00000" | |
metadata_path: "00000/default/metadata.tsv" | |
tensor_path: "00000/default/tensors.tsv" | |
tensor_name: "default:00001" | |
metadata_path: "00000/default/metadata.tsv" | |
tensor_path: "00000/default/embeddings.tsv" | |
} | |
""" | |
#tensorboard.sh | |
""" | |
tensorboard --logdir=./ --host 0.0.0.0 | |
""" | |
Workflow notes:
- Select points in the projector, edit the label of the selection, and download.
- Reload TensorBoard, select again without labeling, and download a second time.
- Compare the two downloads side by side in Excel, filtering one of the columns for the label you set (see the pandas sketch below).
- Use this as an example of the kind of inferences you can get from a RAG + LLM setup.
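A minimal sketch of the same comparison in pandas (the filenames, the 'Representative' column, and the label value are assumptions for illustration; the projector downloads are assumed to be tab-separated metadata):

import pandas as pd

# Hypothetical filenames for the two projector downloads
labeled = pd.read_csv('state_labeled.tsv', sep='\t')  # run where the selection was labeled
raw = pd.read_csv('state_raw.tsv', sep='\t')          # run where it was not

# Filter one column for the label that was set, then line the runs up side by side
selected = labeled[labeled['Representative'] == 'my_label']  # hypothetical label value
side_by_side = selected.merge(raw, on='Sentence', suffixes=('_labeled', '_raw'))
print(side_by_side.head())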
tensorflow/tensorboard#820 (comment)