TensorBoard with EMA embeddings
#!/usr/bin/env python
# coding: utf-8

import lade
import torch
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig, AutoModelForCausalLM

lade.augment_all()
lade.config_lade(LEVEL=5, WINDOW_SIZE=7, GUESS_SET_SIZE=7, DEBUG=0)

from datasets import load_dataset
from torch.utils.tensorboard import SummaryWriter
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np
import warnings
from sklearn.model_selection import KFold
from sklearn.metrics import silhouette_score, pairwise_distances_argmin_min, pairwise_distances
from sklearn.cluster import KMeans, MiniBatchKMeans
import umap
from itertools import product, combinations
import random
import dask
from dask.distributed import Client, as_completed
import os
import pandas as pd
import tensorflow as tf
from tensorboard.plugins import projector
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display
import nltk
#nltk.download('averaged_perceptron_tagger')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
import re

sample_size = 1000
batch_size = 500
sequence_length = 128
stride_length = 64
# Filter out disallowed POS tags, stopwords, and contractions.
# Relies on the module-level `stop_words` list defined further below.
def filter_out_pos_tags_stopwords_contractions(texts, disallowed_postags=['IN', 'DT', 'CC', 'TO']):
    additional_words_to_remove = ['we', 'be', 'you']
    filtered_texts = []
    for text in texts:
        doc = nltk.pos_tag(nltk.word_tokenize(text))
        filtered_words = []
        for word, tag in doc:
            if (tag not in disallowed_postags
                    and word.lower() not in stop_words
                    and "'" not in word
                    and word.lower() not in additional_words_to_remove):
                filtered_words.append(word)
        filtered_texts.append(" ".join(filtered_words))
    return filtered_texts
# Improved version that does batching.
def calculate_sentence_statistics_batched(model, tokenizer, texts, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Map each text to its strides
    text_to_strides = {}
    all_strides = []
    for text in texts:
        tokens = tokenizer.encode(text)
        text_strides = []
        for i in range(0, len(tokens), stride_length):
            sequence_tokens = tokens[i:i + sequence_length]
            if len(sequence_tokens) < sequence_length:
                sequence_tokens += [tokenizer.pad_token_id] * (sequence_length - len(sequence_tokens))
            stride = tokenizer.decode(sequence_tokens)
            text_strides.append(stride)
            all_strides.append(stride)
        text_to_strides[text] = text_strides
    # Process strides in batches and collect embeddings
    embeddings = []
    for i in tqdm(range(0, len(all_strides), batch_size), position=0, leave=True):
        batch_strides = all_strides[i:i + batch_size]
        encoded_input = tokenizer(batch_strides, padding=True, truncation=True,
                                  max_length=sequence_length, return_tensors='pt').to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)
        batch_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).cpu().numpy()
        attention_masks = encoded_input['attention_mask'].cpu().numpy()
        embeddings.extend(zip(batch_embeddings, attention_masks))
    # Associate embeddings back to texts via an exponential moving average (EMA)
    offset = 0
    sentence_embeddings_dict = {}
    for text, strides in text_to_strides.items():
        stride_embeddings_with_masks = embeddings[offset:offset + len(strides)]
        offset += len(strides)
        if stride_embeddings_with_masks:
            ema_embedding, _ = stride_embeddings_with_masks[0]
            alpha = 2.0 / (len(stride_embeddings_with_masks) + 1)
            for i, (embedding, mask) in enumerate(stride_embeddings_with_masks[1:]):
                # The enumeration starts at the second stride, so index
                # len(...) - 2 is the final stride; down-weight it by its
                # fraction of non-padding tokens.
                if i == len(stride_embeddings_with_masks) - 2:
                    weight = mask.sum() / sequence_length
                    adjusted_alpha = alpha * weight
                else:
                    adjusted_alpha = alpha
                ema_embedding = adjusted_alpha * embedding + (1 - adjusted_alpha) * ema_embedding
            sentence_embeddings_dict[text] = ema_embedding
    return sentence_embeddings_dict
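# Illustrative EMA arithmetic on hypothetical 1-D "embeddings" (not from the dataset):
# with 3 strides, alpha = 2 / (3 + 1) = 0.5, and the final stride's alpha is further
# scaled by its fraction of non-padding tokens (mask.sum() / sequence_length).
#   e1, e2, e3 = 1.0, 2.0, 4.0
#   ema = e1                                      # seed with the first stride
#   ema = 0.5 * e2 + 0.5 * ema                    # -> 1.5
#   ema = (0.5 * 0.75) * e3 + (1 - 0.375) * ema   # last stride 75% real tokens -> 2.4375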
# Mean Pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask
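# A minimal sanity check for mean_pooling (illustrative; the demo names are hypothetical).
# With one of two tokens masked out, the pooled vector equals the unmasked token:
#   demo_tokens = torch.tensor([[[1.0, 2.0], [9.0, 9.0]]])  # (batch=1, seq=2, dim=2)
#   demo_mask = torch.tensor([[1, 0]])                      # second token is padding
#   mean_pooling((demo_tokens,), demo_mask)                 # -> tensor([[1., 2.]])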
def apply_umap(embeddings, n_neighbors, min_dist):
    reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist)
    reduced_embeddings = reducer.fit_transform(embeddings)
    return reduced_embeddings
@dask.delayed
def process_single_combination(embeddings, params):
    n_neighbors, min_dist, n_clusters = params
    reduced_embeddings = apply_umap(embeddings, n_neighbors, min_dist)
    #bss_tss_ratio = calculate_bss_tss_ratio(reduced_embeddings, n_clusters)
    bss_tss_ratio = calculate_bss_tss_ratio_approx(reduced_embeddings, n_clusters)
    return bss_tss_ratio, params
def calculate_bss_tss_ratio_approx(embeddings, n_clusters, batch_size=batch_size):
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size,
                             random_state=42, init='k-means++', n_init='auto')
    labels = kmeans.fit_predict(embeddings)
    # Total Sum of Squares: spread of all points around the overall centroid
    centroid = np.mean(embeddings, axis=0)
    tss = np.sum((embeddings - centroid) ** 2)
    # Within-cluster Sum of Squares: spread of points around their own centroid
    wss = sum(np.sum((embeddings[labels == i] - kmeans.cluster_centers_[i]) ** 2)
              for i in range(n_clusters))
    # Between-cluster Sum of Squares via the identity TSS = BSS + WSS
    bss = tss - wss
    # Ratio of BSS to TSS (higher means better-separated clusters)
    bss_tss_ratio = bss / tss if tss > 0 else 0
    return bss_tss_ratio
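# Sanity check of the TSS = BSS + WSS identity on toy 1-D data (hypothetical values):
# points {0, 2} and {10, 12} with overall mean 6 give TSS = 36+16+16+36 = 104 and
# WSS = 1+1+1+1 = 4, hence BSS = 100 and bss_tss_ratio ~ 0.96; nearly all of the
# variance is between the clusters, which is what the grid search rewards.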
def cross_validation(embeddings, parameter_combinations, outer_k=5):
    warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.cluster._kmeans")
    outer_cv = KFold(n_splits=outer_k, shuffle=True, random_state=42)
    best_score = -1
    best_params = None
    delayed_results = []
    for train_idx, _ in outer_cv.split(embeddings):
        train_embeddings = [embeddings[i] for i in train_idx]
        for params in parameter_combinations:
            # Create a delayed task for each parameter combination
            delayed_result = process_single_combination(train_embeddings, params)
            delayed_results.append(delayed_result)
    # Compute all tasks in parallel
    results = dask.compute(*delayed_results)
    # Find the best parameter combination
    for score, params in results:
        if score > best_score:
            best_score = score
            best_params = params
    return best_params, best_score
def find_representative_labels(kmeans_model, labels, embeddings):
    warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.cluster._kmeans")
    centroids = kmeans_model.cluster_centers_
    # Find the closest point to each centroid
    closest, _ = pairwise_distances_argmin_min(centroids, embeddings)
    # Map these points to their corresponding labels
    representative_labels = [labels[idx] for idx in closest]
    return representative_labels
# Function to update the scatterplot based on the selected cluster
# (an early, non-interactive version; superseded by the widget version below)
def update_scatterplot(selected_cluster):
    plt.figure(figsize=(8, 6))
    plt.scatter(cluster_df.iloc[:, 0], cluster_df.iloc[:, 1], c=cluster_df['Cluster'], cmap='viridis')
    plt.xlabel('Column 0')
    plt.ylabel('Column 1')
    plt.colorbar(label='Cluster')
    plt.title(f'Scatterplot with Cluster {selected_cluster} Highlighted', fontsize=14)
    # Highlight the selected cluster
    selected_cluster_indices = cluster_df.loc[cluster_df['Cluster'] == selected_cluster]
    plt.scatter(selected_cluster_indices.iloc[:, 0], selected_cluster_indices.iloc[:, 1],
                c='red', label=f'Cluster {selected_cluster}')
    plt.legend()
    plt.show()
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

dataset = load_dataset("Abirate/english_quotes")
quotes = [item['quote'] for item in dataset['train']]

sentence_statistics = calculate_sentence_statistics_batched(model, tokenizer, quotes)
embeddings = list(sentence_statistics.values())

# Convert to tensor
embeddings_tensor = torch.tensor(embeddings)

# Metadata labels for each sentence
metadata_labels = list(sentence_statistics.keys())

# TensorBoard logging: make sure the tensor is 2-D (one row per sentence)
reshaped_embeddings = embeddings_tensor.squeeze(1)

# If adding embeddings multiple times, use a unique global_step each time
writer = SummaryWriter('./')
global_step = 0  # Increment this appropriately when logging repeatedly
writer.add_embedding(reshaped_embeddings, metadata=metadata_labels, global_step=global_step)
# Close the writer when done
writer.close()

# Run in a shell:
#   tensorboard --logdir=./ --host 0.0.0.0

labels_embeddings = dict(zip(metadata_labels, embeddings_tensor))
random_sample = dict(random.sample(list(labels_embeddings.items()), sample_size))
labels_sample = list(random_sample.keys())
embeddings_sample = [random_sample[k] for k in labels_sample]

# Convert the list of PyTorch tensors to a single NumPy array
embeddings_sample_numpy = [e.numpy() for e in embeddings_sample]
embeddings_array = np.array(embeddings_sample_numpy)
# Define the hyperparameter space (flip the flag for the wider sweep)
if False:
    n_neighbors_values = [5, 8, 13, 21, 34, 55, 89]
    min_dist_values = [.05, .08, .13, .21, .34, .55, .89]
    n_clusters_values = [5, 8, 13, 21, 34, 55, 89]
else:
    n_neighbors_values = [8, 13, 21, 34, 55]
    min_dist_values = [.13, .21, .34, .55, .89]
    n_clusters_values = [13, 21, 34, 55, 89]

# Generate the parameter grid
parameter_grid = list(product(n_neighbors_values, min_dist_values, n_clusters_values))

# Randomly sample 125 combinations (here that is the entire 5x5x5 grid, shuffled)
sampled_combinations = random.sample(parameter_grid, 125)

# Initialize a Dask client
client = Client(n_workers=8)

# Cross-validate the UMAP/KMeans hyperparameters on the sampled embeddings
best_params, best_score = cross_validation(embeddings_array, sampled_combinations)
client.close()
# In[261]:

best_params

# Function to filter out specific POS tags and stop words
def filter_out_pos_tags_and_stopwords(texts, disallowed_postags=['IN', 'CD', 'DT', 'EX', 'CC', 'TO', 'WP', 'RP', 'PRP$', 'PRP']):
    filtered_texts = []
    for text in texts:
        doc = nltk.pos_tag(nltk.word_tokenize(text))
        filtered_words = []
        for word, tag in doc:
            if tag not in disallowed_postags and word.lower() not in stop_words:
                filtered_words.append(word)
        filtered_texts.append(" ".join(filtered_words))
    return filtered_texts
# Define a function to extract a single text span for each representative label
def extract_single_text_span(representative_labels, top_words, span_length=20):
    extracted_spans = []  # To store the extracted text spans
    for i, top_word_list in enumerate(top_words):
        label = representative_labels[i]  # Get the representative label
        # Find the position of each top word in the representative label
        word_positions = {}
        for top_word in top_word_list:
            position = label.find(top_word)
            if position != -1:
                word_positions[top_word] = position
        # Check if any top words were found in the label
        if word_positions:
            # Get the start and end positions to create the span
            start = min(word_positions.values()) - span_length // 2
            end = max(word_positions.values()) + len(max(word_positions, key=len)) + span_length // 2
            # Ensure the start and end positions are within bounds
            start = max(0, start)
            end = min(len(label), end)
            # Extract the text span
            span = label[start:end]
            extracted_spans.append(span)
    return extracted_spans
def extract_center_text(representative_labels, top_words, span_length=30):
    extracted_centers = []  # To store the extracted center text
    for i, top_word_list in enumerate(top_words):
        label = representative_labels[i]  # Get the representative label
        # Find the position of each top word in the representative label
        word_positions = {}
        for top_word in top_word_list:
            position = label.find(top_word)
            if position != -1:
                word_positions[top_word] = position
        # Check if any top words were found in the label
        if word_positions:
            # Get the center position
            center_position = sum(word_positions.values()) // len(word_positions)
            # Get the start and end positions to create the span
            start = max(0, center_position - span_length // 2)
            end = min(len(label), center_position + span_length // 2)
            # Extract the center text
            center_text = label[start:end]
            extracted_centers.append(center_text)
    return extracted_centers
def extract_top_terms_per_cluster(df, num_terms=7):
    """
    Extract top terms per cluster using TF-IDF.
    :param df: DataFrame with 'sentence' and 'cluster' columns
    :param num_terms: Number of top terms to extract for each cluster
    :return: Dictionary with clusters as keys and top terms as values
    """
    # Initialize the TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['sentence'])
    feature_names = vectorizer.get_feature_names_out()
    top_terms_per_cluster = {}
    for cluster in df['cluster'].unique():
        # Subset by partition (cluster)
        cluster_rows = np.where(df['cluster'] == cluster)[0]
        # Aggregate TF-IDF scores for each term within the cluster
        aggregated_tfidf = tfidf_matrix[cluster_rows].sum(axis=0).A1
        top_indices = aggregated_tfidf.argsort()[-num_terms:][::-1]
        top_terms = [feature_names[i] for i in top_indices]
        top_terms_per_cluster[cluster] = ' '.join(top_terms)
    return top_terms_per_cluster
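# Hypothetical usage sketch for extract_top_terms_per_cluster (demo names only):
#   demo_df = pd.DataFrame({
#       'sentence': ['love conquers all', 'love is patient', 'time heals wounds'],
#       'cluster': [0, 0, 1],
#   })
#   extract_top_terms_per_cluster(demo_df, num_terms=2)
#   # -> dict mapping each cluster id to a space-joined string of its top TF-IDF terms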
# In[262]:

reducer = umap.UMAP(n_neighbors=best_params[0], min_dist=best_params[1])
reducer.fit(np.array(reshaped_embeddings))
umap_embeddings = reducer.transform(np.array(reshaped_embeddings))
labels_umap_embeddings = dict(zip(list(labels_embeddings.keys()), umap_embeddings))

kmeans_model = KMeans(n_clusters=best_params[2], random_state=42)
predicted_clusters = kmeans_model.fit_predict(umap_embeddings)

# Find the sentence closest to each centroid of the trained KMeans model
representative_labels = find_representative_labels(
    kmeans_model,
    list(labels_umap_embeddings.keys()),
    embeddings=np.array(list(labels_umap_embeddings.values())))

# Create a mapping from the original labels to the UMAP embeddings
labels_umap_embeddings_full = dict(zip(labels_embeddings.keys(), umap_embeddings))

# sklearn's default list of English stopwords
stop_words = list(text.ENGLISH_STOP_WORDS)

# Filter the representative labels
filtered_labels = filter_out_pos_tags_stopwords_contractions(representative_labels)
top_terms = []
if True:
    # TF-IDF over the filtered representative labels
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    tfidf_matrix = vectorizer.fit_transform(filtered_labels)
    feature_names = vectorizer.get_feature_names_out()
    for i in range(0, len(representative_labels)):
        items = list(set(representative_labels[i].split()))
        d = pd.DataFrame(tfidf_matrix.getrow(i).toarray(), columns=feature_names)
        selected = (d[d != 0.0].dropna(axis=1).T.reset_index()
                    .sort_values(by=0, ascending=False).head(7)['index'].values)
        matches = []
        for s in selected:
            # distinct loop variable so the outer index `i` is not shadowed
            for item in items:
                if re.match(r"\b" + s.lower() + r"\b", item.lower()):
                    matches.append(s)
        string = list(dict.fromkeys(matches))
        top_terms.append(' '.join(string))
# Call the function to extract single text spans
#single_text_spans = extract_single_text_span(representative_labels, top_terms)
# Call the function to extract the center characters for each representative label
#center_text_spans = extract_center_text(representative_labels, top_terms)

# Plotting clusters
plt.figure(figsize=(10, 6))
scatter = plt.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1],
                      c=predicted_clusters, cmap='viridis', marker='o',
                      edgecolor='k', s=50, alpha=0.6)

# Plotting centroids
centroids = kmeans_model.cluster_centers_
cluster_representative_dict = dict(zip(np.unique(predicted_clusters), representative_labels))
plt.scatter(centroids[:, 0], centroids[:, 1], c='black', s=300, marker='x')

# Annotate centroids in the plot with their top TF-IDF terms
for i, centroid in enumerate(centroids):
    label = top_terms[i]
    plt.annotate(label, (centroid[0], centroid[1]), textcoords="offset points",
                 xytext=(0, 10), ha='center', color='red')

# Title and labels
plt.title('2D visualization of KMeans clustering with representative labels')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')

# Creating a legend keyed by cluster id
plt.legend(*scatter.legend_elements(), title="Clusters")
plt.show()
# Calculate the overall mean of the data
overall_mean = np.mean(umap_embeddings, axis=0)
# Calculate Total Sum of Squares (TSS)
tss = np.sum((umap_embeddings - overall_mean) ** 2)
# Calculate Within-cluster Sum of Squares (WSS), then BSS via TSS = BSS + WSS
wss = sum(np.sum((umap_embeddings[predicted_clusters == i] - centroids[i]) ** 2)
          for i in range(best_params[2]))
bss = tss - wss
print("Total Sum of Squares (TSS):", tss)
print("Between Sum of Squares (BSS):", bss)
centroids = kmeans_model.cluster_centers_
dict_centroids = dict(zip(np.unique(predicted_clusters), centroids))
dict_terms = dict(zip(np.unique(predicted_clusters), top_terms))

cluster_df = pd.concat([
    pd.DataFrame(list(labels_umap_embeddings_full.keys())),
    pd.DataFrame(list(labels_umap_embeddings_full.values()), columns=['x', 'y']),
    pd.DataFrame(predicted_clusters),
    pd.DataFrame([cluster_representative_dict[c] for c in predicted_clusters]),
    pd.DataFrame([dict_terms[c] for c in predicted_clusters]),
    pd.DataFrame([dict_centroids[c] for c in predicted_clusters], columns=['C x', 'C y']),
    (pd.DataFrame(list(labels_umap_embeddings_full.values()), columns=['x', 'y'])
     - pd.DataFrame([dict_centroids[c] for c in predicted_clusters],
                    columns=['C x', 'C y']).rename(columns={'C x': 'x', 'C y': 'y'})
     ).rename(columns={'x': 'D x', 'y': 'D y'})
], axis=1)
cluster_df.columns = ['sentence', 'x', 'y', 'cluster', 'cluster rep', 'tfidf',
                      'C x', 'C y', 'D x', 'D y']
# RMS of the per-axis deltas from the assigned centroid
cluster_df['delta center'] = np.sqrt(
    (cluster_df['D x'].multiply(cluster_df['D x'])
     + cluster_df['D y'].multiply(cluster_df['D y'])) / 2)

print(cluster_df.groupby('cluster')['sentence'].count().sort_values(ascending=False))
cluster_df = cluster_df.sort_values(by=['cluster', 'delta center'])
cluster_df.to_csv('clustered_sentences.csv')
# In[264]:

from IPython.display import display, clear_output

# Create widgets
cluster_dropdown = widgets.Dropdown(
    options=[(str(cluster), cluster) for cluster in np.sort(cluster_df['cluster'].unique())],
    description='Select Cluster:',
)
head_dropdown = widgets.Dropdown(
    value=5,
    options=[1, 2, 3, 5, 8, 13],
    description='Select Head filter:',
)
num_terms_dropdown = widgets.Dropdown(
    value=5,
    options=[1, 2, 3, 5, 8, 13],
    description='Select terms:',
)
delta_center_slider = widgets.Dropdown(
    value=.5,
    options=[.1, .2, .3, .5, .8, 1.3],
    description='Max Distance:',
)
output = widgets.Output()
def update_scatterplot(selected_cluster, head_filter, max_distance, num_terms):
    with output:
        clear_output(wait=True)
        # Keep only points within max_distance of their assigned centroid
        filtered_df = cluster_df[cluster_df['delta center'] <= max_distance]
        # Plot all clusters with the filtered points
        plt.figure(figsize=(10, 6))
        plt.scatter(filtered_df['x'], filtered_df['y'], c=filtered_df['cluster'],
                    cmap='viridis', marker='o', edgecolor='k', s=50, alpha=0.6)
        # Plot centroids
        centroids = kmeans_model.cluster_centers_
        plt.scatter(centroids[:, 0], centroids[:, 1], c='black', s=300, marker='x')
        # Recompute top terms from the head_filter sentences closest to each centroid
        # (reuses the module-level extract_top_terms_per_cluster defined above)
        top_terms = extract_top_terms_per_cluster(
            filtered_df.groupby('cluster').head(head_filter), num_terms=num_terms)
        top_terms = list(top_terms.values())
        # Annotate centroids
        for i, centroid in enumerate(centroids):
            label = top_terms[i]
            plt.annotate(label, (centroid[0], centroid[1]), textcoords="offset points",
                         xytext=(0, 10), ha='center', color='black')
        # Highlight the selected cluster
        selected_cluster_df = filtered_df[filtered_df['cluster'] == selected_cluster]
        plt.scatter(selected_cluster_df['x'], selected_cluster_df['y'], c='white', s=100, marker='x')
        # Title and labels
        plt.title('2D Visualization of KMeans Clustering with Representative Labels')
        plt.xlabel('UMAP Dimension 1')
        plt.ylabel('UMAP Dimension 2')
        plt.show()
        # Display the filtered and sorted DataFrame
        display_df = (filtered_df.groupby('cluster').head(head_filter)
                      .sort_values(by=['cluster', 'delta center']))
        display(display_df)

# Interactive widget
interactive_plot = widgets.interactive(update_scatterplot, selected_cluster=cluster_dropdown,
                                       head_filter=head_dropdown, num_terms=num_terms_dropdown,
                                       max_distance=delta_center_slider)
# Display the interactive components
display(interactive_plot, output)
# In[258]:

# Step 1: Export embeddings and metadata for TensorBoard
#   Embeddings: the pooled sentence embeddings and their UMAP projections
#   Metadata: sentences, cluster IDs, and cluster representatives

# Prepare the metadata file with headers, including the "Representative" column
metadata_df = pd.DataFrame({
    'Sentence': cluster_df['sentence'].str.replace('\t', ' '),
    'Cluster': cluster_df['cluster'],
    'Representative': cluster_df['cluster rep']
})

# Modify projector_config.pbtxt to match (a sample config appears near the end of this file)
metadata_file = "/data/sub-sentence-encoder/00000/default/metadata.tsv"
metadata_df.to_csv(metadata_file, sep='\t', index=False, header=True)

# Save the raw embeddings to a TSV file
embeddings_file = "/data/sub-sentence-encoder/00000/default/embeddings.tsv"
with open(embeddings_file, 'w') as f:
    for emb in embeddings:
        f.write('\t'.join(map(str, emb)) + '\n')

# Save the UMAP embeddings to a TSV file
embeddings_file = "/data/sub-sentence-encoder/00000/default/umap_embeddings.tsv"
with open(embeddings_file, 'w') as f:
    for emb in umap_embeddings:
        f.write('\t'.join(map(str, emb)) + '\n')

# Step 2: Set up the TensorBoard projector (usually run in the same directory as the TSV files)
# Create a log directory for TensorBoard
log_dir = "/data/sub-sentence-encoder/logs"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Create a projector config
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = 'umap_embeddings'
embedding.metadata_path = metadata_file
# Save the projector config
projector.visualize_embeddings(log_dir, config)
metadata = pd.read_csv("/data/sub-sentence-encoder/00000/default/metadata.tsv", delimiter='\t')

cluster_df.columns
# Drop a pre-existing embedding column (named 0) so this cell can be re-run
try:
    cluster_df.drop(columns=0, inplace=True)
except KeyError:
    pass
# Attach each sentence's raw embedding as an object column named 0
cluster_df = pd.concat([cluster_df.sort_index(), pd.DataFrame([e] for e in embeddings)], axis=1)
representative_labels_full = dict(zip(np.unique(predicted_clusters), representative_labels))

# Select representative embeddings
representative_embeddings = cluster_df.query('sentence in @representative_labels')[[0, 'sentence', 'cluster']]

# Perform vector arithmetic (pairwise addition) over the representative embeddings
combined_embeddings = []
for (emb1, emb2) in combinations(representative_embeddings[0], 2):
    combined_embeddings.append(emb1 + emb2)  # Simple addition, modify as needed

# Convert PyTorch tensors to NumPy arrays and ensure uniform shape
combined_embeddings_np = [emb.numpy() if isinstance(emb, torch.Tensor) else emb
                          for emb in combined_embeddings]

# Check that all embeddings have the same shape
embedding_shapes = [emb.shape for emb in combined_embeddings_np]
if len(set(embedding_shapes)) != 1:
    raise ValueError("Not all embeddings have the same shape.")

# Project the combined vectors with the fitted UMAP reducer and assign clusters
umap_combined_embeddings = reducer.transform(np.array(combined_embeddings_np))
predicted_clusters_for_combined = kmeans_model.predict(umap_combined_embeddings)

# Find the representative for each combined embedding, along with its distance to the centroid
closest_representatives_for_combined = []
distances_to_centroid = []
for idx, emb in enumerate(umap_combined_embeddings):
    cluster = predicted_clusters_for_combined[idx]  # Cluster of the combined embedding
    centroid = kmeans_model.cluster_centers_[cluster]
    distance = pairwise_distances([centroid], [emb])[0][0]  # Distance to centroid
    representative_label = representative_labels_full[cluster]
    closest_representatives_for_combined.append(representative_label)
    distances_to_centroid.append(distance)

for label, distance in zip(closest_representatives_for_combined, distances_to_centroid):
    print(f"Representative Sentence: {label}, Distance from centroid: {distance}")
# In[255]:

# pwd  # IPython magic: prints the current working directory in a notebook

# In[254]:

np.unique([*np.unique(closest_representatives_for_combined), *np.unique(cluster_df['cluster rep'])])

# In[260]:

#cat projector_config.pbtxt
""" | |
embeddings { | |
tensor_name: "default:00000" | |
metadata_path: "00000/default/metadata.tsv" | |
tensor_path: "00000/default/tensors.tsv" | |
tensor_name: "default:00001" | |
metadata_path: "00000/default/metadata.tsv" | |
tensor_path: "00000/default/embeddings.tsv" | |
} | |
""" | |
#tensorboard.sh | |
""" | |
tensorboard --logdir=./ --host 0.0.0.0 | |
""" | |
Workflow notes:
- Select points in the projector, edit the label of the selection, and download.
- Reload TensorBoard, select again without labeling, and download a second time.
- Compare the two downloads side by side in Excel, filtering one of the columns for the label you set (see the pandas sketch below).
- Use this as an example of the kind of inferences you can get from a RAG + LLM setup.
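A minimal sketch of the same comparison in pandas (the filenames, the 'Representative' column, and the label value are assumptions for illustration; the projector downloads are assumed to be tab-separated metadata):

import pandas as pd

# Hypothetical filenames for the two projector downloads
labeled = pd.read_csv('state_labeled.tsv', sep='\t')  # run where the selection was labeled
raw = pd.read_csv('state_raw.tsv', sep='\t')          # run where it was not

# Filter one column for the label that was set, then line the runs up side by side
selected = labeled[labeled['Representative'] == 'my_label']  # hypothetical label value
side_by_side = selected.merge(raw, on='Sentence', suffixes=('_labeled', '_raw'))
print(side_by_side.head())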
tensorflow/tensorboard#820 (comment)