def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Installed below as a drop-in replacement for ``warnings.warn`` so that
    warnings raised through that attribute are discarded.
    """
    pass


import warnings

# Override the stdlib entry point with the no-op above. This is process-wide
# and affects every later ``warnings.warn(...)`` lookup through the module
# attribute, including deprecation notices.
# NOTE(review): ``warnings.filterwarnings("ignore")`` is the documented way to
# silence warnings; presumably the monkeypatch was chosen because an imported
# library resets the filters - confirm before changing.
warnings.warn = warn
|
|
|
import os

# Must be set before the sentence_transformers import further down.
# NOTE(review): presumably silences the HuggingFace tokenizers
# fork/parallelism warning - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
|
import json
from typing import List, Dict, Any, Callable, Optional

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA  # Optional: For reducing dimensions
from sentence_transformers import SentenceTransformer

import llm
import miro
|
|
|
def embed_items(items, embedder):
    """Embed text items into numerical vectors.

    Args:
        items: The texts to embed; passed unchanged to ``embedder.encode``.
        embedder: Any object exposing ``encode(items)``. ``main`` passes a
            ``SentenceTransformer`` instance.

    Returns:
        Whatever ``embedder.encode(items)`` returns.
    """
    return embedder.encode(items)
|
|
|
def cluster_embeddings(embeddings, n_clusters, cluster_algo: Optional[Callable[..., Any]] = None):
    """Cluster embeddings into ``n_clusters`` groups.

    Args:
        embeddings: The vectors to cluster; passed unchanged to
            ``fit_predict``.
        n_clusters: Number of clusters, forwarded as the ``n_clusters``
            keyword when constructing the clustering model.
        cluster_algo: Factory called as ``cluster_algo(n_clusters=...)`` that
            returns an object with ``fit_predict(embeddings)``. Defaults to
            ``KMeans`` when omitted.

    Returns:
        The result of ``fit_predict`` (one label per embedding for KMeans).
    """
    # Resolved lazily so the default does not have to exist at definition time.
    if cluster_algo is None:
        cluster_algo = KMeans
    return cluster_algo(n_clusters=n_clusters).fit_predict(embeddings)
|
|
|
def prep_output(items, labels):
    """Pair each item with its cluster label.

    Args:
        items: The original items, in the order they were clustered.
        labels: One label per item, in the same order.

    Returns:
        A list of ``{"item": item, "category": str(label)}`` dicts. Labels
        are stringified so the result can be serialized as JSON. Pairing
        stops at the shorter of the two inputs.
    """
    return [{"item": item, "category": str(label)} for item, label in zip(items, labels)]
|
|
|
def output_json(items, labels):
    """Create a JSON string mapping items to their cluster labels.

    Args:
        items: The original items.
        labels: One cluster label per item.

    Returns:
        The records built by ``prep_output`` as a JSON array, indented by two
        spaces.
    """
    return json.dumps(prep_output(items, labels), indent=2)
|
|
|
def group(data):
    """Group records by their ``'category'`` value.

    Args:
        data: Iterable of dicts, each with a ``'category'`` key (the shape
            produced by ``prep_output``).

    Returns:
        A plain dict mapping each category to the list of records that carry
        it. Categories appear in first-seen order and records keep their
        input order.

    Raises:
        KeyError: If a record has no ``'category'`` key.
    """
    category_groups = {}
    for item in data:
        # setdefault creates the bucket on first sight of a category.
        category_groups.setdefault(item['category'], []).append(item)
    return category_groups
|
|
|
def summarize(items, post_miro=False):
    """Ask the LLM for a short category name covering ``items``.

    Args:
        items: The strings belonging to one cluster; joined one per line and
            appended to the prompt.
        post_miro: Accepted for compatibility; not used by this function.

    Returns:
        The text of the model's response.
    """
    out_items = "\n".join(items)
    model = llm.get_model("gpt-4")
    prompt_text = (
        "Give me a short description of what is common across the lines of text after the colon.\n"
        "You should respond with no more than a five word description of what would be a good summary categorical name for these items:\n"
        f"{out_items}"
    )
    response = model.prompt(prompt_text)
    return response.text()
|
|
|
def main(items: List[str], n_clusters: int) -> List[Dict[str, str]]:
    """Embed, reduce and cluster ``items``.

    Args:
        items: The texts to categorize.
        n_clusters: Number of clusters to ask the clustering model for.

    Returns:
        One ``{"item": ..., "category": ...}`` dict per item, where the
        category is the stringified cluster label. An empty input yields an
        empty list.
    """
    if not items:
        # Nothing to embed or cluster.
        return []

    # Embedding options
    embedder = SentenceTransformer('all-mpnet-base-v2')

    # Embed items
    embeddings = embed_items(items, embedder)

    # Optional: reduce dimensions for faster clustering. 25 is an arbitrarily
    # chosen target; PCA rejects n_components larger than
    # min(n_samples, n_features), so clamp it for small inputs.
    n_components = min(25, len(embeddings), len(embeddings[0]))
    pca = PCA(n_components=n_components)
    embeddings = pca.fit_transform(embeddings)

    # Clustering options
    cluster_algo = KMeans  # Example clustering model, replace or modify as needed

    # Cluster embeddings
    labels = cluster_embeddings(embeddings, n_clusters, cluster_algo)

    return prep_output(items, labels)
|
|
|
|
|
def to_miro(named_categories, board):
    """Post every category and its items to a Miro board.

    Args:
        named_categories: Dict mapping a category name to a list of records;
            each record's ``'item'`` value is posted.
        board: Board identifier, forwarded as ``board_id``.

    One point per category is requested from ``miro.generate_circle_points``.
    The category name is posted at that point (no jitter, light blue), and
    each item is then posted with the same centre.
    """
    # assumes generate_circle_points(n) returns an indexable of n points - TODO confirm
    circle_points = miro.generate_circle_points(len(named_categories))

    for index, (category, items) in enumerate(named_categories.items()):
        center = circle_points[index]

        # The category name acts as the title note for its cluster.
        miro.post_to_miro(board_id=board, message=category, center=center, jitter=False, color="light_blue")

        for item in items:
            miro.post_to_miro(board_id=board, message=item['item'], center=center)
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # Usage: <script> ITEMS_FILE BOARD_ID
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} ITEMS_FILE BOARD_ID")

    items_path = sys.argv[1]
    board = sys.argv[2]

    # One item per line. Blank lines are skipped so they are not embedded and
    # posted as empty notes.
    with open(items_path, 'r') as file:
        sample_items = [line.strip() for line in file if line.strip()]

    if not sample_items:
        sys.exit(f"no items found in {items_path}")

    # Example number, adjust based on needs. Capped at the item count because
    # KMeans cannot form more clusters than there are samples.
    number_of_clusters = min(9, len(sample_items))

    categorized = main(sample_items, number_of_clusters)  # Numbered cluster categories

    grouped = group(categorized)  # group as a dictionary by category

    named_categories = {}  # LLM named categories
    for records in grouped.values():
        name = summarize([record['item'] for record in records])

        # The LLM may return the same name for two clusters; add a numeric
        # suffix instead of silently overwriting the earlier cluster.
        unique_name = name
        suffix = 2
        while unique_name in named_categories:
            unique_name = f"{name} ({suffix})"
            suffix += 1
        named_categories[unique_name] = records

    to_miro(named_categories, board=board)
    print(json.dumps(named_categories, indent=2))