
@jduckles
Created May 21, 2024 18:52
May 21, 2024 Oblique Thinking Hour: Clustering of Group Concept Mapping responses

A quick and dirty exploration of using LLMs and embeddings to classify and cluster text. Tested during the Oblique Thinking Hour of Organizational Mycology on 2024-05-21.

Approach:

  1. Take a vector (list) of text (responses, one per line).
  2. Present each line to an “embedding” model, which places the text in a multi-dimensional space. Conceptually related text tends to sit near other related text in this “hyper-space”.
  3. Collapse the hyper-space to a lower-dimensional space (Principal Component Analysis).
  4. Cluster the reduced embeddings (a simple k-means here; a minimal sketch of steps 2-4 follows this list).
  5. Feed the contents of each cluster (each item) to an LLM with the following prompt:
  • Give me a short description of what is common across the lines of text after the colon. You should respond with no more than a five word description of what would be a good summary categorical name for these items:
    {out_items}
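For orientation, here is a minimal sketch of steps 2-4 using sentence-transformers and scikit-learn. The example lines are drawn from the sample data below, and the n_components and n_clusters values are arbitrary illustration values; the full script that follows uses larger values and adds the LLM naming (step 5) and Miro posting.

from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

lines = ["Cellular phone networks", "JupyterHub", "fiberoptic cables", "GitHub"]  # one response per line

embedder = SentenceTransformer("all-mpnet-base-v2")
embeddings = embedder.encode(lines)                        # step 2: embed each line
reduced = PCA(n_components=2).fit_transform(embeddings)    # step 3: collapse to fewer dimensions
labels = KMeans(n_clusters=2).fit_predict(reduced)         # step 4: cluster
print(list(zip(lines, labels)))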
Install the dependencies:

pip install scikit-learn requests llm sentence-transformers python-dotenv

To post results to Miro you'll need to create a .env file containing

MIRO_TOKEN=<your miro token>

Run like:

python cluster.py items.txt <target miroboard id>
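The script posts sticky notes to the given board and prints the LLM-named categories as JSON. The category names come back from the model and will vary run to run; the shape is roughly as follows (an illustrative, hypothetical excerpt using items from the sample data):

{
  "Core internet protocols": [
    {"item": "DNS", "category": "3"},
    {"item": "HTTP", "category": "3"},
    {"item": "fiberoptic cables", "category": "3"}
  ]
}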
cluster.py:

def warn(*args, **kwargs):
    pass

# Silence library warnings for cleaner output
import warnings
warnings.warn = warn

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import json
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA  # Optional: for reducing dimensions
from sentence_transformers import SentenceTransformer
from typing import List
import llm
import miro


def embed_items(items, embedder):
    """
    Embeds the given text items into numerical vectors using the specified embedding model.
    """
    return embedder.encode(items)


def cluster_embeddings(embeddings, n_clusters, cluster_algo):
    """
    Clusters embeddings into the specified number of categories using the given clustering algorithm.
    """
    return cluster_algo(n_clusters=n_clusters).fit_predict(embeddings)


def prep_output(items, labels):
    return [{"item": item, "category": str(label)} for item, label in zip(items, labels)]


def output_json(items, labels):
    """
    Creates a JSON string mapping items to their cluster labels.
    """
    return json.dumps(prep_output(items, labels), indent=2)


def group(data):
    from collections import defaultdict
    # Collect items into lists keyed by their category label
    category_groups = defaultdict(list)
    for item in data:
        category_groups[item['category']].append(item)
    # Convert the defaultdict to a regular dict for output
    return dict(category_groups)


def summarize(items):
    out_items = "\n".join(items)
    model = llm.get_model("gpt-4")
    prompt_text = f"""Give me a short description of what is common across the lines of text after the colon.
You should respond with no more than a five word description of what would be a good summary categorical name for these items:
{out_items}"""
    response = model.prompt(prompt_text)
    return response.text()


def main(items: List[str], n_clusters: int):
    """
    Embed, reduce, and cluster the items, returning numbered cluster categories.
    """
    # Embedding model
    embedder = SentenceTransformer('all-mpnet-base-v2')
    # Embed items
    embeddings = embed_items(items, embedder)
    # Optional: reduce dimensions for faster clustering when embeddings are large
    pca = PCA(n_components=25)  # Arbitrarily chosen component count
    embeddings = pca.fit_transform(embeddings)
    # Clustering algorithm; replace or modify as needed
    cluster_algo = KMeans
    # Cluster embeddings
    labels = cluster_embeddings(embeddings, n_clusters, cluster_algo)
    categories = prep_output(items, labels)
    return categories


def to_miro(named_categories, board):
    circle_points = miro.generate_circle_points(len(named_categories))
    for index, (category, items) in enumerate(named_categories.items()):
        # Post the category name as a light blue sticky at the cluster center
        miro.post_to_miro(board_id=board, message=category, center=circle_points[index], jitter=False, color="light_blue")
        # Post each item in the category, jittered around the same center
        for item in items:
            item_name = item['item']  # get the 'item' field from the dictionary
            miro.post_to_miro(board_id=board, message=item_name, center=circle_points[index])


if __name__ == "__main__":
    import sys
    # Read items, one per line
    sample_items = []
    with open(sys.argv[1], 'r') as file:
        for line in file:
            sample_items.append(line.strip())
    number_of_clusters = 9  # Example number, adjust based on needs
    board = sys.argv[2]
    categorized = main(sample_items, number_of_clusters)  # numbered cluster categories
    grouped = group(categorized)  # group into a dictionary keyed by category
    named_categories = {}  # LLM-named categories
    for key in grouped.keys():
        to_summarize = [record['item'] for record in grouped[key]]
        named_categories[summarize(to_summarize)] = grouped[key]
    to_miro(named_categories, board=board)
    print(json.dumps(named_categories, indent=2))
items.txt (example responses collected during the session, one per line):

Cellular phone networks
technology that supports our modern way of life
Accessible Wifi, Access to AI tools, Digital Skills Training
JupyterHub
static website creation (quarto, hugo), everything framasoft is doing (mailing list, peertube, mobilizon, forums, mattermost instance,...) , cryptpad.fr, mastodon,
Conda Forge
Jupyter
easy to use, integrated into regular daily tasks/work (not an extra step)
connects people over common interests/needs without feeling forced
holds knowledge and transfers it to the next/new person
Software
Software infrastructure and website hosting infrastructure
People being able to communicate across distances and time zones.
Ways that people can work together synchronously and asynchronously.
Communities being able to come together and connect with one another virtually.
Ideas and concepts being able move around the world.
communication platforms that are open, free, and lightly moderated
open source
Gives me information about when to do (e.g. swimming when I know that the tides are the right time).
Everyone can work on the same version of the plan - communication happens throughout the whole group.
cocreation of things like the vaccine (E.g. COVID)
access to knowledge
When you can't get people geographically coordinated how do you bring it together.
openness or open participation
being able to make and do things
people who typically aren't involved are welcomed.
Not necessarily a really "sexy" technology - just needs to work and be accessible.
it feels safe to share data/information
infinite agency to create within the landscape based on "addresses" (urls, email addreses, etc)
the people that keep it all running and innovating
able to meet new people
brings people together who wouldn't otherwise be able to connect.
being an owner of "my slice" of the universe
Known way to engage - we know how to be in connection.
GitHub
google docs
quarto
Removing barriers and obstructions, allowing intent and clarity to carry through to the end user
the threats that consolidated infrastructure present (privacy, overloads, governance, de-platforming)
something that goes hand in hand with agreed conventions of respect.
fiberoptic cables
DNS
HTTP
the Internet
Dashboards of cars - can think of dashboards as enabling and empowering.
the internet
http
Gopher
libraries
Open Source Structures / Licenses
things that help other people when they need it.
Idea exchanges
It's responsive to what the community needs are.
email address
video conferencing tools
levers for engagement and for exclusion
miro.py:

#!/usr/bin/env python
import requests
from dotenv import load_dotenv
import os
import random
import argparse
import math

load_dotenv()
api_token = os.getenv('MIRO_TOKEN')
board = os.getenv("BOARD")


def generate_circle_points(n, radius=800):
    """Return n (x, y) points evenly spaced around a circle of the given radius."""
    points = []
    for i in range(n):
        # Angle for this point
        theta = 2 * math.pi * i / n
        # x and y coordinates on the circle
        x = radius * math.cos(theta)
        y = radius * math.sin(theta)
        points.append((x, y))
    return tuple(points)


def post_to_miro(board_id, message, api_token=api_token, jitter=True, center=(100, 100), color='light_yellow'):
    url = f'https://api.miro.com/v2/boards/{board_id}/sticky_notes'
    headers = {
        'Authorization': f'Bearer {api_token}',
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    # Build the sticky note payload to add to the Miro board
    x, y = center
    if jitter:  # jitter points to avoid overplotting
        x_offset = random.randint(-150, 150)
        y_offset = random.randint(-150, 150)
    else:
        x_offset = 0
        y_offset = 0
    widget_data = {
        "data": {"content": message},
        "position": {
            "x": x + x_offset,
            "y": y + y_offset
        },
        "geometry": {
            "width": 100
        },
        "style": {"fillColor": color}
    }
    response = requests.post(url, json=widget_data, headers=headers)
    return response.json()


if __name__ == '__main__':
    # Check for the environment variable first
    api_token = os.getenv('MIRO_TOKEN')
    # Set up argparse
    parser = argparse.ArgumentParser(description="""Post a message to a Miro board as a sticky note placed at a
        random location around the central coordinates of the board.
        The API token for Miro should be set as MIRO_TOKEN or supplied as a command-line argument.""")
    # Arguments
    parser.add_argument("-k", "--key", dest="apikey", help="API key for the application, can also be set as environment variable MIRO_TOKEN", default=api_token)
    parser.add_argument("-b", "--board", dest="board", help="Miro board ID", default=board)
    parser.add_argument("-m", "--message", dest="message", help="Message for sticky note", required=True)
    args = parser.parse_args()
    # Use the API key from the command line if provided, otherwise from the environment
    api_token = args.apikey
    board_id = args.board
    message = args.message
    if not api_token:
        parser.error("API key must be provided through the -k option or the MIRO_TOKEN environment variable")
    resp = post_to_miro(board_id, message, api_token=api_token)
    print(resp)
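miro.py can also be run on its own to drop a single sticky note on a board, assuming MIRO_TOKEN is set in .env or the environment (the board ID below is a placeholder):

python miro.py -b <board id> -m "Hello from the command line"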