
@jduckles
Created May 21, 2024 18:52
May 21, 2024 Oblique Thinking Hour: Clustering of Group Concept Mapping responses

A quick and dirty exploration of using LLMs and embeddings to classify and cluster text. Tested during the Oblique Thinking Hour of Organizational Mycology on 2024-05-21.

Approach:

  1. Take a vector (list) of text (responses, one per line).
  2. Present each line to an “embedding” model, which places the text in a multi-dimensional space. Conceptually related text tends to sit near other related text in this “hyper-space”.
  3. Collapse the hyper-space to a lower-dimensional space (Principal Component Analysis).
  4. Cluster the reduced embeddings (a simple k-means here; a minimal sketch of steps 2-4 follows this list).
  5. Feed the contents of each cluster (each item) to an LLM with the following prompt:
  • Give me a short description of what is common across the lines of text after the colon. You should respond with no more than a five word description of what would be a good summary categorical name for these items:
    {out_items}
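For orientation, here is a minimal sketch of steps 2-4 using sentence-transformers and scikit-learn. The example lines are drawn from the sample data below, and the n_components and n_clusters values are arbitrary illustration values; the full script that follows uses larger values and adds the LLM naming (step 5) and Miro posting.

from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

lines = ["Cellular phone networks", "JupyterHub", "fiberoptic cables", "GitHub"]  # one response per line

embedder = SentenceTransformer("all-mpnet-base-v2")
embeddings = embedder.encode(lines)                        # step 2: embed each line
reduced = PCA(n_components=2).fit_transform(embeddings)    # step 3: collapse to fewer dimensions
labels = KMeans(n_clusters=2).fit_predict(reduced)         # step 4: cluster
print(list(zip(lines, labels)))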
Install the dependencies:

pip install scikit-learn requests llm sentence-transformers python-dotenv

To post results to Miro you'll need to create a .env file containing

MIRO_TOKEN=<your miro token>

Run like:

python cluster.py items.txt <target miroboard id>
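The script posts sticky notes to the given board and prints the LLM-named categories as JSON. The category names come back from the model and will vary run to run; the shape is roughly as follows (an illustrative, hypothetical excerpt using items from the sample data):

{
  "Core internet protocols": [
    {"item": "DNS", "category": "3"},
    {"item": "HTTP", "category": "3"},
    {"item": "fiberoptic cables", "category": "3"}
  ]
}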
cluster.py:

def warn(*args, **kwargs):
    pass

# Silence library warnings for cleaner output
import warnings
warnings.warn = warn

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import json
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA  # Optional: for reducing dimensions
from sentence_transformers import SentenceTransformer
from typing import List
import llm
import miro


def embed_items(items, embedder):
    """
    Embeds the given text items into numerical vectors using the specified embedding model.
    """
    return embedder.encode(items)


def cluster_embeddings(embeddings, n_clusters, cluster_algo):
    """
    Clusters embeddings into the specified number of categories using the given clustering algorithm.
    """
    return cluster_algo(n_clusters=n_clusters).fit_predict(embeddings)


def prep_output(items, labels):
    return [{"item": item, "category": str(label)} for item, label in zip(items, labels)]


def output_json(items, labels):
    """
    Creates a JSON string mapping items to their cluster labels.
    """
    return json.dumps(prep_output(items, labels), indent=2)


def group(data):
    from collections import defaultdict
    # Collect items into lists keyed by their category label
    category_groups = defaultdict(list)
    for item in data:
        category_groups[item['category']].append(item)
    # Convert the defaultdict to a regular dict for output
    return dict(category_groups)


def summarize(items):
    out_items = "\n".join(items)
    model = llm.get_model("gpt-4")
    prompt_text = f"""Give me a short description of what is common across the lines of text after the colon.
You should respond with no more than a five word description of what would be a good summary categorical name for these items:
{out_items}"""
    response = model.prompt(prompt_text)
    return response.text()


def main(items: List[str], n_clusters: int):
    """
    Embed, reduce, and cluster the items, returning numbered cluster categories.
    """
    # Embedding model
    embedder = SentenceTransformer('all-mpnet-base-v2')
    # Embed items
    embeddings = embed_items(items, embedder)
    # Optional: reduce dimensions for faster clustering when embeddings are large
    pca = PCA(n_components=25)  # Arbitrarily chosen component count
    embeddings = pca.fit_transform(embeddings)
    # Clustering algorithm; replace or modify as needed
    cluster_algo = KMeans
    # Cluster embeddings
    labels = cluster_embeddings(embeddings, n_clusters, cluster_algo)
    categories = prep_output(items, labels)
    return categories


def to_miro(named_categories, board):
    circle_points = miro.generate_circle_points(len(named_categories))
    for index, (category, items) in enumerate(named_categories.items()):
        # Post the category name as a light blue sticky at the cluster center
        miro.post_to_miro(board_id=board, message=category, center=circle_points[index], jitter=False, color="light_blue")
        # Post each item in the category, jittered around the same center
        for item in items:
            item_name = item['item']  # get the 'item' field from the dictionary
            miro.post_to_miro(board_id=board, message=item_name, center=circle_points[index])


if __name__ == "__main__":
    import sys
    # Read items, one per line
    sample_items = []
    with open(sys.argv[1], 'r') as file:
        for line in file:
            sample_items.append(line.strip())
    number_of_clusters = 9  # Example number, adjust based on needs
    board = sys.argv[2]
    categorized = main(sample_items, number_of_clusters)  # numbered cluster categories
    grouped = group(categorized)  # group into a dictionary keyed by category
    named_categories = {}  # LLM-named categories
    for key in grouped.keys():
        to_summarize = [record['item'] for record in grouped[key]]
        named_categories[summarize(to_summarize)] = grouped[key]
    to_miro(named_categories, board=board)
    print(json.dumps(named_categories, indent=2))
items.txt (example responses collected during the session, one per line):

Cellular phone networks
technology that supports our modern way of life
Accessible Wifi, Access to AI tools, Digital Skills Training
JupyterHub
static website creation (quarto, hugo), everything framasoft is doing (mailing list, peertube, mobilizon, forums, mattermost instance,...) , cryptpad.fr, mastodon,
Conda Forge
Jupyter
easy to use, integrated into regular daily tasks/work (not an extra step)
connects people over common interests/needs without feeling forced
holds knowledge and transfers it to the next/new person
Software
Software infrastructure and website hosting infrastructure
People being able to communicate across distances and time zones.
Ways that people can work together synchronously and asynchronously.
Communities being able to come together and connect with one another virtually.
Ideas and concepts being able move around the world.
communication platforms that are open, free, and lightly moderated
open source
Gives me information about when to do (e.g. swimming when I know that the tides are the right time).
Everyone can work on the same version of the plan - communication happens throughout the whole group.
cocreation of things like the vaccine (E.g. COVID)
access to knowledge
When you can't get people geographically coordinated how do you bring it together.
openness or open participation
being able to make and do things
people who typically aren't involved are welcomed.
Not necessarily a really "sexy" technology - just needs to work and be accessible.
it feels safe to share data/information
infinite agency to create within the landscape based on "addresses" (urls, email addreses, etc)
the people that keep it all running and innovating
able to meet new people
brings people together who wouldn't otherwise be able to connect.
being an owner of "my slice" of the universe
Known way to engage - we know how to be in connection.
GitHub
google docs
quarto
Removing barriers and obstructions, allowing intent and clarity to carry through to the end user
the threats that consolidated infrastructure present (privacy, overloads, governance, de-platforming)
something that goes hand in hand with agreed conventions of respect.
fiberoptic cables
DNS
HTTP
the Internet
Dashboards of cars - can think of dashboards as enabling and empowering.
the internet
http
Gopher
libraries
Open Source Structures / Licenses
things that help other people when they need it.
Idea exchanges
It's responsive to what the community needs are.
email address
video conferencing tools
levers for engagement and for exclusion
miro.py:

#!/usr/bin/env python
import requests
from dotenv import load_dotenv
import os
import random
import argparse
import math

load_dotenv()
api_token = os.getenv('MIRO_TOKEN')
board = os.getenv("BOARD")


def generate_circle_points(n, radius=800):
    """Return n (x, y) points evenly spaced around a circle of the given radius."""
    points = []
    for i in range(n):
        # Angle for this point
        theta = 2 * math.pi * i / n
        # x and y coordinates on the circle
        x = radius * math.cos(theta)
        y = radius * math.sin(theta)
        points.append((x, y))
    return tuple(points)


def post_to_miro(board_id, message, api_token=api_token, jitter=True, center=(100, 100), color='light_yellow'):
    url = f'https://api.miro.com/v2/boards/{board_id}/sticky_notes'
    headers = {
        'Authorization': f'Bearer {api_token}',
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    # Build the sticky note payload to add to the Miro board
    x, y = center
    if jitter:  # jitter points to avoid overplotting
        x_offset = random.randint(-150, 150)
        y_offset = random.randint(-150, 150)
    else:
        x_offset = 0
        y_offset = 0
    widget_data = {
        "data": {"content": message},
        "position": {
            "x": x + x_offset,
            "y": y + y_offset
        },
        "geometry": {
            "width": 100
        },
        "style": {"fillColor": color}
    }
    response = requests.post(url, json=widget_data, headers=headers)
    return response.json()


if __name__ == '__main__':
    # Check for the environment variable first
    api_token = os.getenv('MIRO_TOKEN')
    # Set up argparse
    parser = argparse.ArgumentParser(description="""Post a message to a Miro board as a sticky note placed at a
        random location around the central coordinates of the board.
        The API token for Miro should be set as MIRO_TOKEN or supplied as a command-line argument.""")
    # Arguments
    parser.add_argument("-k", "--key", dest="apikey", help="API key for the application, can also be set as environment variable MIRO_TOKEN", default=api_token)
    parser.add_argument("-b", "--board", dest="board", help="Miro board ID", default=board)
    parser.add_argument("-m", "--message", dest="message", help="Message for sticky note", required=True)
    args = parser.parse_args()
    # Use the API key from the command line if provided, otherwise from the environment
    api_token = args.apikey
    board_id = args.board
    message = args.message
    if not api_token:
        parser.error("API key must be provided through the -k option or the MIRO_TOKEN environment variable")
    resp = post_to_miro(board_id, message, api_token=api_token)
    print(resp)
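miro.py can also be run on its own to drop a single sticky note on a board, assuming MIRO_TOKEN is set in .env or the environment (the board ID below is a placeholder):

python miro.py -b <board id> -m "Hello from the command line"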