lordjabez/generate-theme-reports.py

## generate-theme-reports.py
#!/usr/bin/env python3


import contextlib
import os
import sys

import cachier
import feedparser
import keyring
import openai

import numpy as np
import pandas as pd
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE


# Parse the command line: the blog's domain first, then a comma-separated
# list of cluster counts (one report is produced per count).
try:
    blog_domain = sys.argv[1]
    num_themes = [int(t) for t in sys.argv[2].split(',')]
except (IndexError, ValueError):
    # IndexError: an argument is missing; ValueError: a count is not an integer.
    print('USAGE: generate-theme-reports.py BLOG_DOMAIN NUM_THEMES[,NUM_THEMES...]')
    sys.exit(-1)


# Upper bound on how many posts are sampled from a cluster for its theme prompt.
samples_per_cluster = 10
# Seed handed to KMeans, TSNE and DataFrame.sample.
random_state = 42
# Instruction placed at the start of every theme-summarization prompt.
summarization_prompt = 'Summarize the theme of these posts in a single sentence. Do not include topic examples.'


# The API key is looked up in the keyring under ('system', 'openai').
openai_api_key = keyring.get_password('system', 'openai')
# Shared client used by create_embedding and create_completion.
openai_client = openai.Client(api_key=openai_api_key)


@cachier.cachier()
def create_embedding(text):
    """Return the 'text-embedding-ada-002' embedding of ``text``.

    The text is echoed to stdout before the request is sent. The function is
    wrapped in ``cachier.cachier()``; presumably repeated calls with the same
    text are served from its cache (confirm against the cachier docs).

    Args:
        text: The string to embed.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    print(text)
    result = openai_client.embeddings.create(
        input=text,
        model='text-embedding-ada-002',
    )
    first_item = result.data[0]
    return first_item.embedding


@cachier.cachier()
def create_completion(prompt):
    """Send ``prompt`` as a single user message and return the reply on one line.

    The request uses model 'gpt-4-1106-preview' with temperature 0 and at most
    100 tokens. Wrapped in ``cachier.cachier()``; presumably identical prompts
    are answered from its cache (confirm against the cachier docs).

    Args:
        prompt: Full prompt text.

    Returns:
        The first choice's message content with newlines replaced by spaces.
    """
    reply = openai_client.chat.completions.create(
        model='gpt-4-1106-preview',
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0,
        max_tokens=100,
    )
    answer = reply.choices[0].message.content
    return answer.replace('\n', ' ')


def create_post(entry):
    """Convert one parsed feed entry into a flat post record.

    Args:
        entry: Mapping with 'id', 'link' and 'title' keys, plus either a
            non-empty 'content' list whose first item has a 'value' key, or
            a 'summary' key.

    Returns:
        A dict with 'id', 'link', 'title' and 'content' keys.

    Raises:
        KeyError: If 'id', 'link' or 'title' is missing, or the entry has
            neither usable 'content' nor a 'summary'.
    """
    # Not every entry carries a full-content element; fall back to the
    # summary instead of failing on the missing 'content' key.
    content_items = entry.get('content')
    if content_items:
        content = content_items[0]['value']
    else:
        content = entry['summary']
    return {
        'id': entry['id'],
        'link': entry['link'],
        'title': entry['title'],
        'content': content,
    }


@cachier.cachier()
def download_posts(blog_domain):
    """Download all posts from the blog's paged RSS feed.

    Pages are requested from ``https://{blog_domain}?feed=rss2&paged=N`` for
    N = 1, 2, ... until a page yields no post that has not been seen yet.
    Wrapped in ``cachier.cachier()``; presumably the result is cached per
    domain (confirm against the cachier docs).

    Args:
        blog_domain: Host (and optional path) of the blog, without scheme.

    Returns:
        A DataFrame with 'id', 'link', 'title' and 'content' columns, one row
        per post; the columns are present even when no post was found.
    """
    print(f'Downloading posts from {blog_domain}')
    posts = []
    seen_ids = set()
    page = 1
    while True:
        feed_url = f'https://{blog_domain}?feed=rss2&paged={page}'
        entries = feedparser.parse(feed_url)['entries']
        # Keep only posts not collected yet: a server that ignores `paged`
        # would otherwise hand back the same page forever.
        new_posts = []
        for post in map(create_post, entries):
            if post['id'] not in seen_ids:
                seen_ids.add(post['id'])
                new_posts.append(post)
        if not new_posts:
            break
        posts.extend(new_posts)
        page += 1
    # Explicit columns keep the frame usable when nothing was downloaded.
    return pd.DataFrame(posts, columns=['id', 'link', 'title', 'content'])


def compute_embeddings(posts):
    """Add an 'embedding' column to ``posts`` in place.

    Args:
        posts: DataFrame whose rows expose a ``content`` field; each row's
            content is passed to ``create_embedding``.

    Returns:
        None. The frame is modified in place.
    """
    print('Computing embeddings for posts')

    def embed_row(row):
        # One embedding per row, computed from that row's content.
        return create_embedding(row.content)

    posts['embedding'] = posts.apply(embed_row, axis=1)


def generate_clusters(posts, num_clusters):
    """Cluster the post embeddings and project them to two dimensions.

    Writes each post's KMeans label into ``posts['cluster_index']`` in place.

    Args:
        posts: DataFrame with an 'embedding' column of equal-length vectors.
            At least two posts are needed for the 2-D projection.
        num_clusters: Number of KMeans clusters to compute.

    Returns:
        The TSNE ``fit_transform`` result for the stacked embeddings
        (``n_components=2``, one row per post).
    """
    print(f'Computing {num_clusters} clusters from post embeddings')
    post_vectors = np.vstack(posts.embedding.values)
    kmeans = KMeans(n_init='auto', n_clusters=num_clusters, init='k-means++', random_state=random_state)
    kmeans.fit(post_vectors)
    posts['cluster_index'] = kmeans.labels_
    # TSNE requires perplexity < n_samples, so shrink the usual value of 15
    # for small blogs instead of letting fit_transform raise.
    perplexity = min(15, max(len(post_vectors) - 1, 1))
    tsne = TSNE(n_components=2, perplexity=perplexity, learning_rate=200, init='random', random_state=random_state)
    return tsne.fit_transform(post_vectors)


def get_cluster_theme(posts, cluster_index):
    """Sample posts from one cluster and ask the model for their common theme.

    Args:
        posts: DataFrame with 'cluster_index' and 'content' columns.
        cluster_index: Label of the cluster to summarize.

    Returns:
        A ``(cluster_samples, theme)`` tuple: the sampled rows and the text
        returned by ``create_completion``.
    """
    print(f'Determining the theme of cluster {cluster_index}')
    members = posts[posts.cluster_index == cluster_index]
    # Never ask for more rows than the cluster holds.
    sample_size = min(members.shape[0], samples_per_cluster)
    cluster_samples = members.sample(sample_size, random_state=random_state)
    sampled_text = '\n'.join(cluster_samples.content.values)
    prompt_parts = (summarization_prompt, 'Posts:', '###', sampled_text, '###', 'Theme:')
    theme = create_completion('\n\n'.join(prompt_parts))
    return cluster_samples, theme


def get_cluster_themes(posts):
    """Determine the theme of every cluster present in ``posts``.

    Args:
        posts: DataFrame with a 'cluster_index' column; cluster labels are
            taken to run from 0 to its maximum.

    Returns:
        A list of ``(cluster_index, cluster_samples, theme)`` tuples in
        ascending cluster order.
    """
    cluster_count = posts.cluster_index.max() + 1
    themes = []
    for index in range(cluster_count):
        samples, theme = get_cluster_theme(posts, index)
        themes.append((index, samples, theme))
    return themes


def create_theme_report(cluster_themes, report_filename):
    """Write the HTML theme report.

    The report embeds ``clustering.png`` and, for each cluster, a heading
    named after the cluster's plot color, the theme sentence, and links to
    the sampled posts.

    Args:
        cluster_themes: Iterable of ``(cluster_index, cluster_samples, theme)``
            where ``cluster_samples`` has 'link' and 'title' columns.
        report_filename: Path of the HTML file to (over)write.
    """
    import html  # local import: only this function needs it

    print(f'Writing theme report to {report_filename}')
    color_names = [c.replace('tab:', '').capitalize() for c in mcolors.TABLEAU_COLORS]
    # Fixed encoding so non-ASCII titles don't depend on the platform default.
    with open(report_filename, 'w', encoding='utf-8') as report_file, contextlib.redirect_stdout(report_file):
        print('<meta charset="utf-8"/>')
        print('<h1>Topic Distribution</h1>')
        print('<img src="clustering.png"/>')
        for cluster_index, cluster_samples, theme in cluster_themes:
            # Wrap around the palette so a cluster index past its end does
            # not raise IndexError.
            color_name = color_names[cluster_index % len(color_names)]
            print(f'<h1>{color_name} Posts</h1>')
            # Feed and model text is external input: escape it before
            # writing it into markup.
            print(f'<p>{html.escape(str(theme))}</p>')
            print('<ul>')
            for link, title in zip(cluster_samples.link.values, cluster_samples.title.values):
                print(f'<li><a href="{html.escape(str(link))}">{html.escape(str(title))}</a></li>')
            print('</ul>')


def create_theme_graph(posts, post_vectors, graph_filename):
    """Save a scatter plot of the projected posts, colored by cluster.

    Each cluster's posts are drawn semi-transparent in one palette color and
    the cluster mean is marked with an 'x' in the same color.

    Args:
        posts: DataFrame with a 'cluster_index' column, row-aligned with
            ``post_vectors``.
        post_vectors: Array with one (x, y) row per post.
        graph_filename: Path of the image file to write.
    """
    print(f'Writing theme graph to {graph_filename}')
    num_themes = posts.cluster_index.max() + 1
    x_values, y_values = post_vectors.T
    labels = posts.cluster_index.values
    palette = list(mcolors.TABLEAU_COLORS)
    figure = plt.figure()
    try:
        for cluster_index in range(num_themes):
            # Cycle through the palette so clusters past its end are still
            # plotted rather than silently dropped.
            color = palette[cluster_index % len(palette)]
            in_cluster = labels == cluster_index
            x_points = x_values[in_cluster]
            y_points = y_values[in_cluster]
            plt.scatter(x_points, y_points, color=color, alpha=0.3)
            plt.scatter(x_points.mean(), y_points.mean(), marker='x', color=color, s=100)
        plt.savefig(graph_filename)
    finally:
        # One figure is created per report; close it so figures do not
        # accumulate across reports.
        plt.close(figure)


def create_report(posts, post_vectors, cluster_themes):
    """Create the report folder for the current clustering and fill it.

    The folder is ``reports/<N>-themes`` where N is the largest cluster index
    plus one; it receives ``index.html`` and ``clustering.png``.

    Args:
        posts: DataFrame with a 'cluster_index' column.
        post_vectors: 2-D projection passed on to ``create_theme_graph``.
        cluster_themes: Theme tuples passed on to ``create_theme_report``.
    """
    theme_count = posts.cluster_index.max() + 1
    report_folder = os.path.join('reports', f'{theme_count}-themes')
    print(f'Creating report at {report_folder}')
    os.makedirs(report_folder, exist_ok=True)
    html_path = os.path.join(report_folder, 'index.html')
    image_path = os.path.join(report_folder, 'clustering.png')
    create_theme_report(cluster_themes, html_path)
    create_theme_graph(posts, post_vectors, image_path)


# Download and embed the posts once, then build one report per requested
# theme count.
posts = download_posts(blog_domain)
if posts.empty:
    # Nothing to embed or cluster; stop with a clear message.
    sys.exit(f'No posts found at {blog_domain}')
compute_embeddings(posts)

# A separate loop name keeps the `num_themes` list from being rebound to an
# int while it is being iterated.
for theme_count in num_themes:
    post_vectors = generate_clusters(posts, theme_count)
    cluster_themes = get_cluster_themes(posts)
    create_report(posts, post_vectors, cluster_themes)
	#!/usr/bin/env python3


	import contextlib
	import os
	import sys

	import cachier
	import feedparser
	import keyring
	import openai

	import numpy as np
	import pandas as pd
	import matplotlib.colors as mcolors
	import matplotlib.pyplot as plt
	from sklearn.cluster import KMeans
	from sklearn.manifold import TSNE


	# Parse the command line: the blog's domain first, then a comma-separated
	# list of cluster counts (one report is produced per count).
	try:
	    blog_domain = sys.argv[1]
	    num_themes = [int(t) for t in sys.argv[2].split(',')]
	except (IndexError, ValueError):
	    # IndexError: an argument is missing; ValueError: a count is not an integer.
	    print('USAGE: generate-theme-reports.py BLOG_DOMAIN NUM_THEMES[,NUM_THEMES...]')
	    sys.exit(-1)


	# Upper bound on how many posts are sampled from a cluster for its theme prompt.
	samples_per_cluster = 10
	# Seed handed to KMeans, TSNE and DataFrame.sample.
	random_state = 42
	# Instruction placed at the start of every theme-summarization prompt.
	summarization_prompt = 'Summarize the theme of these posts in a single sentence. Do not include topic examples.'


	# The API key is looked up in the keyring under ('system', 'openai').
	openai_api_key = keyring.get_password('system', 'openai')
	# Shared client used by create_embedding and create_completion.
	openai_client = openai.Client(api_key=openai_api_key)


	@cachier.cachier()
	def create_embedding(text):
	    """Return the 'text-embedding-ada-002' embedding of ``text``.

	    The text is echoed to stdout before the request is sent. The function
	    is wrapped in ``cachier.cachier()``; presumably repeated calls with the
	    same text are served from its cache (confirm against the cachier docs).

	    Args:
	        text: The string to embed.

	    Returns:
	        The ``embedding`` attribute of the first item in the response data.
	    """
	    print(text)
	    response = openai_client.embeddings.create(model='text-embedding-ada-002', input=text)
	    return response.data[0].embedding


	@cachier.cachier()
	def create_completion(prompt):
	    """Send ``prompt`` as a single user message and return the reply on one line.

	    The request uses model 'gpt-4-1106-preview' with temperature 0 and at
	    most 100 tokens. Wrapped in ``cachier.cachier()``; presumably identical
	    prompts are answered from its cache (confirm against the cachier docs).

	    Args:
	        prompt: Full prompt text.

	    Returns:
	        The first choice's message content with newlines replaced by spaces.
	    """
	    messages = [{'role': 'user', 'content': prompt}]
	    response = openai_client.chat.completions.create(
	        model='gpt-4-1106-preview',
	        messages=messages,
	        temperature=0,
	        max_tokens=100,
	    )
	    return response.choices[0].message.content.replace('\n', ' ')


	def create_post(entry):
	    """Convert one parsed feed entry into a flat post record.

	    Args:
	        entry: Mapping with 'id', 'link' and 'title' keys, plus either a
	            non-empty 'content' list whose first item has a 'value' key,
	            or a 'summary' key.

	    Returns:
	        A dict with 'id', 'link', 'title' and 'content' keys.

	    Raises:
	        KeyError: If 'id', 'link' or 'title' is missing, or the entry has
	            neither usable 'content' nor a 'summary'.
	    """
	    # Not every entry carries a full-content element; fall back to the
	    # summary instead of failing on the missing 'content' key.
	    content_items = entry.get('content')
	    if content_items:
	        content = content_items[0]['value']
	    else:
	        content = entry['summary']
	    return {
	        'id': entry['id'],
	        'link': entry['link'],
	        'title': entry['title'],
	        'content': content,
	    }


	@cachier.cachier()
	def download_posts(blog_domain):
	    """Download all posts from the blog's paged RSS feed.

	    Pages are requested from ``https://{blog_domain}?feed=rss2&paged=N``
	    for N = 1, 2, ... until a page yields no post that has not been seen
	    yet. Wrapped in ``cachier.cachier()``; presumably the result is cached
	    per domain (confirm against the cachier docs).

	    Args:
	        blog_domain: Host (and optional path) of the blog, without scheme.

	    Returns:
	        A DataFrame with 'id', 'link', 'title' and 'content' columns, one
	        row per post; the columns are present even when no post was found.
	    """
	    print(f'Downloading posts from {blog_domain}')
	    posts = []
	    seen_ids = set()
	    page = 1
	    while True:
	        feed_url = f'https://{blog_domain}?feed=rss2&paged={page}'
	        entries = feedparser.parse(feed_url)['entries']
	        # Keep only posts not collected yet: a server that ignores `paged`
	        # would otherwise hand back the same page forever.
	        new_posts = []
	        for post in map(create_post, entries):
	            if post['id'] not in seen_ids:
	                seen_ids.add(post['id'])
	                new_posts.append(post)
	        if not new_posts:
	            break
	        posts.extend(new_posts)
	        page += 1
	    # Explicit columns keep the frame usable when nothing was downloaded.
	    return pd.DataFrame(posts, columns=['id', 'link', 'title', 'content'])


	def compute_embeddings(posts):
	    """Add an 'embedding' column to ``posts`` in place.

	    Args:
	        posts: DataFrame whose rows expose a ``content`` field; each row's
	            content is passed to ``create_embedding``.

	    Returns:
	        None. The frame is modified in place.
	    """
	    print('Computing embeddings for posts')
	    posts['embedding'] = posts.apply(lambda r: create_embedding(r.content), axis=1)


	def generate_clusters(posts, num_clusters):
	    """Cluster the post embeddings and project them to two dimensions.

	    Writes each post's KMeans label into ``posts['cluster_index']`` in place.

	    Args:
	        posts: DataFrame with an 'embedding' column of equal-length
	            vectors. At least two posts are needed for the 2-D projection.
	        num_clusters: Number of KMeans clusters to compute.

	    Returns:
	        The TSNE ``fit_transform`` result for the stacked embeddings
	        (``n_components=2``, one row per post).
	    """
	    print(f'Computing {num_clusters} clusters from post embeddings')
	    post_vectors = np.vstack(posts.embedding.values)
	    kmeans = KMeans(n_init='auto', n_clusters=num_clusters, init='k-means++', random_state=random_state)
	    kmeans.fit(post_vectors)
	    posts['cluster_index'] = kmeans.labels_
	    # TSNE requires perplexity < n_samples, so shrink the usual value of
	    # 15 for small blogs instead of letting fit_transform raise.
	    perplexity = min(15, max(len(post_vectors) - 1, 1))
	    tsne = TSNE(n_components=2, perplexity=perplexity, learning_rate=200, init='random', random_state=random_state)
	    return tsne.fit_transform(post_vectors)


	def get_cluster_theme(posts, cluster_index):
	    """Sample posts from one cluster and ask the model for their common theme.

	    Args:
	        posts: DataFrame with 'cluster_index' and 'content' columns.
	        cluster_index: Label of the cluster to summarize.

	    Returns:
	        A ``(cluster_samples, theme)`` tuple: the sampled rows and the
	        text returned by ``create_completion``.
	    """
	    print(f'Determining the theme of cluster {cluster_index}')
	    cluster = posts[posts.cluster_index == cluster_index]
	    # Never ask for more rows than the cluster holds.
	    num_samples = min(cluster.shape[0], samples_per_cluster)
	    cluster_samples = cluster.sample(num_samples, random_state=random_state)
	    # Separate name: do not rebind the `posts` frame to a string.
	    sampled_text = '\n'.join(cluster_samples.content.values)
	    prompt = '\n\n'.join((summarization_prompt, 'Posts:', '###', sampled_text, '###', 'Theme:'))
	    return cluster_samples, create_completion(prompt)


	def get_cluster_themes(posts):
	    """Determine the theme of every cluster present in ``posts``.

	    Args:
	        posts: DataFrame with a 'cluster_index' column; cluster labels are
	            taken to run from 0 to its maximum.

	    Returns:
	        A list of ``(cluster_index, cluster_samples, theme)`` tuples in
	        ascending cluster order.
	    """
	    num_themes = posts.cluster_index.max() + 1
	    return [(i, *get_cluster_theme(posts, i)) for i in range(num_themes)]


	def create_theme_report(cluster_themes, report_filename):
	    """Write the HTML theme report.

	    The report embeds ``clustering.png`` and, for each cluster, a heading
	    named after the cluster's plot color, the theme sentence, and links to
	    the sampled posts.

	    Args:
	        cluster_themes: Iterable of ``(cluster_index, cluster_samples,
	            theme)`` where ``cluster_samples`` has 'link' and 'title'
	            columns.
	        report_filename: Path of the HTML file to (over)write.
	    """
	    import html  # local import: only this function needs it

	    print(f'Writing theme report to {report_filename}')
	    color_names = [c.replace('tab:', '').capitalize() for c in mcolors.TABLEAU_COLORS]
	    # Fixed encoding so non-ASCII titles don't depend on the platform default.
	    with open(report_filename, 'w', encoding='utf-8') as report_file, contextlib.redirect_stdout(report_file):
	        print('<meta charset="utf-8"/>')
	        print('<h1>Topic Distribution</h1>')
	        print('<img src="clustering.png"/>')
	        for cluster_index, cluster_samples, theme in cluster_themes:
	            # Wrap around the palette so a cluster index past its end does
	            # not raise IndexError.
	            color_name = color_names[cluster_index % len(color_names)]
	            print(f'<h1>{color_name} Posts</h1>')
	            # Feed and model text is external input: escape it before
	            # writing it into markup.
	            print(f'<p>{html.escape(str(theme))}</p>')
	            print('<ul>')
	            for link, title in zip(cluster_samples.link.values, cluster_samples.title.values):
	                print(f'<li><a href="{html.escape(str(link))}">{html.escape(str(title))}</a></li>')
	            print('</ul>')


	def create_theme_graph(posts, post_vectors, graph_filename):
	    """Save a scatter plot of the projected posts, colored by cluster.

	    Each cluster's posts are drawn semi-transparent in one palette color
	    and the cluster mean is marked with an 'x' in the same color.

	    Args:
	        posts: DataFrame with a 'cluster_index' column, row-aligned with
	            ``post_vectors``.
	        post_vectors: Array with one (x, y) row per post.
	        graph_filename: Path of the image file to write.
	    """
	    print(f'Writing theme graph to {graph_filename}')
	    num_themes = posts.cluster_index.max() + 1
	    x_values, y_values = post_vectors.T
	    labels = posts.cluster_index.values
	    palette = list(mcolors.TABLEAU_COLORS)
	    figure = plt.figure()
	    try:
	        for cluster_index in range(num_themes):
	            # Cycle through the palette so clusters past its end are
	            # still plotted rather than silently dropped.
	            color = palette[cluster_index % len(palette)]
	            in_cluster = labels == cluster_index
	            x_points = x_values[in_cluster]
	            y_points = y_values[in_cluster]
	            plt.scatter(x_points, y_points, color=color, alpha=0.3)
	            plt.scatter(x_points.mean(), y_points.mean(), marker='x', color=color, s=100)
	        plt.savefig(graph_filename)
	    finally:
	        # One figure is created per report; close it so figures do not
	        # accumulate across reports.
	        plt.close(figure)


	def create_report(posts, post_vectors, cluster_themes):
	    """Create the report folder for the current clustering and fill it.

	    The folder is ``reports/<N>-themes`` where N is the largest cluster
	    index plus one; it receives ``index.html`` and ``clustering.png``.

	    Args:
	        posts: DataFrame with a 'cluster_index' column.
	        post_vectors: 2-D projection passed on to ``create_theme_graph``.
	        cluster_themes: Theme tuples passed on to ``create_theme_report``.
	    """
	    num_themes = posts.cluster_index.max() + 1
	    report_folder = os.path.join('reports', f'{num_themes}-themes')
	    print(f'Creating report at {report_folder}')
	    os.makedirs(report_folder, exist_ok=True)
	    report_filename = os.path.join(report_folder, 'index.html')
	    graph_filename = os.path.join(report_folder, 'clustering.png')
	    create_theme_report(cluster_themes, report_filename)
	    create_theme_graph(posts, post_vectors, graph_filename)


	# Download and embed the posts once, then build one report per requested
	# theme count.
	posts = download_posts(blog_domain)
	if posts.empty:
	    # Nothing to embed or cluster; stop with a clear message.
	    sys.exit(f'No posts found at {blog_domain}')
	compute_embeddings(posts)

	# A separate loop name keeps the `num_themes` list from being rebound to
	# an int while it is being iterated.
	for theme_count in num_themes:
	    post_vectors = generate_clusters(posts, theme_count)
	    cluster_themes = get_cluster_themes(posts)
	    create_report(posts, post_vectors, cluster_themes)