Skip to content

Instantly share code, notes, and snippets.

@lordjabez
Last active November 12, 2023 17:14
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lordjabez/e12e3d7d01879833246ae7c6253177a1 to your computer and use it in GitHub Desktop.
Save lordjabez/e12e3d7d01879833246ae7c6253177a1 to your computer and use it in GitHub Desktop.
Create theme reports from a WordPress blog export
#!/usr/bin/env python3
import contextlib
import html
import os
import sys

import cachier
import feedparser
import keyring
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
# Command-line arguments: the blog's domain (it is interpolated into the feed
# URL when downloading) and a comma-separated list of theme counts, one report
# being produced per count.
try:
    blog_domain = sys.argv[1]
    num_themes = [int(t) for t in sys.argv[2].split(',')]
    if any(count < 1 for count in num_themes):
        raise ValueError('theme counts must be positive integers')
except (IndexError, ValueError):
    # IndexError: an argument is missing.
    # ValueError: a theme count is not a positive integer.
    print('USAGE: generate-theme-reports.py BLOG_DOMAIN NUM_THEMES[,NUM_THEMES...]', file=sys.stderr)
    sys.exit(-1)
# Seed handed to every randomized step (sampling, k-means, t-SNE) so that
# repeated runs give the same results.
random_state = 42

# Upper bound on how many posts from one cluster are sent for summarization.
samples_per_cluster = 10

# Instruction placed at the top of every summarization prompt.
summarization_prompt = (
    'Summarize the theme of these posts in a single sentence. '
    'Do not include topic examples.'
)

# The OpenAI API key is looked up in the keyring under ('system', 'openai').
openai_api_key = keyring.get_password('system', 'openai')
openai_client = openai.Client(api_key=openai_api_key)
@cachier.cachier()
def create_embedding(text):
    """Return the embedding vector for ``text``.

    The text is echoed to stdout, then embedded with the
    ``text-embedding-ada-002`` model through the module-level OpenAI client.
    Results are cached by the ``cachier`` decorator.

    Args:
        text: The text to embed.

    Returns:
        The ``embedding`` of the first item in the API response.
    """
    print(text)
    embedding_items = openai_client.embeddings.create(
        model='text-embedding-ada-002',
        input=text,
    ).data
    return embedding_items[0].embedding
@cachier.cachier()
def create_completion(prompt):
    """Return the chat model's single-line reply to ``prompt``.

    The prompt is sent as one user message to ``gpt-4-1106-preview`` with
    temperature 0 and at most 100 completion tokens. Results are cached by
    the ``cachier`` decorator.

    Args:
        prompt: The full prompt text.

    Returns:
        The content of the first choice, with every newline replaced by a
        space.
    """
    response = openai_client.chat.completions.create(
        model='gpt-4-1106-preview',
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0,
        max_tokens=100,
    )
    reply = response.choices[0].message.content
    # Flatten the reply onto a single line.
    return ' '.join(reply.split('\n'))
def create_post(entry):
    """Reduce a feed entry to the fields used by the rest of the script.

    Args:
        entry: Mapping with ``id``, ``link`` and ``title`` keys, plus a
            ``content`` sequence whose first item has a ``value`` key.

    Returns:
        A dict with keys ``id``, ``link``, ``title`` and ``content``, where
        ``content`` is the ``value`` of the entry's first content item.

    Raises:
        KeyError: If a required key is missing.
        IndexError: If ``entry['content']`` is empty.
    """
    post = {field: entry[field] for field in ('id', 'link', 'title')}
    first_content = entry['content'][0]
    post['content'] = first_content['value']
    return post
@cachier.cachier()
def download_posts(blog_domain):
    """Download every post exposed by the blog's paginated RSS feed.

    Pages are requested from ``https://{blog_domain}?feed=rss2&paged=N``
    starting at N=1. Downloading stops at the first page that yields no post
    that has not been seen before, which covers both an empty page and a
    server that ignores ``paged`` and keeps returning the same entries.
    Results are cached by the ``cachier`` decorator.

    Args:
        blog_domain: Host (and optional path) of the blog, without scheme.

    Returns:
        A ``pandas.DataFrame`` with one row per unique post and the columns
        produced by ``create_post``; empty if the feed has no entries.
    """
    print(f'Downloading posts from {blog_domain}')
    posts = []
    seen_ids = set()
    page = 1
    while True:
        feed_url = f'https://{blog_domain}?feed=rss2&paged={page}'
        entries = feedparser.parse(feed_url)['entries']
        new_posts = []
        for entry in entries:
            post = create_post(entry)
            # Skip posts already collected from an earlier page so repeated
            # entries neither duplicate rows nor keep the loop running.
            if post['id'] not in seen_ids:
                seen_ids.add(post['id'])
                new_posts.append(post)
        if not new_posts:
            break
        posts.extend(new_posts)
        page += 1
    return pd.DataFrame(posts)
def compute_embeddings(posts):
    """Add an ``embedding`` column to ``posts`` in place.

    Args:
        posts: DataFrame with a ``content`` column; each row's content is
            passed to ``create_embedding``.
    """
    print('Computing embeddings for posts')

    def embed_row(row):
        return create_embedding(row.content)

    posts['embedding'] = posts.apply(embed_row, axis=1)
def generate_clusters(posts, num_clusters):
    """Cluster the post embeddings and project them to two dimensions.

    The k-means labels are stored in place in ``posts['cluster_index']``.

    Args:
        posts: DataFrame with an ``embedding`` column.
        num_clusters: Number of k-means clusters to compute.

    Returns:
        The 2-component t-SNE projection of the stacked embeddings, one row
        per post.
    """
    print(f'Computing {num_clusters} clusters from post embeddings')
    post_vectors = np.vstack(posts.embedding.values)
    kmeans = KMeans(n_init='auto', n_clusters=num_clusters, init='k-means++', random_state=random_state)
    posts['cluster_index'] = kmeans.fit(post_vectors).labels_
    # t-SNE requires perplexity < n_samples, so small blogs need a value
    # below the default of 15; larger inputs keep using 15.
    perplexity = min(15, len(post_vectors) - 1)
    tsne = TSNE(n_components=2, perplexity=perplexity, learning_rate=200, init='random', random_state=random_state)
    return tsne.fit_transform(post_vectors)
def get_cluster_theme(posts, cluster_index):
    """Summarize one cluster's theme from a sample of its posts.

    Up to ``samples_per_cluster`` posts are sampled from the cluster with the
    module-level ``random_state``; their contents are joined and sent to
    ``create_completion`` together with the summarization prompt.

    Args:
        posts: DataFrame with ``cluster_index`` and ``content`` columns.
        cluster_index: Label of the cluster to summarize.

    Returns:
        A ``(cluster_samples, theme)`` tuple: the sampled rows and the
        completion text.
    """
    print(f'Determining the theme of cluster {cluster_index}')
    members = posts[posts.cluster_index == cluster_index]
    sample_size = min(len(members), samples_per_cluster)
    cluster_samples = members.sample(sample_size, random_state=random_state)
    sampled_text = '\n'.join(cluster_samples.content.values)
    prompt_parts = [summarization_prompt, 'Posts:', '###', sampled_text, '###', 'Theme:']
    theme = create_completion('\n\n'.join(prompt_parts))
    return cluster_samples, theme
def get_cluster_themes(posts):
    """Return the theme of every cluster present in ``posts``.

    Args:
        posts: DataFrame with a ``cluster_index`` column; clusters are taken
            to be numbered from 0 up to the column's maximum.

    Returns:
        A list of ``(cluster_index, cluster_samples, theme)`` tuples in
        ascending cluster order.
    """
    cluster_count = posts.cluster_index.max() + 1
    themes = []
    for index in range(cluster_count):
        samples, theme = get_cluster_theme(posts, index)
        themes.append((index, samples, theme))
    return themes
def create_theme_report(cluster_themes, report_filename):
    """Write an HTML report listing each cluster's theme and sample posts.

    Each cluster is headed by the name of a Tableau colour; the names repeat
    once there are more clusters than colours. Themes, links and titles are
    HTML-escaped because they come from the feed and the language model.

    Args:
        cluster_themes: Iterable of ``(cluster_index, cluster_samples, theme)``
            tuples, where ``cluster_samples`` has ``link`` and ``title``
            columns and ``theme`` is the summary text.
        report_filename: Path of the HTML file to create or overwrite; it is
            written as UTF-8.
    """
    print(f'Writing theme report to {report_filename}')
    color_names = [c.replace('tab:', '').capitalize() for c in mcolors.TABLEAU_COLORS]
    lines = [
        # Declare the encoding so non-ASCII titles render correctly.
        '<meta charset="utf-8"/>',
        '<h1>Topic Distribution</h1>',
        '<img src="clustering.png"/>',
    ]
    for cluster_index, cluster_samples, theme in cluster_themes:
        # Wrap around the palette instead of raising IndexError when there
        # are more clusters than named colours.
        color_name = color_names[cluster_index % len(color_names)]
        lines.append(f'<h1>{color_name} Posts</h1>')
        lines.append(f'<p>{html.escape(str(theme))}</p>')
        lines.append('<ul>')
        for link, title in zip(cluster_samples.link.values, cluster_samples.title.values):
            # NOTE(review): a title that already carries HTML markup will be
            # shown literally after escaping - confirm this is acceptable.
            lines.append(f'<li><a href="{html.escape(str(link))}">{html.escape(str(title))}</a></li>')
        lines.append('</ul>')
    with open(report_filename, 'w', encoding='utf-8') as report_file:
        report_file.write('\n'.join(lines) + '\n')
def create_theme_graph(posts, post_vectors, graph_filename):
    """Save a scatter plot of the projected posts, coloured by cluster.

    Every cluster's points are drawn semi-transparently in one Tableau colour
    and its mean position is marked with an ``x``. Colours repeat once there
    are more clusters than palette entries, so no cluster is left out.

    Args:
        posts: DataFrame with a ``cluster_index`` column.
        post_vectors: Two-column array of point coordinates, one row per
            post, in the same order as ``posts``.
        graph_filename: Path of the image file to write.
    """
    print(f'Writing theme graph to {graph_filename}')
    num_themes = posts.cluster_index.max() + 1
    x_values, y_values = post_vectors.T
    palette = list(mcolors.TABLEAU_COLORS)
    figure = plt.figure()
    try:
        for cluster_index in range(num_themes):
            color = palette[cluster_index % len(palette)]
            # Build the membership mask once and reuse it for both axes.
            in_cluster = (posts.cluster_index == cluster_index).values
            x_points = x_values[in_cluster]
            y_points = y_values[in_cluster]
            plt.scatter(x_points, y_points, color=color, alpha=0.3)
            plt.scatter(x_points.mean(), y_points.mean(), marker='x', color=color, s=100)
        plt.savefig(graph_filename)
    finally:
        # Release the figure; one is created per report and pyplot otherwise
        # keeps them all alive.
        plt.close(figure)
def create_report(posts, post_vectors, cluster_themes):
    """Write the HTML report and cluster graph for the current clustering.

    Output goes to ``reports/<N>-themes``, where N is the largest value of
    ``posts.cluster_index`` plus one; the folder is created if needed.

    Args:
        posts: DataFrame with a ``cluster_index`` column.
        post_vectors: Coordinates forwarded to ``create_theme_graph``.
        cluster_themes: Theme tuples forwarded to ``create_theme_report``.
    """
    theme_count = posts.cluster_index.max() + 1
    output_dir = os.path.join('reports', f'{theme_count}-themes')
    print(f'Creating report at {output_dir}')
    os.makedirs(output_dir, exist_ok=True)
    create_theme_report(cluster_themes, os.path.join(output_dir, 'index.html'))
    create_theme_graph(posts, post_vectors, os.path.join(output_dir, 'clustering.png'))
# Script body: download the posts once, embed them once, then build one
# report per requested theme count.
posts = download_posts(blog_domain)
if posts.empty:
    # Without posts there is nothing to embed or cluster; stop with a clear
    # message rather than failing later inside the numeric code.
    sys.exit(f'No posts found at {blog_domain}')
compute_embeddings(posts)
# Use a distinct loop name so the ``num_themes`` list is not rebound while it
# is being iterated.
for theme_count in num_themes:
    post_vectors = generate_clusters(posts, theme_count)
    cluster_themes = get_cluster_themes(posts)
    create_report(posts, post_vectors, cluster_themes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment