@dbreunig
Last active December 29, 2024 14:33
A script to generate a glossary of key terms from your Jekyll posts. We're using DSPy to handle LLM interactions; it helps with boilerplate prompt context and parsing responses into Pydantic objects. To run this, put this script in a folder named 'scripts' (or whatever) in your Jekyll site directory. Then plug in your Anthropic API key (or point…
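# Assumed setup (summarizing the description above, not part of the original script):
# run this from a 'scripts/' folder inside your Jekyll site so the relative paths
# ../_posts and ../_data below resolve, with the dspy, pydantic, and pyyaml
# packages installed.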
from pathlib import Path
import re
import dspy
from typing import List
from pydantic import BaseModel
import yaml

# Set up DSPy and the LM
lm = dspy.LM('anthropic/claude-3-5-haiku-latest', api_key='YOUR_API_KEY')
dspy.configure(lm=lm)
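# Optional sketch (not part of the original gist): pull the key from the
# environment instead of hard-coding it, e.g.
#   import os
#   lm = dspy.LM('anthropic/claude-3-5-haiku-latest', api_key=os.getenv('ANTHROPIC_API_KEY'))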
#
# Get the initial list of terms
#
# Define the Term object we want returned
class Term(BaseModel):
    term: str = dspy.OutputField(desc="A glossary term, like: a technical term specific to the subject matter, a concept crucial to understanding an article's main ideas, a term explicitly defined or explained in a post, or a word or phrase that is frequently used or emphasized in the post. Do not include the abbreviation in the 'term' field.")
    abbreviation: str = dspy.OutputField(desc="Populate the abbreviation field if the term is abbreviated in the article, and ensure that it is not pluralized. If there is no abbreviation, populate the abbreviation field with an empty string.")
    definition: str = dspy.OutputField(desc="A definition of the term. Lightly edit the definition so it can stand alone outside the context of the post, but ensure that you do not add any information that is not present in the original text.")
    details: str = dspy.OutputField(desc="Text from the post that expounds a bit on the term, adding texture and details beyond the definition. The 'details' field can be empty if there is no additional context to provide and multiple paragraphs if there is more than one piece of context to provide.")
    synonyms: List[str] = dspy.OutputField(desc="Any synonyms, acronyms, or alternative terms that are used in the post.")
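# Purely illustrative (hypothetical values, not taken from any post): a returned
# Term might look like
#   Term(term='Retrieval-Augmented Generation', abbreviation='RAG',
#        definition='A technique where ...', details='', synonyms=['RAG'])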
# Find key terms for the post and terms where their definition might not be clear to the reader
class ExtractTerms(dspy.Signature):
    """Find key terms for the post and terms where their definition might not be clear to the reader, from a markdown blog post. Ignore all text between markdown code blocks."""
    post: str = dspy.InputField(desc="the markdown blog post")
    terms: List[Term] = dspy.OutputField(desc="Array of glossary terms.")

extractTerms = dspy.Predict(ExtractTerms)
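# Usage sketch (assumed DSPy behavior, values are placeholders): calling the
# predictor returns a Prediction whose .terms attribute is the List[Term]
# declared above, e.g.
#   prediction = extractTerms(post="Some markdown post body...")
#   for t in prediction.terms:
#       print(t.term, t.abbreviation)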
# Get the terms from the posts
posts_path = Path("../_posts")
glossary = []
for post_file in sorted(posts_path.glob('*.md')):
    print(f"Processing {post_file}")
    with open(post_file, 'r') as f:
        post_content = f.read()
    # Strip the YAML front matter so only the post body is sent to the model
    post_content = re.split(r'\n---\n', post_content, maxsplit=2)[-1]
    try:
        terms = extractTerms(post=post_content)
    except Exception as e:
        print(f"Failed to process {post_file}: {e}")
        continue
    for term in terms.terms:
        # Work with plain dicts so we can attach the source path and dump to YAML later
        term = term.model_dump()
        if term not in glossary:
            # Store the post path relative to the site root
            if str(post_file).startswith('../'):
                term['path'] = str(post_file)[3:]
            else:
                term['path'] = str(post_file)
            print(f"Adding term {term}")
            glossary.append(term)
    # break  # stop after the first post (handy when testing)
#
# Condense the glossary to unique terms
#
# Compare two term dicts to see if they are the same term
def compare_terms(term1, term2):
    if term1['term'].lower() == term2['term'].lower():
        return True
    if any(syn.lower() in [s.lower() for s in term2['synonyms']] for syn in term1['synonyms']):
        return True
    if term1['term'].lower() in [s.lower() for s in term2['synonyms']]:
        return True
    return False
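# For example (illustrative dicts), these two entries are treated as the same term
# because the first entry's term appears in the second entry's synonyms:
#   compare_terms({'term': 'LLM', 'synonyms': []},
#                 {'term': 'Large Language Model', 'synonyms': ['LLM']})  # -> True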
# Condense the glossary by finding identical terms and merging their definitions, details, and synonyms.
merged_glossary = {}
for term in glossary:
    found = False
    for key in merged_glossary:
        if compare_terms(term, merged_glossary[key]):
            found = True
            merged_glossary[key]['details'] += "\n\n" + term['details']
            merged_glossary[key]['synonyms'] += term['synonyms']
            merged_glossary[key]['pages'].append(term['path'])
            merged_glossary[key]['synonyms'] = list(set(merged_glossary[key]['synonyms']))
            break
    if not found:
        page = term['path']
        term['pages'] = [page]
        del term['path']
        merged_glossary[term['term']] = term
# Sort the merged glossary by keys
sorted_glossary = dict(sorted(merged_glossary.items()))
# Create the _data directory if it doesn't exist
Path("../_data").mkdir(parents=True, exist_ok=True)
# Write the sorted glossary values to a YAML file
with open('../_data/glossary.yaml', 'w') as yaml_file:
    yaml.dump(list(sorted_glossary.values()), yaml_file, default_flow_style=False, sort_keys=False)
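# The resulting _data/glossary.yaml is a list of entries shaped roughly like the
# following (field names come from the Term model plus the merge step above;
# the values here are placeholders):
#   - term: Example Term
#     abbreviation: ''
#     definition: A short, stand-alone definition.
#     details: Extra context pulled from the post(s).
#     synonyms: []
#     pages:
#     - _posts/2024-01-01-example-post.md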