A script to generate a glossary of key terms from your Jekyll posts. We're using DSPy to handle LLM interactions; it helps with boilerplate prompt context and parsing responses into Pydantic objects. To run this, put this script in a folder named 'scripts' (or whatever) in your Jekyll site directory. Then plug in your Anthropic API key (or point…
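To run it, use something like `python glossary.py` from inside that scripts folder (the filename is up to you; it isn't fixed by the gist). The relative paths in the script (`../_posts` and `../_data`) assume the folder sits one level below the Jekyll site root. You'll need DSPy and PyYAML installed, along with a valid Anthropic API key.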
from pathlib import Path
import re
from typing import List

import dspy
import yaml
from pydantic import BaseModel

# Set up DSPy and the LM
lm = dspy.LM('anthropic/claude-3-5-haiku-latest', api_key='YOUR_API_KEY')
dspy.configure(lm=lm)

#
# Get the initial list of terms
#

# Define the Term object we want returned
class Term(BaseModel):
    term: str = dspy.OutputField(desc="A glossary term, like: a technical term specific to the subject matter, a concept crucial to understanding an article's main ideas, a term explicitly defined or explained in a post, or a word or phrase that is frequently used or emphasized in the post. Do not include the abbreviation in the 'term' field.")
    abbreviation: str = dspy.OutputField(desc="Populate the abbreviation field if the term is abbreviated in the article, ensuring that it is not pluralized. If there is no abbreviation, populate the abbreviation field with an empty string.")
    definition: str = dspy.OutputField(desc="A definition of the term. Lightly edit the definition so it can stand alone outside the context of the post, but ensure that you do not add any information that is not present in the original text.")
    details: str = dspy.OutputField(desc="Text from the post that expounds a bit on the term, adding texture and details beyond the definition. The 'details' field can be empty if there is no additional context to provide, and multiple paragraphs if there is more than one piece of context to provide.")
    synonyms: List[str] = dspy.OutputField(desc="Any synonyms, acronyms, or alternative terms that are used in the post")

# Find key terms for the post, plus terms whose definition might not be clear to the reader
class ExtractTerms(dspy.Signature):
    """Find key terms for the post, plus terms whose definition might not be clear to the reader, from a markdown blog post. Ignore all text between markdown code blocks."""
    post: str = dspy.InputField(desc="the markdown blog post")
    terms: List[Term] = dspy.OutputField(desc="Array of glossary terms.")

extractTerms = dspy.Predict(ExtractTerms)

# Get the terms from the posts
posts_path = Path("../_posts")
glossary = []

for post_file in sorted(posts_path.glob('*.md')):
    print(f"Processing {post_file}")
    with open(post_file, 'r') as f:
        post_content = f.read()
    # Strip the YAML front matter so only the post body is sent to the LM
    post_content = re.split(r'\n---\n', post_content, maxsplit=2)[-1]
    try:
        terms = extractTerms(post=post_content)
    except Exception as e:
        print(f"Failed to process {post_file}: {e}")
        continue
    for term in terms.terms:
        # Convert the Pydantic Term into a plain dict so the merging code below
        # can use dictionary access, then record which post the term came from
        term = term.model_dump()
        if term not in glossary:
            if str(post_file).startswith('../'):
                term['path'] = str(post_file)[3:]
            else:
                term['path'] = str(post_file)
            print(f"Adding term {term}")
            glossary.append(term)
    # break  # early exit after the first post, presumably left in for testing

#
# Condense the glossary to unique terms
#

# Compare two term dicts to see if they are the same term
def compare_terms(term1, term2):
    if term1['term'].lower() == term2['term'].lower():
        return True
    if any(syn.lower() in [s.lower() for s in term2['synonyms']] for syn in term1['synonyms']):
        return True
    if term1['term'].lower() in [s.lower() for s in term2['synonyms']]:
        return True
    return False

# Condense the glossary by finding identical terms and merging their definitions, details, and synonyms.
merged_glossary = {}
for term in glossary:
    found = False
    for key in merged_glossary:
        if compare_terms(term, merged_glossary[key]):
            found = True
            merged_glossary[key]['details'] += "\n\n" + term['details']
            merged_glossary[key]['synonyms'] += term['synonyms']
            merged_glossary[key]['pages'].append(term['path'])
            merged_glossary[key]['synonyms'] = list(set(merged_glossary[key]['synonyms']))
            break
    if not found:
        page = term['path']
        term['pages'] = [page]
        del term['path']
        merged_glossary[term['term']] = term

# Sort the merged_glossary by keys
sorted_glossary = dict(sorted(merged_glossary.items()))

# Create the _data directory if it doesn't exist
Path("../_data").mkdir(parents=True, exist_ok=True)

# Write the sorted glossary values to a YAML file
with open('../_data/glossary.yaml', 'w') as yaml_file:
    yaml.dump(list(sorted_glossary.values()), yaml_file, default_flow_style=False, sort_keys=False)
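
#
# Optional sanity check (a sketch, not part of the original script): load the
# file back and confirm what was written. Jekyll picks up _data/glossary.yaml
# automatically and exposes it to templates as site.data.glossary.
with open('../_data/glossary.yaml') as f:
    written = yaml.safe_load(f)
print(f"Wrote {len(written)} glossary terms to _data/glossary.yaml")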