A script to generate a glossary of key terms from your Jekyll posts. We're using DSPy to handle LLM interactions; it helps with boilerplate prompt context and parsing responses into Pydantic objects. To run this, put this script in a folder named 'scripts' (or whatever) in your Jekyll site directory. Then plug in your Anthropic API key (or point…
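To run it, use something like `python glossary.py` from inside that scripts folder (the filename is up to you; it isn't fixed by the gist). The relative paths in the script (`../_posts` and `../_data`) assume the folder sits one level below the Jekyll site root. You'll need DSPy and PyYAML installed, along with a valid Anthropic API key.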
from pathlib import Path
import re
from typing import List

import dspy
import yaml
from pydantic import BaseModel

# Set up DSPy and the LM
lm = dspy.LM('anthropic/claude-3-5-haiku-latest', api_key='YOUR_API_KEY')
dspy.configure(lm=lm)

#
# Get the initial list of terms
#

# Define the Term object we want returned
class Term(BaseModel):
    term: str = dspy.OutputField(desc="A glossary term, like: a technical term specific to the subject matter, a concept crucial to understanding an article's main ideas, a term explicitly defined or explained in a post, or a word or phrase that is frequently used or emphasized in the post. Do not include the abbreviation in the 'term' field.")
    abbreviation: str = dspy.OutputField(desc="Populate the abbreviation field if the term is abbreviated in the article, ensuring that it is not pluralized. If there is no abbreviation, populate the abbreviation field with an empty string.")
    definition: str = dspy.OutputField(desc="A definition of the term. Lightly edit the definition so it can stand alone outside the context of the post, but ensure that you do not add any information that is not present in the original text.")
    details: str = dspy.OutputField(desc="Text from the post that expounds a bit on the term, adding texture and details beyond the definition. The 'details' field can be empty if there is no additional context to provide, and multiple paragraphs if there is more than one piece of context to provide.")
    synonyms: List[str] = dspy.OutputField(desc="Any synonyms, acronyms, or alternative terms that are used in the post")

# Find key terms for the post, plus terms whose definition might not be clear to the reader
class ExtractTerms(dspy.Signature):
    """Find key terms for the post, plus terms whose definition might not be clear to the reader, from a markdown blog post. Ignore all text between markdown code blocks."""
    post: str = dspy.InputField(desc="the markdown blog post")
    terms: List[Term] = dspy.OutputField(desc="Array of glossary terms.")

extractTerms = dspy.Predict(ExtractTerms)

# Get the terms from the posts
posts_path = Path("../_posts")
glossary = []

for post_file in sorted(posts_path.glob('*.md')):
    print(f"Processing {post_file}")
    with open(post_file, 'r') as f:
        post_content = f.read()
    # Strip the YAML front matter so only the post body is sent to the LM
    post_content = re.split(r'\n---\n', post_content, maxsplit=2)[-1]
    try:
        terms = extractTerms(post=post_content)
    except Exception as e:
        print(f"Failed to process {post_file}: {e}")
        continue
    for term in terms.terms:
        # Convert the Pydantic Term into a plain dict so the merging code below
        # can use dictionary access, then record which post the term came from
        term = term.model_dump()
        if term not in glossary:
            if str(post_file).startswith('../'):
                term['path'] = str(post_file)[3:]
            else:
                term['path'] = str(post_file)
            print(f"Adding term {term}")
            glossary.append(term)
    # break  # early exit after the first post, presumably left in for testing

#
# Condense the glossary to unique terms
#

# Compare two term dicts to see if they are the same term
def compare_terms(term1, term2):
    if term1['term'].lower() == term2['term'].lower():
        return True
    if any(syn.lower() in [s.lower() for s in term2['synonyms']] for syn in term1['synonyms']):
        return True
    if term1['term'].lower() in [s.lower() for s in term2['synonyms']]:
        return True
    return False

# Condense the glossary by finding identical terms and merging their definitions, details, and synonyms.
merged_glossary = {}
for term in glossary:
    found = False
    for key in merged_glossary:
        if compare_terms(term, merged_glossary[key]):
            found = True
            merged_glossary[key]['details'] += "\n\n" + term['details']
            merged_glossary[key]['synonyms'] += term['synonyms']
            merged_glossary[key]['pages'].append(term['path'])
            merged_glossary[key]['synonyms'] = list(set(merged_glossary[key]['synonyms']))
            break
    if not found:
        page = term['path']
        term['pages'] = [page]
        del term['path']
        merged_glossary[term['term']] = term

# Sort the merged_glossary by keys
sorted_glossary = dict(sorted(merged_glossary.items()))

# Create the _data directory if it doesn't exist
Path("../_data").mkdir(parents=True, exist_ok=True)

# Write the sorted glossary values to a YAML file
with open('../_data/glossary.yaml', 'w') as yaml_file:
    yaml.dump(list(sorted_glossary.values()), yaml_file, default_flow_style=False, sort_keys=False)
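
#
# Optional sanity check (a sketch, not part of the original script): load the
# file back and confirm what was written. Jekyll picks up _data/glossary.yaml
# automatically and exposes it to templates as site.data.glossary.
with open('../_data/glossary.yaml') as f:
    written = yaml.safe_load(f)
print(f"Wrote {len(written)} glossary terms to _data/glossary.yaml")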