# rbatista191/fit_metatags_into_seo_practices.py

## fit_metatags_into_seo_practices.py
"""
SEO Meta Tag Optimizer for Markdown Files in a Specific Folder

This script optimizes SEO meta tags (title and description) for Markdown files
in a specific folder using OpenAI's GPT-4o model. It processes Markdown files
and updates their frontmatter with optimized title and description.

Features:
- Processes Markdown files in a specified folder
- Extracts and parses YAML frontmatter
- Generates SEO-optimized titles and descriptions using GPT-4o
- Updates frontmatter with new titles and descriptions
- Implements retry logic for API calls
- Provides detailed logging

Requirements:
- Python 3.x
- openai
- pyyaml
- python-dotenv

Usage:
1. Set up a .env file with your OpenAI API key: OPENAI_API_KEY=your_api_key_here
2. Specify the target folder path in the script
3. Run the script: python fit_metatags_into_seo_practices.py

Note: This script makes API calls to OpenAI, which may incur costs.
"""

import os
import glob
import openai
import json
import yaml
from dotenv import load_dotenv
import logging

# Read the .env file so OPENAI_API_KEY can be picked up through os.getenv
load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')

# Expose a single create_chat_completion() entry point regardless of which
# generation of the openai package is installed.
if not hasattr(openai, 'OpenAI'):
    # Older version: the module itself carries the ChatCompletion API
    client = openai

    def create_chat_completion(**kwargs):
        """Forward all keyword arguments to ``ChatCompletion.create``."""
        return client.ChatCompletion.create(**kwargs)
else:
    # New version (1.0.0 and later): calls go through a client object
    client = openai.OpenAI()

    def create_chat_completion(**kwargs):
        """Forward all keyword arguments to ``chat.completions.create``."""
        return client.chat.completions.create(**kwargs)

# Timestamped INFO-level logging for the whole script
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def process_folder(folder_path):
    """
    Process all Markdown files in the specified folder.

    Only files matching ``*.md`` directly inside ``folder_path`` are handled
    (no recursion). Files are processed in sorted order, and a failure in one
    file is logged without stopping the rest of the batch.

    Args:
    folder_path (str): The path to the folder containing Markdown files.
    """
    # Escape the folder part so characters such as '[' or '*' in the path are
    # matched literally instead of being interpreted as glob wildcards.
    pattern = os.path.join(glob.escape(folder_path), '*.md')
    for md_file in sorted(glob.glob(pattern)):
        logging.info(f'Processing file: {md_file}')
        try:
            process_file(md_file)
        except Exception as e:
            # Batch boundary: report the failure and carry on with the next file.
            logging.error(f'Error processing file {md_file}: {e}')

def extract_frontmatter(content):
    """
    Extract YAML frontmatter from Markdown content.

    The frontmatter is the text between a leading ``---`` and the next
    ``---``. Whenever no usable frontmatter is found (no leading delimiter,
    no closing delimiter, invalid YAML, or YAML that is not a mapping) the
    function returns an empty dict together with the full, untouched content.

    Args:
    content (str): The full content of the Markdown file.

    Returns:
    tuple: A tuple containing the parsed frontmatter (dict) and the remaining body (str).
    """
    if not content.startswith('---'):
        return {}, content

    parts = content.split('---', 2)
    if len(parts) < 3:
        # Opening delimiter without a closing one: unpacking three values
        # here would raise ValueError, so report it and bail out instead.
        logging.error("Frontmatter is missing its closing '---' delimiter")
        return {}, content

    _, raw_frontmatter, body = parts
    try:
        frontmatter = yaml.safe_load(raw_frontmatter)
    except yaml.YAMLError as e:
        logging.error(f"Error loading frontmatter YAML: {e}")
        return {}, content

    if frontmatter is None:
        # An empty block parses to None; keep the promised dict return type.
        return {}, body
    if not isinstance(frontmatter, dict):
        # A list or scalar cannot be queried with .get() by callers.
        logging.error(
            f"Frontmatter is not a YAML mapping: {type(frontmatter).__name__}"
        )
        return {}, content
    return frontmatter, body

def process_file(filepath):
    """
    Process a single Markdown file.

    Reads the file, checks the frontmatter ``title`` and ``description``
    against the length limits, regenerates whichever one is out of range and
    rewrites the frontmatter when something changed.

    Files whose leading ``---`` block could not be parsed into a mapping are
    skipped with a warning: rewriting them would replace the unparsed block
    and drop every key it contained.

    Args:
    filepath (str): The path to the Markdown file.

    Raises:
    RuntimeError: Propagated from generate_optimal_text when no valid text
        could be generated.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read()

    frontmatter, body = extract_frontmatter(content)

    if frontmatter is None:
        # An empty frontmatter block parses to None; treat it as "no keys yet".
        frontmatter = {}

    # extract_frontmatter hands back the whole content as the body when the
    # leading block failed to parse.
    unparsed_block = content.startswith('---') and body == content
    if unparsed_block or not isinstance(frontmatter, dict):
        logging.warning(f'Skipping file with unreadable frontmatter: {filepath}')
        return

    current_title = frontmatter.get('title', '')
    current_description = frontmatter.get('description', '')

    filename = os.path.basename(filepath)  # Get the filename from the path

    new_title = current_title
    new_description = current_description

    # Non-string values (a null or numeric YAML scalar) have no meaningful
    # length, so they are regenerated instead of being length-checked.
    title_ok = isinstance(current_title, str) and validate_length(current_title, 'title')
    if not title_ok:
        new_title = generate_optimal_text(body, 'title', filename)

    description_ok = (
        isinstance(current_description, str)
        and validate_length(current_description, 'description')
    )
    if not description_ok:
        new_description = generate_optimal_text(body, 'description', filename)

    if new_title != current_title or new_description != current_description:
        update_frontmatter(filepath, frontmatter, new_title, new_description)
    else:
        logging.info(f'No changes needed for file: {filepath}')

def generate_optimal_text(body, text_type, filename):
    """
    Generate an optimized title or description with the OpenAI chat API.

    Sends the first 500 characters of ``body`` and the filename (without its
    extension) to the "gpt-4o" model, asks for a JSON object keyed by
    ``text_type`` and returns the first answer that passes validate_length.
    Up to 10 attempts are made; API errors, malformed JSON and out-of-range
    lengths all count as failed attempts.

    Args:
    body (str): The main content of the Markdown file.
    text_type (str): Either 'title' or 'description'.
    filename (str): The name of the file being processed.

    Returns:
    str: The generated title or description.

    Raises:
    RuntimeError: If maximum retries are exceeded.
    """
    length_range = "35 to 45" if text_type == 'title' else "110 to 160"
    filename_without_extension = os.path.splitext(filename)[0]  # Remove file extension

    # The instructions name the requested text_type throughout, so a
    # description request is no longer told to write a "title".
    prompt = (
        f"Based on the following markdown content and filename, generate an optimal {text_type} "
        f"(ranging strictly from {length_range} characters) in JSON format. "
        "Stick to the requirements given in terms of character length at all costs! "
        f"The JSON key should be '{text_type}'. "
        f"For the {text_type}, try to incorporate relevant words from the filename: '{filename_without_extension}'. "
        f"Ensure the {text_type} is catchy and SEO-friendly.\n\n"
        "Content:\n" + body[:500]  # Limiting content to first 500 characters to avoid token limits
    )

    max_retries = 10
    for attempt in range(1, max_retries + 1):
        try:
            response = create_chat_completion(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a world-class SEO assistant that helps optimize meta tags. You are very strict with the requirements given in terms of character length."},
                    {"role": "user", "content": prompt}
                ],
                response_format={"type": "json_object"}
            )

            # Handle response based on OpenAI version
            if hasattr(openai, 'OpenAI'):
                result = response.choices[0].message.content.strip()
            else:
                result = response.choices[0].message['content'].strip()

            generated_text = json.loads(result)[text_type]
            if validate_length(generated_text, text_type):
                return generated_text
            logging.info(f"Generated {text_type} didn't fit the criteria.")
        except Exception as e:
            # Deliberately broad: API failures, a missing JSON key and
            # malformed JSON are all treated as one failed attempt.
            logging.error(f"Error in generating optimal {text_type}: {e}")

        # Only announce a retry when another attempt will actually happen.
        if attempt < max_retries:
            logging.info(
                f"Retrying {text_type} generation "
                f"(attempt {attempt + 1} of {max_retries})..."
            )

    raise RuntimeError(f"Exceeded maximum retries for generating optimal {text_type}")

def validate_length(text, text_type):
    """
    Validate the length of the title or description.

    Titles must be 35-45 characters long and descriptions 110-160 characters
    long (both bounds inclusive). Any ``text_type`` other than 'title' is
    checked against the description limits.

    Args:
    text (str): The text to validate. Non-string values (for example ``None``
        or a number parsed from YAML frontmatter) are reported as invalid
        instead of raising ``TypeError``.
    text_type (str): Either 'title' or 'description'.

    Returns:
    bool: True if the text meets the length criteria, False otherwise.
    """
    if not isinstance(text, str):
        logging.warning(
            f"{text_type.capitalize()} is not a string: {type(text).__name__}"
        )
        return False

    min_length, max_length = (35, 45) if text_type == 'title' else (110, 160)
    valid = min_length <= len(text) <= max_length

    if not valid:
        logging.warning(f"{text_type.capitalize()} length out of range: {len(text)}")
    return valid

def update_frontmatter(filepath, frontmatter, title, description):
    """
    Update the frontmatter of a Markdown file with new title and description.

    The file is re-read from disk. If it starts with a ``---`` delimited
    block, that block is replaced by the updated frontmatter; otherwise a new
    block is prepended. Everything after the closing delimiter is written
    back unchanged, existing keys keep their order, and the ``frontmatter``
    argument itself is not modified.

    Args:
    filepath (str): The path to the Markdown file.
    frontmatter (dict): The existing frontmatter.
    title (str): The new title.
    description (str): The new description.
    """
    old_title = frontmatter.get('title', '')
    old_description = frontmatter.get('description', '')

    # Replace colons with hyphens in title and description
    title = title.replace(':', ' -')
    description = description.replace(':', ' -')

    # Work on a copy so the caller's dict is left untouched, including when
    # the file turns out to be unreadable below.
    updated = dict(frontmatter)
    updated['title'] = title
    updated['description'] = description

    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()
    except IOError as e:
        logging.error(f'Error reading file {filepath}: {e}')
        return

    # sort_keys=False (PyYAML 5.1+) keeps the existing key order instead of
    # alphabetising the frontmatter on every rewrite.
    frontmatter_yaml = yaml.dump(updated, allow_unicode=True, sort_keys=False)

    if content.startswith('---'):
        parts = content.split('---', 2)
        if len(parts) < 3:
            logging.error('File split did not retrieve expected three parts')
            return
        # The body still begins with whatever followed the closing delimiter
        # (normally a newline), so it is appended directly; adding another
        # newline would insert one more blank line on every rewrite.
        body = parts[2]
        new_content = f"---\n{frontmatter_yaml}---{body}"
    else:
        new_content = f"---\n{frontmatter_yaml}---\n{content}"

    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(new_content)

    logging.info(f'Updated file: {filepath}')
    logging.info(f'Old title: {old_title} -> New title: {title}')
    logging.info(f'Old description: {old_description} -> New description: {description}')

# Main execution
if __name__ == "__main__":
    import sys

    # Target folder: the first command-line argument when given, otherwise
    # the default below. Relative paths are resolved against the current
    # working directory by os.path.abspath.
    target_folder = sys.argv[1] if len(sys.argv) > 1 else '../content/docs/blog'
    target_folder = os.path.abspath(target_folder)

    if not os.path.isdir(target_folder):
        logging.error(f"The specified folder does not exist: {target_folder}")
        # Non-zero exit status so shells and CI jobs can detect the failure.
        sys.exit(1)

    logging.info(f'Starting to process files in folder: {target_folder}')
    process_folder(target_folder)
    logging.info('Processing completed.')
	"""
	SEO Meta Tag Optimizer for Markdown Files in a Specific Folder

	This script optimizes SEO meta tags (title and description) for Markdown files
	in a specific folder using OpenAI's GPT-4o model. It processes Markdown files
	and updates their frontmatter with optimized title and description.

	Features:
	- Processes Markdown files in a specified folder
	- Extracts and parses YAML frontmatter
	- Generates SEO-optimized titles and descriptions using GPT-4o
	- Updates frontmatter with new titles and descriptions
	- Implements retry logic for API calls
	- Provides detailed logging

	Requirements:
	- Python 3.x
	- openai
	- pyyaml
	- python-dotenv

	Usage:
	1. Set up a .env file with your OpenAI API key: OPENAI_API_KEY=your_api_key_here
	2. Specify the target folder path in the script
	3. Run the script: python fit_metatags_into_seo_practices.py

	Note: This script makes API calls to OpenAI, which may incur costs.
	"""

	import os
	import glob
	import openai
	import json
	import yaml
	from dotenv import load_dotenv
	import logging

	# Load the OpenAI API key from the .env file
	load_dotenv()
	openai.api_key = os.getenv('OPENAI_API_KEY')

	# Check OpenAI version and set up client accordingly.
	# Nested indentation restored: the flattened if/else and defs could not parse.
	if hasattr(openai, 'OpenAI'):
	    # New version (1.0.0 and later)
	    client = openai.OpenAI()

	    def create_chat_completion(**kwargs):
	        """Forward all keyword arguments to ``chat.completions.create``."""
	        return client.chat.completions.create(**kwargs)
	else:
	    # Older version
	    client = openai

	    def create_chat_completion(**kwargs):
	        """Forward all keyword arguments to ``ChatCompletion.create``."""
	        return client.ChatCompletion.create(**kwargs)

	# Configure logging
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

	def process_folder(folder_path):
	    """
	    Process all Markdown files in the specified folder.

	    Only files matching ``*.md`` directly inside ``folder_path`` are handled
	    (no recursion). Files are processed in sorted order, and a failure in one
	    file is logged without stopping the rest of the batch.

	    Args:
	    folder_path (str): The path to the folder containing Markdown files.
	    """
	    # Escape the folder part so characters such as '[' or '*' in the path are
	    # matched literally instead of being interpreted as glob wildcards.
	    pattern = os.path.join(glob.escape(folder_path), '*.md')
	    for md_file in sorted(glob.glob(pattern)):
	        logging.info(f'Processing file: {md_file}')
	        try:
	            process_file(md_file)
	        except Exception as e:
	            # Batch boundary: report the failure and carry on with the next file.
	            logging.error(f'Error processing file {md_file}: {e}')

	def extract_frontmatter(content):
	    """
	    Extract YAML frontmatter from Markdown content.

	    The frontmatter is the text between a leading ``---`` and the next
	    ``---``. Whenever no usable frontmatter is found (no leading delimiter,
	    no closing delimiter, invalid YAML, or YAML that is not a mapping) the
	    function returns an empty dict together with the full, untouched content.

	    Args:
	    content (str): The full content of the Markdown file.

	    Returns:
	    tuple: A tuple containing the parsed frontmatter (dict) and the remaining body (str).
	    """
	    if not content.startswith('---'):
	        return {}, content

	    parts = content.split('---', 2)
	    if len(parts) < 3:
	        # Opening delimiter without a closing one: unpacking three values
	        # here would raise ValueError, so report it and bail out instead.
	        logging.error("Frontmatter is missing its closing '---' delimiter")
	        return {}, content

	    _, raw_frontmatter, body = parts
	    try:
	        frontmatter = yaml.safe_load(raw_frontmatter)
	    except yaml.YAMLError as e:
	        logging.error(f"Error loading frontmatter YAML: {e}")
	        return {}, content

	    if frontmatter is None:
	        # An empty block parses to None; keep the promised dict return type.
	        return {}, body
	    if not isinstance(frontmatter, dict):
	        # A list or scalar cannot be queried with .get() by callers.
	        logging.error(
	            f"Frontmatter is not a YAML mapping: {type(frontmatter).__name__}"
	        )
	        return {}, content
	    return frontmatter, body

	def process_file(filepath):
	    """
	    Process a single Markdown file.

	    Reads the file, checks the frontmatter ``title`` and ``description``
	    against the length limits, regenerates whichever one is out of range and
	    rewrites the frontmatter when something changed.

	    Files whose leading ``---`` block could not be parsed into a mapping are
	    skipped with a warning: rewriting them would replace the unparsed block
	    and drop every key it contained.

	    Args:
	    filepath (str): The path to the Markdown file.

	    Raises:
	    RuntimeError: Propagated from generate_optimal_text when no valid text
	        could be generated.
	    """
	    with open(filepath, 'r', encoding='utf-8') as file:
	        content = file.read()

	    frontmatter, body = extract_frontmatter(content)

	    if frontmatter is None:
	        # An empty frontmatter block parses to None; treat it as "no keys yet".
	        frontmatter = {}

	    # extract_frontmatter hands back the whole content as the body when the
	    # leading block failed to parse.
	    unparsed_block = content.startswith('---') and body == content
	    if unparsed_block or not isinstance(frontmatter, dict):
	        logging.warning(f'Skipping file with unreadable frontmatter: {filepath}')
	        return

	    current_title = frontmatter.get('title', '')
	    current_description = frontmatter.get('description', '')

	    filename = os.path.basename(filepath)  # Get the filename from the path

	    new_title = current_title
	    new_description = current_description

	    # Non-string values (a null or numeric YAML scalar) have no meaningful
	    # length, so they are regenerated instead of being length-checked.
	    title_ok = isinstance(current_title, str) and validate_length(current_title, 'title')
	    if not title_ok:
	        new_title = generate_optimal_text(body, 'title', filename)

	    description_ok = (
	        isinstance(current_description, str)
	        and validate_length(current_description, 'description')
	    )
	    if not description_ok:
	        new_description = generate_optimal_text(body, 'description', filename)

	    if new_title != current_title or new_description != current_description:
	        update_frontmatter(filepath, frontmatter, new_title, new_description)
	    else:
	        logging.info(f'No changes needed for file: {filepath}')

	def generate_optimal_text(body, text_type, filename):
	    """
	    Generate an optimized title or description with the OpenAI chat API.

	    Sends the first 500 characters of ``body`` and the filename (without its
	    extension) to the "gpt-4o" model, asks for a JSON object keyed by
	    ``text_type`` and returns the first answer that passes validate_length.
	    Up to 10 attempts are made; API errors, malformed JSON and out-of-range
	    lengths all count as failed attempts.

	    Args:
	    body (str): The main content of the Markdown file.
	    text_type (str): Either 'title' or 'description'.
	    filename (str): The name of the file being processed.

	    Returns:
	    str: The generated title or description.

	    Raises:
	    RuntimeError: If maximum retries are exceeded.
	    """
	    length_range = "35 to 45" if text_type == 'title' else "110 to 160"
	    filename_without_extension = os.path.splitext(filename)[0]  # Remove file extension

	    # The instructions name the requested text_type throughout, so a
	    # description request is no longer told to write a "title".
	    prompt = (
	        f"Based on the following markdown content and filename, generate an optimal {text_type} "
	        f"(ranging strictly from {length_range} characters) in JSON format. "
	        "Stick to the requirements given in terms of character length at all costs! "
	        f"The JSON key should be '{text_type}'. "
	        f"For the {text_type}, try to incorporate relevant words from the filename: '{filename_without_extension}'. "
	        f"Ensure the {text_type} is catchy and SEO-friendly.\n\n"
	        "Content:\n" + body[:500]  # Limiting content to first 500 characters to avoid token limits
	    )

	    max_retries = 10
	    for attempt in range(1, max_retries + 1):
	        try:
	            response = create_chat_completion(
	                model="gpt-4o",
	                messages=[
	                    {"role": "system", "content": "You are a world-class SEO assistant that helps optimize meta tags. You are very strict with the requirements given in terms of character length."},
	                    {"role": "user", "content": prompt}
	                ],
	                response_format={"type": "json_object"}
	            )

	            # Handle response based on OpenAI version
	            if hasattr(openai, 'OpenAI'):
	                result = response.choices[0].message.content.strip()
	            else:
	                result = response.choices[0].message['content'].strip()

	            generated_text = json.loads(result)[text_type]
	            if validate_length(generated_text, text_type):
	                return generated_text
	            logging.info(f"Generated {text_type} didn't fit the criteria.")
	        except Exception as e:
	            # Deliberately broad: API failures, a missing JSON key and
	            # malformed JSON are all treated as one failed attempt.
	            logging.error(f"Error in generating optimal {text_type}: {e}")

	        # Only announce a retry when another attempt will actually happen.
	        if attempt < max_retries:
	            logging.info(
	                f"Retrying {text_type} generation "
	                f"(attempt {attempt + 1} of {max_retries})..."
	            )

	    raise RuntimeError(f"Exceeded maximum retries for generating optimal {text_type}")

	def validate_length(text, text_type):
	    """
	    Validate the length of the title or description.

	    Titles must be 35-45 characters long and descriptions 110-160 characters
	    long (both bounds inclusive). Any ``text_type`` other than 'title' is
	    checked against the description limits.

	    Args:
	    text (str): The text to validate. Non-string values (for example ``None``
	        or a number parsed from YAML frontmatter) are reported as invalid
	        instead of raising ``TypeError``.
	    text_type (str): Either 'title' or 'description'.

	    Returns:
	    bool: True if the text meets the length criteria, False otherwise.
	    """
	    if not isinstance(text, str):
	        logging.warning(
	            f"{text_type.capitalize()} is not a string: {type(text).__name__}"
	        )
	        return False

	    min_length, max_length = (35, 45) if text_type == 'title' else (110, 160)
	    valid = min_length <= len(text) <= max_length

	    if not valid:
	        logging.warning(f"{text_type.capitalize()} length out of range: {len(text)}")
	    return valid

	def update_frontmatter(filepath, frontmatter, title, description):
	    """
	    Update the frontmatter of a Markdown file with new title and description.

	    The file is re-read from disk. If it starts with a ``---`` delimited
	    block, that block is replaced by the updated frontmatter; otherwise a new
	    block is prepended. Everything after the closing delimiter is written
	    back unchanged, existing keys keep their order, and the ``frontmatter``
	    argument itself is not modified.

	    Args:
	    filepath (str): The path to the Markdown file.
	    frontmatter (dict): The existing frontmatter.
	    title (str): The new title.
	    description (str): The new description.
	    """
	    old_title = frontmatter.get('title', '')
	    old_description = frontmatter.get('description', '')

	    # Replace colons with hyphens in title and description
	    title = title.replace(':', ' -')
	    description = description.replace(':', ' -')

	    # Work on a copy so the caller's dict is left untouched, including when
	    # the file turns out to be unreadable below.
	    updated = dict(frontmatter)
	    updated['title'] = title
	    updated['description'] = description

	    try:
	        with open(filepath, 'r', encoding='utf-8') as file:
	            content = file.read()
	    except IOError as e:
	        logging.error(f'Error reading file {filepath}: {e}')
	        return

	    # sort_keys=False (PyYAML 5.1+) keeps the existing key order instead of
	    # alphabetising the frontmatter on every rewrite.
	    frontmatter_yaml = yaml.dump(updated, allow_unicode=True, sort_keys=False)

	    if content.startswith('---'):
	        parts = content.split('---', 2)
	        if len(parts) < 3:
	            logging.error('File split did not retrieve expected three parts')
	            return
	        # The body still begins with whatever followed the closing delimiter
	        # (normally a newline), so it is appended directly; adding another
	        # newline would insert one more blank line on every rewrite.
	        body = parts[2]
	        new_content = f"---\n{frontmatter_yaml}---{body}"
	    else:
	        new_content = f"---\n{frontmatter_yaml}---\n{content}"

	    with open(filepath, 'w', encoding='utf-8') as file:
	        file.write(new_content)

	    logging.info(f'Updated file: {filepath}')
	    logging.info(f'Old title: {old_title} -> New title: {title}')
	    logging.info(f'Old description: {old_description} -> New description: {description}')

	# Main execution
	if __name__ == "__main__":
	    import sys

	    # Target folder: the first command-line argument when given, otherwise
	    # the default below. Relative paths are resolved against the current
	    # working directory by os.path.abspath.
	    target_folder = sys.argv[1] if len(sys.argv) > 1 else '../content/docs/blog'
	    target_folder = os.path.abspath(target_folder)

	    if not os.path.isdir(target_folder):
	        logging.error(f"The specified folder does not exist: {target_folder}")
	        # Non-zero exit status so shells and CI jobs can detect the failure.
	        sys.exit(1)

	    logging.info(f'Starting to process files in folder: {target_folder}')
	    process_folder(target_folder)
	    logging.info('Processing completed.')