Skip to content

Instantly share code, notes, and snippets.

@dbetm
Created April 4, 2023 22:55
Show Gist options
  • Save dbetm/8f0b8ca0b46e1efff3fa5f16ec7be9f1 to your computer and use it in GitHub Desktop.
Save dbetm/8f0b8ca0b46e1efff3fa5f16ec7be9f1 to your computer and use it in GitHub Desktop.
Currently there's an issue when generating a single page using the singlehtml Sphinx builder when there are sections with the same name; this is a temporary fix that you can use until the issue is fixed: https://github.com/sphinx-doc/sphinx/issues/4814
import argparse
from typing import Any
from bs4 import BeautifulSoup # pip install beautifulsoup4
def update_h3_child(section: Any, new_id: str) -> None:
    """Repoint the headerlink anchor of *section*'s first <h3> to ``new_id``.

    Args:
        section: BeautifulSoup Tag for a <section> element.
        new_id: the (possibly deduplicated) id the anchor should reference.
    """
    h3_first_child = section.find("h3")
    # Guard both a missing <h3> and an <h3> without an <a> child; the
    # original code raised on the latter (`None["href"]`).
    if h3_first_child is not None and h3_first_child.a is not None:
        h3_first_child.a["href"] = f"#{new_id}"
def deduplicate_anchors(filepath: str, verbose: bool = True) -> None:
    """Deduplicate section ids and toc-tree anchors in a `singlehtml` build.

    When Sphinx's `singlehtml` builder renders several sections with the
    same name it emits duplicate ids. This renames every duplicate to
    ``<id>-<n>`` (counting from 2), updates each section's <h3> headerlink,
    and rewrites the toc-tree anchors to match. The HTML file is
    overwritten in place.

    Temporary fix for the open Sphinx issue:
    https://github.com/sphinx-doc/sphinx/issues/4814

    Args:
        filepath: path of the generated HTML file (read and overwritten).
        verbose: when True, print the computed section-id map.
    """
    # Read and parse the HTML document.
    with open(filepath, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file.read(), "html.parser")

    # Map each original section id to the ordered list of (possibly
    # renamed) ids; duplicates get a numeric suffix starting at 2.
    sections_ids_map: dict = {}
    for section in soup.find_all(name="section", id=True):
        original_id = new_id = section.get("id")
        if original_id not in sections_ids_map:
            # First occurrence keeps its id untouched.
            sections_ids_map[original_id] = []
        else:
            new_id = f"{original_id}-{len(sections_ids_map[original_id]) + 1}"
            section["id"] = new_id
        sections_ids_map[original_id].append(new_id)
        update_h3_child(section, new_id)

    if verbose:
        print("section id's map: ", sections_ids_map)

    # Update the toc-tree: each internal anchor consumes the next renamed
    # id for its target, in document order.
    toc_tree_div = soup.find("div", class_="toc-tree")
    if toc_tree_div is not None:  # guard: page may lack a toc-tree
        filename = filepath.split("/")[-1]
        for a in toc_tree_div.find_all("a", class_="reference internal"):
            anchor_id = a["href"].split("#")[-1]
            # Truthiness check also guards pop(0) on an exhausted list
            # (more toc anchors than sections with that id).
            if sections_ids_map.get(anchor_id):
                # BUG FIX: the href must point at the HTML file itself;
                # `filename` was computed but never used in the original.
                a["href"] = f"{filename}#{sections_ids_map[anchor_id].pop(0)}"

    # Save the modified HTML back to the same file.
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(soup.prettify())
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filepath", type=str, required=False, default="index.html"
    )
    args = parser.parse_args()
    try:
        deduplicate_anchors(args.filepath)
    except Exception as e:
        # Report the actual error (the original swallowed it and printed
        # a typo'd message with no detail).
        print(f"Failed to deduplicate anchors: {e}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment