Skip to content

Instantly share code, notes, and snippets.

@Jonty
Created December 28, 2020 19:11
Show Gist options
  • Save Jonty/2376f46818462345fdc81e029b62ce57 to your computer and use it in GitHub Desktop.
Extract all code from a set of Doxygen generated documentation, for use when recovering code that has otherwise been lost
# This extracts all the code from a set of Doxygen generated documentation
# where the code is embedded and highlighted. You really only need to use this
# when attempting to recover lost code and you still have the docs.
# Writes all code out into the original directory structure relative to where
# the script is executed.
# Run: `python extract_code_from_doxygen.py URL_TO_DOXYGEN_FILES_PAGE`
# e.g. `python extract_code_from_doxygen.py http://swf2svg.sourceforge.net/azar/doc/files.html`
import sys
import re
import os
from urllib.parse import urlparse
import requests
import lxml.html
# Entry point: crawl the Doxygen "files" index page given on the command
# line, then fetch every per-file source page it links to and write the
# recovered code into a matching local directory tree.
listing = sys.argv[1]  # The files.html doxygen url
base_url = "/".join(listing.split("/")[:-1])

response = requests.get(listing)
root = lxml.html.fromstring(response.content)

# Each row of the files table links to the annotated source page via the
# second <a> in its first cell.
file_nodes = root.xpath("//table/tr/td[1]/a[2]")

for node in file_nodes:
    code_url = base_url + "/" + node.attrib["href"]
    response = requests.get(code_url)
    code_root = lxml.html.fromstring(response.content)

    # The <h1> holds the documented file's original absolute path.
    h1 = code_root.xpath("//h1")[0].text
    base_path, filename = os.path.split(h1)

    # Extremely hacky way to make a windows/linux path relative: strip an
    # optional drive letter ("C:") and the leading slash.
    base_path = re.sub("^([a-zA-Z]:)*/", "", base_path)
    # exist_ok replaces the original try/except FileExistsError: pass.
    os.makedirs(base_path, exist_ok=True)

    # The highlighted listing is one big <pre>; every line starts with its
    # line number, which we strip off before writing the file out.
    pre = code_root.xpath("//pre")[0].text_content()
    code = re.sub("^[0-9]+ ", "", pre, flags=re.MULTILINE)

    print("Writing %s/%s" % (base_path, filename))
    with open("%s/%s" % (base_path, filename), "w") as f:
        f.write(code)
@jankaifer
Copy link

jankaifer commented Jun 6, 2024

Thanks for sharing this.

Your script didn't work for me. Here is a modified version that recursively downloads all files, even deeply nested ones.

The entrypoint is a bit different though:

import os
import shutil
from bs4 import BeautifulSoup

import requests

def download_path(url, destination):
    """Recursively download every source file documented under a Doxygen page.

    ``url`` is either a Doxygen directory page or a file page; ``destination``
    is the local directory the recovered tree is written beneath. Directory
    pages recurse into their children; file pages have their highlighted
    source listing extracted and written to disk.
    """
    base_url = "/".join(url.split("/")[:-1])
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Title reads "<name> Directory Reference" or "<name> File Reference".
    title = list(soup.select("div.title"))[0].text
    parts = title.split(" ")
    filename = parts[0]
    destination += "/" + filename

    print()
    # NOTE(review): the scraped original printed the literal "(unknown)" in an
    # f-string with no placeholder — almost certainly a garbled {filename}.
    print(f"Working on '{filename}'")
    if parts[1] == "Directory":
        # Start from a clean directory so stale files from a previous run
        # don't linger. ignore_errors replaces the original bare except.
        shutil.rmtree(destination, ignore_errors=True)
        os.makedirs(destination)

        children = soup.select("table td.memItemRight a:first-child")
        for node in children:
            code_url = base_url + "/" + node.get("href")
            download_path(code_url, destination)
        return

    # A file page links to its highlighted source listing, when available.
    codeLinks = list(soup.select("div.contents > p > a"))
    if len(codeLinks) == 0:
        print(f"file {filename} does not have source code available")
        print(url)
        return

    response = requests.get(base_url + "/" + codeLinks[0].get("href"))
    soup = BeautifulSoup(response.content, 'html.parser')

    # Header comment recording provenance. The original was missing both the
    # f-prefix (so "{url}" was written literally) and the comma between the
    # two strings (so they concatenated into a single list element).
    codeLines = [
        f"// source: {url}",
        "",
    ]

    # Each highlighted line is "<number> <code>"; drop the leading number.
    lines = soup.select(".fragment > .line")
    for line in lines:
        parsedLine = " ".join(line.get_text().split(" ")[1:])
        codeLines.append(parsedLine)
    print(f"Writing {destination}")
    with open(destination, "w") as f:
        f.write("\n".join(codeLines))

# Kick off the recursive crawl from the project's root directory page,
# mirroring it under ./files.
root_page = (
    "https://www-sop.inria.fr/teams/galaad/software/bbx/"
    "dir_f1b9e769e42d03ec13d97558ab2c4c46.html"
)
target_dir = "./files"
download_path(root_page, target_dir)

@Jonty
Copy link
Author

Jonty commented Jun 7, 2024

@jankaifer Nice! Thanks for posting it.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment