Created December 28, 2020 19:11
Extract all code from a set of Doxygen generated documentation, for use when recovering code that has otherwise been lost
# This extracts all the code from a set of Doxygen generated documentation
# where the code is embedded and highlighted. You really only need to use this
# when attempting to recover lost code and you still have the docs.
# Writes all code out into the original directory structure relative to where
# the script is executed.
# e.g. `python <this_script>.py http://example.com/docs/html/files.html`
import sys
import re
import os
from urllib.parse import urlparse
import requests
import lxml.html
def make_relative_dir(directory):
    """Return *directory* with a leading drive letter and/or root slash removed.

    ``"C:/proj/src"`` and ``"/proj/src"`` both become ``"proj/src"``; a path
    that is already relative is returned unchanged.
    """
    # Extremely hacky way to make a windows/linux path relative
    return re.sub(r"^([a-zA-Z]:)*/", "", directory)


def strip_line_numbers(listing_text):
    """Remove the leading ``<digits><space>`` prefix from every line."""
    return re.sub(r"^[0-9]+ ", "", listing_text, flags=re.MULTILINE)


def local_path_for(documented_path):
    """Map the path shown on a Doxygen source page to a relative output path.

    The directory part is made relative with :func:`make_relative_dir` and
    re-joined with the file name. ``os.path.join`` is used so that an empty
    directory part yields just the file name instead of ``"/<name>"``, which
    would point at the filesystem root.
    """
    base_path, filename = os.path.split(documented_path)
    return os.path.join(make_relative_dir(base_path), filename)


def extract_file(code_url):
    """Download one Doxygen source page and write its code listing to disk.

    The output path is taken from the text of the page's first ``<h1>`` and
    the code from the text content of its first ``<pre>``. Raises
    ``IndexError`` if the page has no ``<h1>`` or no ``<pre>`` element.
    """
    response = requests.get(code_url)
    code_root = lxml.html.fromstring(response.content)
    h1 = code_root.xpath("//h1")[0].text
    target = local_path_for(h1)

    # Only create directories when there is a directory part;
    # os.makedirs("") raises FileNotFoundError.
    base_path = os.path.dirname(target)
    if base_path:
        os.makedirs(base_path, exist_ok=True)

    pre = code_root.xpath("//pre")[0].text_content()
    code = strip_line_numbers(pre)
    print("Writing %s" % target)
    # Explicit encoding so the result does not depend on the platform default.
    with open(target, "w", encoding="utf-8") as f:
        f.write(code)


def main(argv):
    """Extract every file linked from the Doxygen ``files.html`` listing.

    ``argv[1]`` is the URL of the listing page; the linked source pages are
    resolved relative to that URL's parent.
    """
    if len(argv) < 2:
        sys.exit("usage: %s <url of doxygen files.html>" % argv[0])
    listing = argv[1]  # The files.html doxygen url
    base_url = "/".join(listing.split("/")[:-1])

    response = requests.get(listing)
    root = lxml.html.fromstring(response.content)
    # The second link in the first cell of each table row is followed as the
    # file's source page.
    for node in root.xpath("//table/tr/td[1]/a[2]"):
        extract_file(base_url + "/" + node.attrib["href"])


if __name__ == "__main__":
    main(sys.argv)
