Extract all code from a set of Doxygen generated documentation, for use when recovering code that has otherwise been lost
# This extracts all the code from a set of Doxygen-generated documentation
# pages where the source is embedded and syntax-highlighted. You really only
# need this when attempting to recover lost code and you still have the docs.
# Writes all code out into the original directory structure, relative to the
# directory where the script is executed.
# Run: `python extract_code_from_doxygen.py URL_TO_DOXYGEN_FILES_PAGE`
# e.g. `python extract_code_from_doxygen.py http://swf2svg.sourceforge.net/azar/doc/files.html`
import os
import re
import sys
from urllib.parse import urljoin, urlparse

import lxml.html
import requests
def relative_target(title):
    """Turn the path shown in a Doxygen page heading into a safe relative path.

    The heading holds the original (often absolute) path of the source file.
    Backslashes are treated as separators, a leading drive letter and any
    leading slashes are dropped, and empty, "." and ".." components are
    discarded so the result can never point outside the current directory.

    Args:
        title: Text of the page heading, e.g. "/home/me/proj/src/foo.c".

    Returns:
        A ``(directory, filename)`` tuple; ``directory`` is "" when the
        heading contains a bare file name.

    Raises:
        ValueError: If no usable file name is left after cleaning.
    """
    cleaned = title.strip().replace("\\", "/")
    # Drop a Windows drive prefix such as "C:" when it starts an absolute path.
    cleaned = re.sub(r"^[a-zA-Z]:(?=/)", "", cleaned)
    parts = [part for part in cleaned.split("/") if part not in ("", ".", "..")]
    if not parts:
        raise ValueError("no file name found in heading %r" % title)
    return "/".join(parts[:-1]), parts[-1]


def strip_line_numbers(listing_text):
    """Remove the leading "<digits><space>" line-number prefix from each line."""
    return re.sub(r"^[0-9]+ ", "", listing_text, flags=re.MULTILINE)


def fetch_html(url):
    """Download *url* and return the parsed lxml document root.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return lxml.html.fromstring(response.content)


def main(argv=None):
    """Recover every source file linked from a Doxygen file-list page.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. Exactly one argument (the URL of the Doxygen
            files.html page) is expected.

    Returns:
        Process exit status: 0 on success, 2 on a usage error.
    """
    args = sys.argv[1:] if argv is None else argv
    if len(args) != 1:
        print(
            "Usage: python extract_code_from_doxygen.py URL_TO_DOXYGEN_FILES_PAGE",
            file=sys.stderr,
        )
        return 2

    listing = args[0]  # The files.html doxygen url
    root = fetch_html(listing)

    # Second link in the first column of each table row; presumably the
    # "[code]" link next to the file name -- verify against the target site.
    for node in root.xpath("//table/tr/td[1]/a[2]"):
        href = node.get("href")
        if not href:
            continue
        # urljoin resolves relative and absolute hrefs against the listing URL.
        code_url = urljoin(listing, href)

        try:
            code_root = fetch_html(code_url)
        except requests.RequestException as exc:
            print("Skipping %s: %s" % (code_url, exc), file=sys.stderr)
            continue

        headings = code_root.xpath("//h1")
        listings = code_root.xpath("//pre")
        if not headings or not listings:
            print("Skipping %s: no <h1> or <pre> found" % code_url, file=sys.stderr)
            continue

        try:
            # text_content() also works when the heading wraps its text in
            # child elements, where .text would be None.
            directory, filename = relative_target(headings[0].text_content())
        except ValueError as exc:
            print("Skipping %s: %s" % (code_url, exc), file=sys.stderr)
            continue

        if directory:
            os.makedirs(directory, exist_ok=True)
        target = os.path.join(directory, filename)

        code = strip_line_numbers(listings[0].text_content())
        print("Writing %s" % target)
        with open(target, "w", encoding="utf-8") as f:
            f.write(code)

    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment