Skip to content

Instantly share code, notes, and snippets.

@Jonty
Created December 28, 2020 19:11
Show Gist options
  • Save Jonty/2376f46818462345fdc81e029b62ce57 to your computer and use it in GitHub Desktop.
Extract all code from a set of Doxygen generated documentation, for use when recovering code that has otherwise been lost
# This extracts all the code from a set of Doxygen generated documentation
# where the code is embedded and highlighted. You really only need to use this
# when attempting to recover lost code and you still have the docs.
# Writes all code out into the original directory structure relative to where
# the script is executed.
# Run: `python extract_code_from_doxygen.py URL_TO_DOXYGEN_FILES_PAGE`
# e.g. `python extract_code_from_doxygen.py http://swf2svg.sourceforge.net/azar/doc/files.html`
import sys
import re
import os
from urllib.parse import urlparse
import requests
import lxml.html
# Entry point: crawl the Doxygen "files" index page given on the command
# line, then fetch every per-file source page it links to and write the
# recovered code into a matching local directory tree.
listing = sys.argv[1]  # The files.html doxygen url
base_url = "/".join(listing.split("/")[:-1])

response = requests.get(listing)
root = lxml.html.fromstring(response.content)

# Each row of the files table links to the annotated source page via the
# second <a> in its first cell.
file_nodes = root.xpath("//table/tr/td[1]/a[2]")

for node in file_nodes:
    code_url = base_url + "/" + node.attrib["href"]
    response = requests.get(code_url)
    code_root = lxml.html.fromstring(response.content)

    # The <h1> holds the documented file's original absolute path.
    h1 = code_root.xpath("//h1")[0].text
    base_path, filename = os.path.split(h1)

    # Extremely hacky way to make a windows/linux path relative: strip an
    # optional drive letter ("C:") and the leading slash.
    base_path = re.sub("^([a-zA-Z]:)*/", "", base_path)
    # exist_ok replaces the original try/except FileExistsError: pass.
    os.makedirs(base_path, exist_ok=True)

    # The highlighted listing is one big <pre>; every line starts with its
    # line number, which we strip off before writing the file out.
    pre = code_root.xpath("//pre")[0].text_content()
    code = re.sub("^[0-9]+ ", "", pre, flags=re.MULTILINE)

    print("Writing %s/%s" % (base_path, filename))
    with open("%s/%s" % (base_path, filename), "w") as f:
        f.write(code)
@jankaifer
Copy link

jankaifer commented Jun 6, 2024

Thanks for sharing this.

Your script didn't work for me. Here is a modified version that recursively downloads all files, even deeply nested ones.

The entrypoint is a bit different though:

import os
import shutil
from bs4 import BeautifulSoup

import requests

def download_path(url, destination):
    """Recursively download every source file documented under a Doxygen page.

    ``url`` is either a Doxygen directory page or a file page; ``destination``
    is the local directory the recovered tree is written beneath. Directory
    pages recurse into their children; file pages have their highlighted
    source listing extracted and written to disk.
    """
    base_url = "/".join(url.split("/")[:-1])
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Title reads "<name> Directory Reference" or "<name> File Reference".
    title = list(soup.select("div.title"))[0].text
    parts = title.split(" ")
    filename = parts[0]
    destination += "/" + filename

    print()
    # NOTE(review): the scraped original printed the literal "(unknown)" in an
    # f-string with no placeholder — almost certainly a garbled {filename}.
    print(f"Working on '{filename}'")
    if parts[1] == "Directory":
        # Start from a clean directory so stale files from a previous run
        # don't linger. ignore_errors replaces the original bare except.
        shutil.rmtree(destination, ignore_errors=True)
        os.makedirs(destination)

        children = soup.select("table td.memItemRight a:first-child")
        for node in children:
            code_url = base_url + "/" + node.get("href")
            download_path(code_url, destination)
        return

    # A file page links to its highlighted source listing, when available.
    codeLinks = list(soup.select("div.contents > p > a"))
    if len(codeLinks) == 0:
        print(f"file {filename} does not have source code available")
        print(url)
        return

    response = requests.get(base_url + "/" + codeLinks[0].get("href"))
    soup = BeautifulSoup(response.content, 'html.parser')

    # Header comment recording provenance. The original was missing both the
    # f-prefix (so "{url}" was written literally) and the comma between the
    # two strings (so they concatenated into a single list element).
    codeLines = [
        f"// source: {url}",
        "",
    ]

    # Each highlighted line is "<number> <code>"; drop the leading number.
    lines = soup.select(".fragment > .line")
    for line in lines:
        parsedLine = " ".join(line.get_text().split(" ")[1:])
        codeLines.append(parsedLine)
    print(f"Writing {destination}")
    with open(destination, "w") as f:
        f.write("\n".join(codeLines))

# Kick off the recursive crawl from the project's root directory page,
# mirroring it under ./files.
root_page = (
    "https://www-sop.inria.fr/teams/galaad/software/bbx/"
    "dir_f1b9e769e42d03ec13d97558ab2c4c46.html"
)
target_dir = "./files"
download_path(root_page, target_dir)

@Jonty
Copy link
Author

Jonty commented Jun 7, 2024

@jankaifer Nice! Thanks for posting it.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment