# swhume/scrape_wiki_xml.py

## scrape_wiki_xml.py
# scrape XML from a spec grabber generated wiki page
from bs4 import BeautifulSoup
import os
import sys
import getopt

# get the command-line arguments - page id is required
#   -i <page-id>  numeric id of the wiki page to render (required)
#   -o <file>     name of the XML file to write (default: scrape.xml)
#   -p <dir>      directory holding the confluence CLI; when omitted the
#                 bare command name "confluence" is used
scrape_xml_file = "scrape.xml"
path = ""
page_id = None
try:
    opts, args = getopt.getopt(sys.argv[1:], "i:o:p:")
except getopt.GetoptError as e:
    print(str(e))
    print("Usage: %s -i page id to retrieve page from wiki " % sys.argv[0])
    sys.exit(2)

for opt, arg in opts:
    if opt == "-i":
        page_id = arg
    elif opt == "-p":
        if arg:
            # joining with "" appends a trailing separator so the directory
            # can be prefixed directly onto the command name below
            path = os.path.join(arg, "")
    elif opt == "-o":
        scrape_xml_file = arg

# the page id is interpolated into a shell command, so only digits are accepted
if page_id is None or not page_id.isdigit():
    print("Invalid argument: %s. Usage: %s -i <page-id> [-o output xml file] [-p wiki cli path]." % (page_id, sys.argv[0]))
    sys.exit(2)

# get the wiki page referenced by the page id using the confluence command CLI tool
html_filename = "scrape_source_xml.html"
cmdline = f"{path}confluence --action render --id \"{page_id}\" --noConvert --file \"{html_filename}\""
resp = os.system(cmdline)
# if the wiki CLI request fails do not continue: a failed command is reported
# as a non-zero status (normally positive), so any non-zero value must abort,
# otherwise a stale or missing HTML file would be parsed below
if resp != 0:
    print("Wiki CLI command failed with status %s: %s" % (resp, cmdline))
    sys.exit(2)

# parse the html file generated from the wiki; the context manager makes sure
# the file handle is closed once the document has been read
try:
    with open(html_filename, "r") as fh:
        soup = BeautifulSoup(fh, "html.parser")
except IOError as e:
    print("Error reading to HTML file %s. %s" % (html_filename, e))
    sys.exit(2)

# the XML is taken from the first <pre class="syntaxhighlighter-pre"> element
pre_content = soup.find("pre", class_="syntaxhighlighter-pre")
if pre_content is None:
    print("The pre element with class=syntaxhighlighter-pre is not found in %s" % html_filename)
    sys.exit(2)
raw_xml = pre_content.get_text()

# save XML file; the handle is closed even if the write itself fails
try:
    with open(scrape_xml_file, "w") as xml_file:
        xml_file.write(raw_xml)
except IOError as e:
    print("Error writing to XML file %s. %s" % (scrape_xml_file, e))
    sys.exit(2)
	# scrape XML from a spec grabber generated wiki page
from bs4 import BeautifulSoup
import os
import sys
import getopt

# get the command-line arguments - page id is required
#   -i <page-id>  numeric id of the wiki page to render (required)
#   -o <file>     name of the XML file to write (default: scrape.xml)
#   -p <dir>      directory holding the confluence CLI; when omitted the
#                 bare command name "confluence" is used
scrape_xml_file = "scrape.xml"
path = ""
page_id = None
try:
    opts, args = getopt.getopt(sys.argv[1:], "i:o:p:")
except getopt.GetoptError as e:
    print(str(e))
    print("Usage: %s -i page id to retrieve page from wiki " % sys.argv[0])
    sys.exit(2)

for opt, arg in opts:
    if opt == "-i":
        page_id = arg
    elif opt == "-p":
        if arg:
            # joining with "" appends a trailing separator so the directory
            # can be prefixed directly onto the command name below
            path = os.path.join(arg, "")
    elif opt == "-o":
        scrape_xml_file = arg

# the page id is interpolated into a shell command, so only digits are accepted
if page_id is None or not page_id.isdigit():
    print("Invalid argument: %s. Usage: %s -i <page-id> [-o output xml file] [-p wiki cli path]." % (page_id, sys.argv[0]))
    sys.exit(2)

# get the wiki page referenced by the page id using the confluence command CLI tool
html_filename = "scrape_source_xml.html"
cmdline = f"{path}confluence --action render --id \"{page_id}\" --noConvert --file \"{html_filename}\""
resp = os.system(cmdline)
# if the wiki CLI request fails do not continue: a failed command is reported
# as a non-zero status (normally positive), so any non-zero value must abort,
# otherwise a stale or missing HTML file would be parsed below
if resp != 0:
    print("Wiki CLI command failed with status %s: %s" % (resp, cmdline))
    sys.exit(2)

# parse the html file generated from the wiki; the context manager makes sure
# the file handle is closed once the document has been read
try:
    with open(html_filename, "r") as fh:
        soup = BeautifulSoup(fh, "html.parser")
except IOError as e:
    print("Error reading to HTML file %s. %s" % (html_filename, e))
    sys.exit(2)

# the XML is taken from the first <pre class="syntaxhighlighter-pre"> element
pre_content = soup.find("pre", class_="syntaxhighlighter-pre")
if pre_content is None:
    print("The pre element with class=syntaxhighlighter-pre is not found in %s" % html_filename)
    sys.exit(2)
raw_xml = pre_content.get_text()

# save XML file; the handle is closed even if the write itself fails
try:
    with open(scrape_xml_file, "w") as xml_file:
        xml_file.write(raw_xml)
except IOError as e:
    print("Error writing to XML file %s. %s" % (scrape_xml_file, e))
    sys.exit(2)