Skip to content

Instantly share code, notes, and snippets.

@swhume
Created September 25, 2021 01:39
Show Gist options
  • Save swhume/30b09465f87a6dc85a1337eb70753d61 to your computer and use it in GitHub Desktop.
Save swhume/30b09465f87a6dc85a1337eb70753d61 to your computer and use it in GitHub Desktop.
# scrape XML from a spec grabber generated wiki page
from bs4 import BeautifulSoup
import os
import sys
import getopt
# Name of the intermediate HTML file that the confluence CLI renders the page into.
HTML_FILENAME = "scrape_source_xml.html"
# Output file used when no -o option is given.
DEFAULT_XML_FILENAME = "scrape.xml"
# Exit status used for every failure path of this script.
EXIT_FAILURE = 2


def _parse_args(argv):
    """Parse the command line; the page id (-i) is required.

    Args:
        argv: full argument vector, program name first (same layout as sys.argv).

    Returns:
        A (page_id, cli_path, xml_filename) tuple, or None after printing a
        usage message when the options cannot be parsed or the page id is
        missing or not made up of digits only.
    """
    prog = argv[0]
    try:
        opts, _ = getopt.getopt(argv[1:], "i:o:p:")
    except getopt.GetoptError as e:
        print(str(e))
        print("Usage: %s -i page id to retrieve page from wiki " % prog)
        return None

    page_id = None
    cli_path = ""
    xml_filename = DEFAULT_XML_FILENAME
    for opt, arg in opts:
        if opt == "-i":
            page_id = arg
        elif opt == "-p":
            if arg:
                # joining with "" appends a trailing path separator so the
                # command name can be concatenated directly after the path
                cli_path = os.path.join(arg, "")
        elif opt == "-o":
            xml_filename = arg

    if page_id is None or not page_id.isdigit():
        print("Invalid argument: %s. Usage: %s -i <page-id> [-o output xml file] [-p wiki cli path]." % (page_id, prog))
        return None
    return page_id, cli_path, xml_filename


def _render_wiki_page(page_id, cli_path, html_filename):
    """Render the wiki page to html_filename with the confluence CLI tool.

    Returns True when the command reports success (status 0), else False.
    """
    cmdline = f"{cli_path}confluence --action render --id \"{page_id}\" --noConvert --file \"{html_filename}\""
    status = os.system(cmdline)
    # os.system reports a failed command as a non-zero status; the previous
    # "< 0" test let failed runs continue with a stale or missing HTML file
    return status == 0


def _extract_xml(html_filename):
    """Return the text of the first <pre class="syntaxhighlighter-pre"> element.

    Returns None after printing an error when the file cannot be opened or
    the element is not present.
    """
    try:
        fh = open(html_filename, "r")
    except IOError as e:
        print("Error reading to HTML file %s. %s" % (html_filename, e))
        return None
    # the context manager closes the handle once parsing is done
    with fh:
        soup = BeautifulSoup(fh, "html.parser")
    pre_content = soup.find("pre", class_="syntaxhighlighter-pre")
    if pre_content is None:
        print("The pre element with class=syntaxhighlighter-pre is not found in %s" % html_filename)
        return None
    return pre_content.get_text()


def _write_xml(xml_filename, raw_xml):
    """Write raw_xml to xml_filename; return True on success, False on I/O error."""
    try:
        with open(xml_filename, "w") as xml_file:
            xml_file.write(raw_xml)
    except IOError as e:
        print("Error writing to XML file %s. %s" % (xml_filename, e))
        return False
    return True


def main(argv=None):
    """Scrape the XML from a spec grabber generated wiki page.

    Args:
        argv: argument vector with the program name first; defaults to sys.argv.

    Returns:
        0 on success, 2 when the arguments are invalid, the wiki CLI request
        fails, the XML cannot be found in the rendered page, or the output
        file cannot be written.
    """
    if argv is None:
        argv = sys.argv

    parsed = _parse_args(argv)
    if parsed is None:
        return EXIT_FAILURE
    page_id, cli_path, xml_filename = parsed

    # if the wiki CLI request fails do not continue
    if not _render_wiki_page(page_id, cli_path, HTML_FILENAME):
        print("Error: confluence CLI request for page id %s failed." % page_id)
        return EXIT_FAILURE

    # strip out XML content from the html file generated from the wiki
    raw_xml = _extract_xml(HTML_FILENAME)
    if raw_xml is None:
        return EXIT_FAILURE

    # save XML file
    if not _write_xml(xml_filename, raw_xml):
        return EXIT_FAILURE
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment