Created
September 25, 2021 01:39
-
-
Save swhume/30b09465f87a6dc85a1337eb70753d61 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# scrape XML from a spec grabber generated wiki page
from bs4 import BeautifulSoup
import getopt
import os
import subprocess
import sys


def _parse_args(argv):
    """Parse the command-line options.

    -i  page id to retrieve (required, digits only)
    -o  output XML filename (default: scrape.xml)
    -p  directory containing the confluence CLI tool (default: on PATH)

    Returns (page_id, scrape_xml_file, path); exits with status 2 on a
    bad option or a missing / non-numeric page id.
    """
    scrape_xml_file = "scrape.xml"
    path = ""
    page_id = None
    try:
        opts, _args = getopt.getopt(argv[1:], "i:o:p:")
    except getopt.GetoptError as e:
        print(str(e))
        print("Usage: %s -i page id to retrieve page from wiki " % argv[0])
        sys.exit(2)
    for opt, arg in opts:
        if opt == "-i":
            page_id = arg
        elif opt == "-p":
            if len(arg) > 0:
                # join with "" so the path always ends with a separator and
                # can be prefixed directly onto the CLI executable name
                path = os.path.join(arg, "")
        elif opt == "-o":
            scrape_xml_file = arg
    if page_id is None or not page_id.isdigit():
        print("Invalid argument: %s. Usage: %s -i <page-id> [-o output xml file] [-p wiki cli path]." % (page_id, argv[0]))
        sys.exit(2)
    return page_id, scrape_xml_file, path


def _fetch_wiki_page(page_id, path, html_filename):
    """Render the wiki page identified by page_id into html_filename.

    Invokes the confluence command CLI tool; exits with status 2 if the
    tool cannot be started or reports a failure.
    """
    # argument list with shell=False avoids shell interpretation of the
    # user-supplied -p path; page_id has already been validated as digits
    cmd = ["%sconfluence" % path, "--action", "render", "--id", page_id,
           "--noConvert", "--file", html_filename]
    try:
        resp = subprocess.run(cmd)
    except OSError as e:
        print("Error running the confluence CLI tool. %s" % e)
        sys.exit(2)
    # BUG FIX: a failed command yields a non-zero (positive) exit status;
    # the original `resp < 0` test never fired on ordinary CLI failures
    if resp.returncode != 0:
        sys.exit(2)


def _extract_xml(html_filename):
    """Return the XML text embedded in the rendered wiki HTML page.

    Exits with status 2 if the file cannot be read or the expected
    <pre class="syntaxhighlighter-pre"> element is not present.
    """
    try:
        # `with` guarantees the handle is closed (the original leaked it)
        with open(html_filename, "r") as fh:
            soup = BeautifulSoup(fh, "html.parser")
    except IOError as e:
        print("Error reading HTML file %s. %s" % (html_filename, e))
        sys.exit(2)
    pre_content = soup.find("pre", class_="syntaxhighlighter-pre")
    if pre_content is None:
        print("The pre element with class=syntaxhighlighter-pre is not found in %s" % html_filename)
        sys.exit(2)
    return pre_content.get_text()


def _write_xml(raw_xml, scrape_xml_file):
    """Save raw_xml to scrape_xml_file; exits with status 2 on I/O error."""
    try:
        with open(scrape_xml_file, "w") as xml_file:
            xml_file.write(raw_xml)
    except IOError as e:
        print("Error writing to XML file %s. %s" % (scrape_xml_file, e))
        sys.exit(2)


def main():
    """Scrape XML from a wiki page: fetch it via the confluence CLI,
    strip the XML out of the rendered HTML, and save it to a file."""
    page_id, scrape_xml_file, path = _parse_args(sys.argv)
    html_filename = "scrape_source_xml.html"
    _fetch_wiki_page(page_id, path, html_filename)
    raw_xml = _extract_xml(html_filename)
    _write_xml(raw_xml, scrape_xml_file)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment