import re
import argparse
import sys
from pathlib import Path
from bs4 import BeautifulSoup
parser = argparse.ArgumentParser(description='Create an Excel-friendly CSV top remediations report from an HTML top remediations report')
parser.add_argument('html_file_path', metavar='htmlReportPath', type=str, help='path to the HTML top remediations report')
parser.add_argument('csv_output_path', metavar='csvOutputPath', type=str, help='path to the output CSV file (will be created)')
parser.add_argument('--opco', nargs=1, type=str, help='filter assets based on OPCO')
parser.add_argument('--remove-citrix', default=False, dest='remove_citrix', help='remove Citrix servers from the output', action='store_true')
args = parser.parse_args()
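# example invocation (the script name 'top_remediations_to_csv.py' is hypothetical,
# substitute whatever this file is saved as):
#   python top_remediations_to_csv.py report.html out.csv --opco ACME --remove-citrix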
# check that the input file exists and that an existing output file is not clobbered silently
html_file_check = Path(args.html_file_path)
if not html_file_check.is_file():
    print("file '{}' does not exist!".format(args.html_file_path))
    sys.exit(1)
csv_file_check = Path(args.csv_output_path)
if csv_file_check.is_file():
    if input("file '{}' already exists! OK to overwrite it? (y/n) ".format(csv_file_check)).strip().lower() != 'y':
        print("aborting, specify another output file!")
        sys.exit(0)
with open(args.html_file_path, 'r') as f:
    html_file_data = f.read()
soup = BeautifulSoup(html_file_data, features='html.parser')
# flatten the report into one newline-separated blob of span texts, dropping non-breaking spaces
text_content = '\n'.join(span.text.replace('\xa0', '') for span in soup.find_all('span'))
# summary_pattern is not used below; kept commented out for reference
#summary_pattern = re.compile(r'Top \d{1,3} Remediations by Risk\n(?P<report_date>\w+ \d{1,2}, \d{4}).+\n(?P<report_name>.+)\n(?P<report_summary>(Applying\n\d{1,3}\nRemediations\nWill Remediate\n\d{1,3}(\.\d)*%\n\d{1,3}(\.\d)*%\nVulnerabilities\nRisk\nAffecting\n\d{1,3}\nAssets\n\d{1,3}(\.\d)*%\npublishedexploits\n\d{1,3}(\.\d)*%\navailablemalware kits\n))')
# '<priority>. <title>' header followed by asset/vulnerability/risk figures; the last captured line is the impact
page_impact_pattern = re.compile(r'(Remediation\nAssets\nVulnerabilities\nRisk\n)?(?P<remediation_priority>\d{1,4})\. (?P<remediation_title>.+)\n\d{1,5}\n.+\n.+\n.+\n(?P<remediation_impact>.+)\n')
# 'of N' page footer, used below to split the blob into pages
page_anchor_pattern = re.compile(r'(?P<page_num>of \d{1,4})\n')
# optional remediation header and steps, followed by the page's asset table
page_contents_pattern = re.compile(r'((?P<remediation_priority>\d{1,3})\. (?P<remediation_title>.+)\nRemediation Steps\n(?P<remediation_steps>(.+\n+)+?))*Assets\nName\nIP Address\nSite\n(?P<page_contents>(.+\n)+?)of \d{1,2}')
page_title_pattern = re.compile(r'(?P<remediation_priority>\d{1,3})\. (?P<remediation_title>.+\n)')
url_pattern = re.compile(r'(?P<url>https?://.*)\n')
# one asset table row: hostname, IPv4 address and (uncaptured) site
assets_pattern = re.compile(r'(?P<host_name>.+)\n(?P<ip_address>([0-9]{1,3}\.){3}[0-9]{1,3})\n(?:.+)\n')
report_data = []
remediation_impacts = {}  # lowercased remediation title -> impact figure
# every page ends with the same 'of N' footer, so it doubles as a page separator
page_num = page_anchor_pattern.search(text_content).group('page_num')
# asset tables can span pages without repeating their remediation header, so
# carry the last seen title/steps/priority over to the next page
last_remediation_title = None
last_remediation_steps = None
last_remediation_priority = None
for page in text_content.split(page_num):
    normalized_page = page + page_num  # re-append the footer the split consumed
    # collect the impact figure for every remediation header on this page
    if page_impact_pattern.search(normalized_page):
        for match in re.finditer(page_impact_pattern, normalized_page):
            remediation_title = match.group('remediation_title').lower()
            remediation_impact = int(match.group('remediation_impact').replace(',', ''))
            remediation_impacts.update({remediation_title: remediation_impact})
    page_contents_search = page_contents_pattern.search(normalized_page)
    if not page_contents_search:
        continue
    remediation_title = page_contents_search.group('remediation_title')
    remediation_priority = page_contents_search.group('remediation_priority')
    if not remediation_title and not remediation_priority:
        # continuation page: reuse the header carried over from the previous page
        remediation_title = last_remediation_title
        remediation_priority = last_remediation_priority
    else:
        last_remediation_title = remediation_title
        last_remediation_priority = remediation_priority
    potential_remediation_steps = page_contents_search.group('remediation_steps')
    if not potential_remediation_steps:
        remediation_steps = last_remediation_steps
    else:
        remediation_steps = ' '.join(potential_remediation_steps.split())  # collapse unnecessary whitespace
        last_remediation_steps = remediation_steps
    page_contents = page_contents_search.group('page_contents')
    report_data += [
        {
            'hostname': match.group('host_name'),
            'ip address': match.group('ip_address'),
            'remediation steps': remediation_steps,
            'remediation': remediation_title,
            'remediation impact': remediation_impacts[remediation_title.lower()],
            'priority': remediation_priority,
        } for match in re.finditer(assets_pattern, page_contents)
    ]
# sanity check: compare the set of collected remediations against every
# remediation title mentioned anywhere in the report
remediations = []
remediation_pattern = re.compile(r'(\d{1,3}\. (?P<remediation>.+)\n\d{1,3}\n{1,3}.+\n.+\n.+\n.+\n)')
for match in re.finditer(remediation_pattern, text_content):
    if match.group('remediation').lower() not in remediations:
        remediations.append(match.group('remediation').lower())
collected_remediations = list({rd['remediation'].lower() for rd in report_data})
if len(collected_remediations) != len(remediations):
    print(' -- Remediations:\n' + '\n'.join(remediations))
    print(' -- Collected remediations:\n' + '\n'.join(collected_remediations))
    print("! REMEDIATION MISMATCH ! len(rems): {} - len(col_rems): {}".format(len(remediations), len(collected_remediations)))
    sys.exit(1)
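# The gist as captured stops here without ever writing the CSV promised by
# csv_output_path, and the '--opco'/'--remove-citrix' flags are parsed but never
# used. The block below is a minimal sketch of that missing output step; the
# field names mirror the dict keys collected into report_data above, and both
# filters are assumptions (matching the OPCO code and the string 'citrix'
# against the hostname), not confirmed behaviour of the original report.
import csv  # would normally sit with the imports at the top

rows = report_data
if args.opco:
    # assumption: the OPCO code appears in the hostname; args.opco is a
    # one-element list because of nargs=1
    rows = [row for row in rows if args.opco[0].lower() in row['hostname'].lower()]
if args.remove_citrix:
    # assumption: Citrix servers are identifiable by 'citrix' in the hostname
    rows = [row for row in rows if 'citrix' not in row['hostname'].lower()]

with open(args.csv_output_path, 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['priority', 'remediation', 'remediation impact', 'remediation steps', 'hostname', 'ip address'])
    writer.writeheader()
    writer.writerows(rows)
print("wrote {} rows to '{}'".format(len(rows), args.csv_output_path))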