import re
import argparse
from pathlib import Path

from bs4 import BeautifulSoup

parser = argparse.ArgumentParser(description='Create a CSV (Excel-importable) top remediations report from an HTML report')
parser.add_argument('html_file_path', metavar='htmlReportPath', type=str, help='path to the HTML top remediations report')
parser.add_argument('csv_output_path', metavar='csvOutputPath', type=str, help='path to the output CSV file (will be created)')
parser.add_argument('--opco', nargs=1, type=str, help='filter assets based on OPCO')
parser.add_argument('--remove-citrix', default=False, dest='remove_citrix', action='store_true', help='remove Citrix servers')
args = parser.parse_args()
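
# example invocation (script name and file paths are illustrative):
#   python top_remediations_report.py report.html top_remediations.csv
# NOTE: --opco and --remove-citrix are parsed but not applied anywhere in this snippet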

# check that the input file exists and ask before overwriting the output
html_file_check = Path(args.html_file_path)
if not html_file_check.is_file():
    print("file '{}' does not exist!".format(args.html_file_path))
    exit(1)
csv_file_check = Path(args.csv_output_path)
if csv_file_check.is_file():
    if input("file '{}' already exists! OK to overwrite it? (y/n) ".format(csv_file_check)).strip() != 'y':
        print("aborting, specify another output file!")
        exit(0)

with open(args.html_file_path, 'r') as f:
    html_file_data = f.read()
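# flatten the report to plain text: every <span> becomes one line, so the
# layout-based regexes below can anchor on '\n' boundaries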
soup = BeautifulSoup(html_file_data, features='html.parser')
text_content = '\n'.join(span.text.replace('\xa0', '') for span in soup.find_all('span'))

# report summary regexp (unused; kept for reference):
#summary_pattern = re.compile(r'Top \d{1,3} Remediations by Risk\n(?P<report_date>\w+ \d{1,2}, \d{4}).+\n(?P<report_name>.+)\n(?P<report_summary>(Applying\n\d{1,3}\nRemediations\nWill Remediate\n\d{1,3}(\.\d)*%\n\d{1,3}(\.\d)*%\nVulnerabilities\nRisk\nAffecting\n\d{1,3}\nAssets\n\d{1,3}(\.\d)*%\npublishedexploits\n\d{1,3}(\.\d)*%\navailablemalware kits\n))')
# one row of the per-page impact table: optional column header, then 'N. <title>', counts, and the impact value last
page_impact_pattern = re.compile(r'(Remediation\nAssets\nVulnerabilities\nRisk\n)?(?P<remediation_priority>\d{1,4})\. (?P<remediation_title>.+)\n\d{1,5}\n.+\n.+\n.+\n(?P<remediation_impact>.+)\n')
page_anchor_pattern = re.compile(r'(?P<page_num>of \d{1,4})\n')
page_contents_pattern = re.compile(r'((?P<remediation_priority>\d{1,3})\. (?P<remediation_title>.+)\nRemediation Steps\n(?P<remediation_steps>(.+\n+)+?))*Assets\nName\nIP Address\nSite\n(?P<page_contents>(.+\n)+?)of \d{1,2}')
page_title_pattern = re.compile(r'(?P<remediation_priority>\d{1,3})\. (?P<remediation_title>.+\n)')
url_pattern = re.compile(r'(?P<url>http(s*)://.*)\n')
assets_pattern = re.compile(r'(?P<host_name>.+)\n(?P<ip_address>([0-9]{1,3}\.){3}[0-9]{1,3})\n(?:.+)\n')
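# pattern roles:
#  - page_anchor_pattern: the 'of N' page footer, used to split the text into pages
#  - page_contents_pattern: optional remediation header (priority, title, steps) plus a page's asset table
#  - assets_pattern: one asset row -- hostname, dotted-quad IP, and the (discarded) site column
#  - page_title_pattern and url_pattern are defined but not used below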

report_data = []
remediation_impacts = {}
page_num = page_anchor_pattern.search(text_content).group('page_num')
last_remediation_title = None
last_remediation_steps = None
last_remediation_priority = None
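# split the flattened text into pages on the 'of N' footer; the footer is
# re-appended to each chunk so the patterns that anchor on it keep matching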
for page in text_content.split(page_num):
    normalized_page = page + page_num
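    # first pass over the page: harvest title -> risk impact pairs for the
    # lookup used when building the asset rows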
    page_impact_search = page_impact_pattern.search(normalized_page)
    if page_impact_search:
        for match in re.finditer(page_impact_pattern, normalized_page):
            remediation_title = match.group('remediation_title').lower()
            remediation_impact = int(match.group('remediation_impact').replace(',', ''))
            remediation_impacts.update({remediation_title: remediation_impact})
    page_contents_search = page_contents_pattern.search(normalized_page)
    if not page_contents_search:
        continue
    remediation_title = page_contents_search.group('remediation_title')
    remediation_priority = page_contents_search.group('remediation_priority')
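    # the remediation header is absent on continuation pages, so fall back to
    # the values carried over from the previous page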
    if not remediation_title and not remediation_priority:
        remediation_title = last_remediation_title
        remediation_priority = last_remediation_priority
    else:
        last_remediation_title = remediation_title
        last_remediation_priority = remediation_priority
    potential_remediation_steps = page_contents_search.group('remediation_steps')
    if not potential_remediation_steps:
        remediation_steps = last_remediation_steps
    else:
        remediation_steps = ' '.join(potential_remediation_steps.split())  # remove unnecessary whitespace chars
        last_remediation_steps = remediation_steps
    page_contents = page_contents_search.group('page_contents')
    report_data += [
        {
            'hostname': match.group('host_name'),
            'ip address': match.group('ip_address'),
            'remediation steps': remediation_steps,
            'remediation': remediation_title,
            'remediation impact': remediation_impacts[remediation_title.lower()],
            'priority': remediation_priority,
        } for match in re.finditer(assets_pattern, page_contents)
    ]

# sanity check: compare the number of collected remediations to the numbers presented on the first page!
remediations = []
remediation_pattern = re.compile(r'(\d{1,3}\. (?P<remediation>.+)\n\d{1,3}\n{1,3}.+\n.+\n.+\n.+\n)')
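# collect the unique remediation titles advertised in the overview table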
for match in re.finditer(remediation_pattern, text_content):
    if match.group('remediation').lower() not in remediations:
        remediations.append(match.group('remediation').lower())
collected_remediations = list(set(rd['remediation'].lower() for rd in report_data))
if len(collected_remediations) != len(remediations):
    print(' -- Remediations:\n' + '\n'.join(remediations))
    print(' -- Collected remediations:\n' + '\n'.join(collected_remediations))
    print("! REMEDIATION MISMATCH ! len(rems): {} - len(col_rems): {}".format(len(remediations), len(collected_remediations)))
    exit(1)
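
# A minimal sketch of the CSV export implied by csv_output_path -- an
# assumption, since the snippet above stops after the sanity check. Field
# names mirror the dict keys used for report_data.
import csv

fieldnames = ['hostname', 'ip address', 'remediation steps', 'remediation',
              'remediation impact', 'priority']
with open(args.csv_output_path, 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(report_data)
print("wrote {} rows to '{}'".format(len(report_data), args.csv_output_path))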