Skip to content

Instantly share code, notes, and snippets.

@refabr1k
Created November 28, 2023 12:54
Show Gist options
  • Save refabr1k/b6266f0c5a70496b1a7ebc136bea20b4 to your computer and use it in GitHub Desktop.
Save refabr1k/b6266f0c5a70496b1a7ebc136bea20b4 to your computer and use it in GitHub Desktop.
CIS-CAT HTML report to Excel - CSV with "title, description, rationale, remediation"
from bs4 import BeautifulSoup
import csv
def extract_data_from_html(html_path):
    """Extract rule details from an HTML report into a list of row dicts.

    The file at ``html_path`` is read as UTF-8 and parsed with
    BeautifulSoup's ``html.parser``. Elements are collected by CSS class
    (``ruleTitle``, ``description``, ``rationale``, ``fixtext``) and one
    row is produced per ``ruleTitle`` element.

    Args:
        html_path: Path to the HTML report file.

    Returns:
        A list of dicts with the keys ``'Title'``, ``'Description'``,
        ``'Rationale'``, ``'FixText'`` and ``'Impact'``. A field is the
        empty string when no element exists at the title's position.
    """
    with open(html_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the elements for ruleTitle, description, rationale, and fixtext
    rule_titles = soup.find_all(class_='ruleTitle')
    descriptions = soup.find_all(class_='description')
    rationales = soup.find_all(class_='rationale')
    fixtexts = soup.find_all(class_='fixtext')

    def text_at(elements, index):
        # Text of elements[index], or '' when that list is shorter than
        # the list of titles.
        if index < len(elements):
            return elements[index].get_text(separator=' ', strip=True)
        return ''

    # NOTE(review): fields are matched to titles purely by position in
    # four independently collected lists. If any rule in the report lacks
    # one of these elements, every later row shifts for that column --
    # confirm against the report's actual structure.
    extracted_data = []
    for i, title in enumerate(rule_titles):
        # Split only at the FIRST "Impact:" marker so that any later
        # occurrence of the same text inside the impact section is kept
        # instead of being stripped out. With no marker, `impact` is ''
        # and `fix_text` is the whole string.
        fix_text, _marker, impact = text_at(fixtexts, i).partition('Impact:')
        extracted_data.append({
            'Title': title.get_text(strip=True),
            'Description': text_at(descriptions, i),
            'Rationale': text_at(rationales, i),
            'FixText': fix_text.strip(),
            'Impact': impact.strip(),
        })
    return extracted_data
# Example usage: convert one HTML report into a CSV file.
html_path = 'findings.html'  # Replace with your HTML file path
csv_file_path = 'extracted_data.csv'  # Path for the output CSV file

# Parse the report into a list of row dictionaries.
data_to_write = extract_data_from_html(html_path)

# Column order of the output file; must match the keys of each row dict.
csv_columns = ('Title', 'Description', 'Rationale', 'FixText', 'Impact')

# newline='' lets the csv module control line endings itself.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=list(csv_columns))
    writer.writeheader()
    writer.writerows(data_to_write)

print(f"Data has been extracted and written to {csv_file_path}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment