Created
November 22, 2023 12:15
-
-
Save defensivedepth/50da16f165968f70f611eea74c2fe693 to your computer and use it in GitHub Desktop.
CIS-Benchmark-Diff
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script takes two CIS Benchmark PDFs as input and diffs them.
# For example: it will generate a diff of the Win10 & Win11 benchmarks.
import fitz # PyMuPDF | |
import re | |
import difflib | |
import sys | |
from datetime import datetime | |
def is_start_of_new_item(line: str) -> bool:
    """
    Check whether a line begins with a benchmark item number.

    An item number is one to three digits, optionally followed by dotted
    sub-levels ('1', '1.1', '100.7.32', '18.9.47.5.1') and an optional
    trailing dot. The number must be followed by whitespace or the end of
    the line, so wrapped text such as '2023 release' or '5GB of disk' is
    not mistaken for the start of a new item.

    Args:
        line: One line of extracted text; surrounding whitespace is ignored.

    Returns:
        True if the line starts with an item number, otherwise False.
    """
    # The lookahead is what enforces the digit/level limits: without it,
    # re.match would accept any prefix (e.g. '202' out of '2023').
    return bool(re.match(r'\d{1,3}(?:\.\d{1,3})*\.?(?=\s|$)', line.strip()))
def remove_trailing_whitespace(text):
    """
    Strip whitespace from the right-hand end of every line in the text.

    The text is split on '\\n', each piece is right-stripped, and the
    pieces are joined back with '\\n', so the number of lines is unchanged.
    """
    rows = text.split('\n')
    return '\n'.join(map(str.rstrip, rows))
def correct_word_wrapping(text):
    """
    Re-join lines that were split by word wrapping in the extracted text.

    Any line that does not begin a new numbered item (as decided by
    is_start_of_new_item) is glued onto the preceding line with a single
    space. The very first line is always kept as-is, whatever it contains.
    """
    merged = []
    for fragment in text.split('\n'):
        if not merged or is_start_of_new_item(fragment):
            # First line, or the beginning of a new numbered item.
            merged.append(fragment)
        else:
            # Continuation of the previous item: fold it back in.
            merged[-1] = merged[-1] + ' ' + fragment
    return '\n'.join(merged)
def extract_recommendations_fitz(pdf_path: str, start_phrase: str, end_phrase: str) -> str:
    """
    Extract a specific section from a PDF file and clean it up.

    Text blocks are read page by page. Capturing starts with the first
    block containing start_phrase (that block is included) and stops at
    the first later block containing end_phrase (that block is excluded).
    Scanning ends as soon as the end phrase is found, so later pages are
    neither read nor able to restart the capture.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        start_phrase: Text marking the first block of the section.
        end_phrase: Text marking the block just after the section.

    Returns:
        The captured text with page markers, dot leaders and trailing page
        numbers removed, wrapped lines re-joined and trailing whitespace
        stripped. Empty string if start_phrase is never found.
    """
    captured_blocks = []
    capture = False
    section_finished = False

    # The context manager guarantees the document is closed even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("blocks"):
                block_text = block[4].strip()  # Extract text from the block
                if not block_text:
                    continue
                # Check for the start and end of the section
                if not capture and start_phrase in block_text:
                    capture = True
                elif capture and end_phrase in block_text:
                    section_finished = True
                    break
                if capture:
                    captured_blocks.append(block_text)
            if section_finished:
                # Leave the page loop too; a bare inner break would keep
                # scanning every remaining page of the document.
                break

    # Every captured block is terminated by a newline.
    recommendations = "".join(block_text + "\n" for block_text in captured_blocks)

    # Cleanup process
    # Remove "Page <number>" markers; \d+ so that page numbers of 1000 and
    # above do not leave a stray digit behind.
    recommendations_cleaned = re.sub(r'Page\s+\d+', '', recommendations)
    # Remove dot leaders followed by page numbers
    recommendations_cleaned = re.sub(r'\.{2,}\s*\d+', '', recommendations_cleaned)
    # Remove 2 to 4 digit numbers at the end of lines
    recommendations_cleaned = re.sub(r'\s+\d{2,4}\s*$', '', recommendations_cleaned, flags=re.MULTILINE)
    recommendations_corrected = correct_word_wrapping(recommendations_cleaned)  # Correct word wrapping
    return remove_trailing_whitespace(recommendations_corrected)  # Remove trailing whitespace
def create_custom_diff(text1: str, text2: str) -> str:
    """
    Create a unified diff of two texts with 'file1:' / 'file2:' labels.

    Lines only present in text1 are prefixed with 'file1: ' (instead of
    '-') and lines only present in text2 with 'file2: ' (instead of '+').
    The two file header lines ('--- file1', '+++ file2') and the '@@'
    hunk markers are passed through unchanged.

    Args:
        text1: Contents of the first file.
        text2: Contents of the second file.

    Returns:
        The relabelled diff as a single newline-joined string, or an empty
        string when the texts have no differing lines.
    """
    text1_lines = text1.splitlines()
    text2_lines = text2.splitlines()

    # Generate a diff without additional context lines
    diff = difflib.unified_diff(text1_lines, text2_lines, lineterm='',
                                fromfile='file1', tofile='file2',
                                n=0)  # 'n=0' for no context lines

    # Replace the '-' and '+' markers with 'file1' and 'file2' labels
    custom_diff = []
    for index, line in enumerate(diff):
        if index < 2:
            # The first two lines are the '---' / '+++' file headers. They
            # are identified by position, not prefix, because a removed
            # content line such as '-- x' is also rendered as '--- x'.
            custom_diff.append(line)
        elif line.startswith('-'):
            custom_diff.append('file1: ' + line[1:])
        elif line.startswith('+'):
            custom_diff.append('file2: ' + line[1:])
        else:
            custom_diff.append(line)

    return '\n'.join(custom_diff)
def main(file1: str, file2: str) -> None:
    """
    Extract, clean and diff the recommendation sections of two PDFs.

    Writes two files into the current working directory:
    'cleaned.txt' with the cleaned text extracted from both PDFs, and
    'cis_benchmarks_diff.txt' with a timestamp followed by the diff.

    Args:
        file1: Path to the first CIS benchmark PDF.
        file2: Path to the second CIS benchmark PDF.
    """
    # Start and end phrases for the extraction
    # NOTE(review): presumably these match table-of-contents entries with
    # dot leaders; confirm against the actual PDFs.
    start_phrase = "Recommendations ..."
    end_phrase = "Appendix: Summary Table ..."

    # Extract recommendations from both PDFs
    recommendations_file1 = extract_recommendations_fitz(file1, start_phrase, end_phrase)
    recommendations_file2 = extract_recommendations_fitz(file2, start_phrase, end_phrase)

    # Write the cleaned and corrected data to a file.
    # UTF-8 is set explicitly: PDF text can contain characters the
    # platform default encoding cannot represent, which would otherwise
    # raise UnicodeEncodeError while writing.
    with open('cleaned.txt', 'w', encoding='utf-8') as file:
        file.write("Cleaned Data from file 1 PDF:\n\n")
        file.write(recommendations_file1)
        file.write("\n\nCleaned Data from file 2 PDF:\n\n")
        file.write(recommendations_file2)
    print("Cleaned data file created: cleaned.txt")

    # Perform the custom diff
    diff_result = create_custom_diff(recommendations_file1, recommendations_file2)

    # Write the diff result to a file with a timestamp
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open('cis_benchmarks_diff.txt', 'w', encoding='utf-8') as file:
        file.write(f"Diff generated on: {timestamp}\n\n")
        file.write(diff_result)
    print("Diff file created: cis_benchmarks_diff.txt")
if __name__ == "__main__":
    # Script entry point: exactly two PDF paths must follow the script name.
    if len(sys.argv) == 3:
        file1, file2 = sys.argv[1], sys.argv[2]
        main(file1, file2)
    else:
        # Wrong argument count: show usage and exit with a failure status.
        print("Usage: python script.py <path_to_cis_benchmark_1_pdf> <path_to_cis_benchmark_2_pdf>")
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment