Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save JayDoubleu/2ed4345bdc89811e5f355e9494727755 to your computer and use it in GitHub Desktop.
Save JayDoubleu/2ed4345bdc89811e5f355e9494727755 to your computer and use it in GitHub Desktop.
ansible_regex.py
import fnmatch
import os
#import re2 as re
import regex as re
import subprocess
from subprocess import PIPE, STDOUT
import argparse
import yaml
import csv
# Command-line interface.
# --levels: when given, benchmark items for which no "Level N" marker can be
# found are skipped; without it they are emitted with the level 'None'.
parser = argparse.ArgumentParser()
parser.add_argument("--levels", required=False, action='store_true')
args = parser.parse_args()


class _NoAliasSafeDumper(yaml.dumper.SafeDumper):
    """SafeDumper that never emits YAML anchors/aliases.

    The same tag list object is attached to several plays before dumping;
    without this PyYAML would serialise the repeats as ``&id001`` / ``*id001``.
    A subclass is used so the shared ``SafeDumper`` class itself is not
    monkey-patched for every other user of PyYAML in the process.
    """

    def ignore_aliases(self, data):
        return True


# Dumper class handed to yaml.dump() when writing the generated playbooks.
noalias_dumper = _NoAliasSafeDumper

# Working directories (relative to the current working directory).
input_folder = "./input/"          # benchmark PDFs are read from here
output_folder = "./output/"        # one sub-directory per PDF is written here
playbooks_folder = "./playbooks/"  # existing playbooks whose plays are re-used

# Filled in by the discovery code further down the script.
pdfs = []       # PDF file names found in input_folder
playbooks = []  # YAML file names found in playbooks_folder
plays = []      # every top-level item loaded from those YAML files
def growing(s):
    """Yield a 'cis_section'-prefixed tag for each dotted prefix of *s*.

    For ``"1.2.3"`` the generator produces ``cis_section1``,
    ``cis_section1.2`` and ``cis_section1.2.3``, in that order.
    """
    prefix = None
    for piece in s.split("."):
        # Extend the running prefix by one more dotted component.
        prefix = piece if prefix is None else prefix + "." + piece
        yield 'cis_section' + prefix
# Collect the PDF file names to process.
pdfs.extend(
    entry for entry in os.listdir(input_folder)
    if fnmatch.fnmatch(entry, '*.pdf')
)

# Collect the YAML files that hold already-written plays.
playbooks.extend(
    entry for entry in os.listdir(playbooks_folder)
    if fnmatch.fnmatch(entry, '*.yml') or fnmatch.fnmatch(entry, '*.yaml')
)

# Flatten every top-level item of every playbook file into `plays`.
for playbook_name in playbooks:
    with open(playbooks_folder + playbook_name) as playbook_file:
        # safe_load: yaml.load() without an explicit Loader is deprecated and
        # unsafe, and raises TypeError on PyYAML 6.
        loaded = yaml.safe_load(playbook_file)
    # An empty YAML document loads as None; treat it as "no plays".
    plays.extend(loaded or [])

# Make sure an output directory named after each PDF exists.
for pdf in pdfs:
    directory = output_folder + pdf.replace('.pdf', '')
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(directory, exist_ok=True)
# Matches an index-style entry in the extracted PDF text:
#   group 1 - dotted section number (e.g. "1.1.2")
#   group 2 - title text, which may wrap over line breaks
#   group 3 - trailing 2-3 digit page number at the end of a line
# Raw string: the pattern is unchanged, only the invalid string escapes are gone.
regex_titles = r"^\s*((?:[0-9]+\.)+[0-9]+)\ ((?:[0-9a-zA-Z\-\/(),+\"_'\ ](?:(?:\.\ )|\.[a-zA-Z0-9])?[r'\r\n|\r|\n]{0,2})*)(?:(?:\ \.\ )|(?:\ \.*)|(?:\.{2,})|)\ ([0-9]{2,3}$)"
# Markers removed from a title before it is compared with existing play names.
title_markers = r'(\(Scored\)|\(Not Scored\)|\(L1\)|\(L2\))'
# Leading "<section number> -" prefix carried by an existing play name.
play_number_prefix = r'(^\s*((?:[0-9]+\.)+[0-9]+)\s*\-)'

for pdf in pdfs:
    pdf_path = input_folder + pdf
    benchmark = pdf.replace('.pdf', '')
    yaml_path = output_folder + benchmark + '/' + pdf.replace('.pdf', '.yml')
    csv_path = output_folder + benchmark + '/' + pdf.replace('.pdf', '.csv')

    print('Processing ' + pdf_path)
    # Dump the PDF's text by grepping for "any character".
    command = ["pdfgrep", ".", pdf_path]
    pdf_contents = subprocess.check_output(command).decode('utf-8')

    for match in re.finditer(regex_titles, pdf_contents, re.MULTILINE):
        # Collapse runs of whitespace left over from the PDF layout.
        task_no = re.sub(r'\s{2,}', ' ', match.group(1)).strip()
        task_title = re.sub(r'\s{2,}', ' ', match.group(2)).strip()
        task_page = re.sub(r'\s{2,}', ' ', match.group(3)).strip()

        # '(Not Scored)' is tested first so it wins if both markers appear,
        # matching the order of the original independent checks.
        if '(Not Scored)' in task_title:
            scored = 'cis_notscored'
        elif '(Scored)' in task_title:
            scored = 'cis_scored'
        else:
            # Entries without a scoring marker are not emitted at all.
            continue

        # Find the line that starts with this section number and capture the
        # optional "Level N" that follows its text (group 4).
        # task_no is escaped so its dots match literal dots only.
        # The "(?!\.d)" lookahead is kept verbatim; it has no effect because
        # "\s" must match at the same position.
        level_pattern = (
            r'^((' + re.escape(task_no) + r'(?!\.d)\s))'
            r'[^\S\n]*((?:[^L\d]*(?!Level\s+\d+|^[^\n\S]*\d+(?:\.\d+){2,3})\S+\s+)+)'
            r'(Level\ [0-9]+)?'
        )
        level_match = re.search(level_pattern, pdf_contents, re.MULTILINE)
        # No match at all is treated the same as "no level found" instead of
        # crashing on None.group().
        level = level_match.group(4) if level_match else None
        if level is None:
            if args.levels:
                # --levels: only items with a detected level are emitted.
                continue
            level = 'None'

        task_title = re.sub(title_markers, '', task_title).strip()
        task_title = " ".join(task_title.split())
        current_play = task_no + ' - ' + task_title
        default_tags = [
            benchmark.lower(),
            'cis_page_' + task_page,
            'cis_' + level.lower().replace(' ', '_'),
            scored,
        ] + list(growing(task_no))

        # Re-use every existing play whose name (minus any leading section
        # number) equals this title, renaming and re-tagging it in place.
        data = []
        for play in plays:
            play_name = play.get('name') if isinstance(play, dict) else None
            if not isinstance(play_name, str):
                # Items without a usable name cannot be matched by title.
                continue
            if re.sub(play_number_prefix, '', play_name).strip() == task_title:
                play['name'] = current_play
                play['tags'] = default_tags
                data.append(play)
        if not data:
            # No existing play: emit a placeholder "ping" task instead.
            data = [{"name": current_play, "ping": None, "tags": default_tags}]

        # Both outputs are appended to, one record per benchmark item.
        with open(yaml_path, 'a') as outfile:
            yaml.dump(data, outfile, default_flow_style=False, width=1000, sort_keys=False, Dumper=noalias_dumper)
            outfile.write('\n')
        # newline='' is what the csv module requires of files it writes to.
        with open(csv_path, "a", newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=',')
            writer.writerow([task_no, task_title, level, task_page, scored, benchmark.lower()])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment