Skip to content

Instantly share code, notes, and snippets.

@teacup-on-rockingchair
Last active March 9, 2023 05:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save teacup-on-rockingchair/70831a9aeddf20ed5bb44e6b99bcdd3e to your computer and use it in GitHub Desktop.
Save teacup-on-rockingchair/70831a9aeddf20ed5bb44e6b99bcdd3e to your computer and use it in GitHub Desktop.
Parse the PCI DSS PDF and turn it into a control template
import sys
import re
import os
import pdfplumber
import itertools
# Static preamble of the generated control file; it is printed once for the
# root of the control tree, before the per-requirement entries.
HEADER_TEMPLATE = """policy: PCI-DSS
title: Configuration Recommendations of a GNU/Linux System
id: pcidss_4
version: '4'
source: https://docs-prv.pcisecuritystandards.org/PCI%20DSS/Standard/PCI-DSS-v4_0.pdf
levels:
- id: base
controls:"""
# Requirement index at the very start of a table cell: an optional single
# word character (e.g. the appendix letter), a dotted run of digits, and the
# whitespace that follows it. Written as a raw string so the backslashes are
# regex escapes and not (invalid) string escapes.
REQ_INDEX_REGEX = r'\w{0,1}[\d\.]*\d+\s'
class ComplianceControlNode:
    """One requirement of the standard, stored as a node of the control tree.

    A node is built from the raw text of a requirement. The leading index
    (matched with ``REQ_INDEX_REGEX``) becomes the node id, the remaining text
    is reduced to a one-line title, and the node attaches itself to the
    closest ancestor whose id is a dotted prefix of its own id. A node created
    with ``title=None`` has no id and acts as the root.
    """

    # pattern to split a parsed description from the pdf doc to a title and notes
    # intentionally ordered by length so the more specific is checked first
    title_split_patterns = [
        r'([\S\n\s]*)including[,]* but not limited to failure of:\s*\n',
        r'([\S\n\s]*)including[,]* but not limited to:\s*\n',
        r'([\S\n\s]*)[,]* including mechanisms that are:\s*\n',
        r'([\S\n\s]*)[,]* that meets the following:\s*\n',
        r'([\S\n\s]*)[,]* code changes are:\s*\n',
        r'([\S\n\s]*)[,]* that includes:\s*\n',
        r'([\S\n\s]*)[,]* and includes:\s*\n',
        r'([\S\n\s]*)[,]* and include:\s*\n',
        r'([\S\n\s]*)[,]* as follows:\s*\n',
        r'([\S\n\s]*)[,]* including:\s*\n',
        r'([\S\n\s]*)[,]* such that:\s*\n',
        r'([\S\n\s]*)[,]* are:\s*\n',
        r'([\S\n\s]*)[,]* is:\s*\n',
        r'([\S\n\s]*):\s*\n',
        r'([\S\n\s]*)PCI DSS Reference:\s*',
    ]

    def __init__(self, title, root, parent_candidate):
        """Create a node and link it into the tree.

        Args:
            title: raw requirement text starting with its index, or None for
                the root node.
            root: root node of the tree; used as parent when no ancestor fits.
            parent_candidate: most recently added node; the search for a
                parent starts there and climbs towards the root.

        Raises:
            ValueError: if ``title`` is given but does not start with a
                requirement index.
        """
        self.id = None
        if title is not None:
            index_match = re.match(REQ_INDEX_REGEX, title)
            if index_match is None:
                raise ValueError(
                    "requirement text does not start with an index: %r" % title[:40])
            self.id = index_match[0].strip()
        self.root = root
        self.depth = 0
        self.indent = '  '  # 2 spaces indentation
        self.title = None  # stays None for the root, which has no text
        self.set_title(title)
        self.parent = self.choose_parent(parent_candidate)
        self.controls = []
        self.wrap_length = 98
        if self.parent is not None:
            self.parent.add_subcontrol(self)
            self.depth = self.parent.depth + 1

    # drop notes separators from description to get title
    def set_title(self, title_candidate):
        """Derive the one-line title from the raw requirement text.

        The text is cut at the first matching entry of
        ``title_split_patterns``, line breaks are replaced by single spaces
        and a trailing comma is turned into a period. ``None`` is ignored.
        """
        if title_candidate is None:
            return
        for splitter in ComplianceControlNode.title_split_patterns:
            title_match = re.match(splitter, title_candidate)
            if title_match is not None:
                title_candidate = title_match[1]
                break
        # A line break separates two words; join with a space, not with
        # nothing, so the words on both sides are not glued together.
        self.title = re.sub(r"\s*\n\s*", " ", title_candidate.strip())
        self.title = re.sub(",$", ".", self.title)

    def add_subcontrol(self, node):
        """Append ``node`` to the list of direct children."""
        self.controls.append(node)

    def get_id(self):
        """Return the requirement index of this node (None for the root)."""
        return self.id

    # NOTE: the former ``controls()`` accessor was removed: the ``controls``
    # list assigned in ``__init__`` always shadowed it on instances, so it
    # could never be called successfully.

    def is_child_of(self, parent_candidate_id):
        """Tell whether ``parent_candidate_id`` is a dotted prefix of this id.

        ``"11.2.1"`` is a child of ``"11.2"`` but not of ``"1.2"`` and not of
        itself. Returns False when either id is missing.
        """
        if not parent_candidate_id or not self.id:
            return False
        return self.id.startswith(parent_candidate_id + ".")

    def choose_parent(self, parent_candidate):
        """Climb from ``parent_candidate`` to the first ancestor of this node.

        Returns the first node on the way up whose id is a dotted prefix of
        this node's id, or ``self.root`` when the climb reaches a node without
        an id (or runs out of nodes).
        """
        while parent_candidate is not None and parent_candidate.get_id() is not None:
            if self.is_child_of(parent_candidate.get_id()):
                return parent_candidate
            parent_candidate = parent_candidate.parent
        return self.root

    def choose_next(self, visited):
        """Return the node that follows this one in a depth-first walk.

        Args:
            visited: container of nodes already handed out by the walk; only
                membership tests are performed on it.

        Returns:
            The first child if it was not visited yet, otherwise the next
            sibling, otherwise whatever follows the parent; None when the
            walk is finished.
        """
        if self.controls and self.controls[0] not in visited:
            return self.controls[0]
        if self.parent is None:
            # Root without (unvisited) children: nothing left to walk.
            return None
        siblings = self.parent.controls
        current_idx = siblings.index(self)
        if current_idx + 1 < len(siblings):
            return siblings[current_idx + 1]
        if self.parent.get_id() is None:
            return None
        return self.parent.choose_next(visited)

    def wrap_line(self, line, indent_length):
        """Break ``line`` at spaces so it fits into ``self.wrap_length``.

        Continuation lines are prefixed with ``indent_length`` spaces. A word
        longer than the available width is kept whole and the break happens
        at the next space (or not at all if there is none).
        """
        if not line:
            return ""
        extra_indent = indent_length + len(" title: ")
        if (len(line) + extra_indent) <= self.wrap_length:
            return line
        # Never let the window shrink to nothing on deeply nested nodes.
        wrap_line_length = max(self.wrap_length - extra_indent, 1)
        chunks = []
        rest = line
        while rest and (len(rest) + extra_indent) > self.wrap_length:
            last_space = rest.rfind(' ', 0, wrap_line_length)
            if last_space == -1:
                # No space inside the window: overlong word, break after it.
                last_space = rest.find(' ', wrap_line_length)
                if last_space == -1:
                    break
            chunks.append(rest[:last_space + 1])
            rest = rest[last_space + 1:]
        chunks.append(rest)
        return ("\n" + indent_length * " ").join(chunks)

    def print(self):
        """Write this node to stdout as one YAML control entry."""
        pad = self.indent * self.depth
        # A single quote inside a single-quoted YAML scalar must be doubled.
        escaped_title = (self.title or "").replace("'", "''")
        wrapped_title = self.wrap_line(escaped_title, (self.depth + 2) * len(self.indent))
        # Keys are aligned with "id", i.e. two columns right of the dash.
        print("%s- id: Req-%s" % (pad, self.id))
        print("%s  title: '%s'" % (pad, wrapped_title))
        print("%s  levels: " % pad)
        print("%s    - base" % pad)
        print("%s  status: not applicable" % pad)
        if self.controls:
            # Children print themselves right after this line.
            print("%s  controls:" % pad)
        else:
            print("%s  rules: []" % pad)
            print("")
class ComplianceControlTree:
    """Tree of requirement nodes, filled in document order.

    The tree keeps an id-less root node and remembers the node added last,
    which is where the parent search for the next node starts.
    """

    def __init__(self):
        self.root = ComplianceControlNode(None, None, None)
        self.last = self.root

    def add_node(self, req_title):
        """Create a node from the raw requirement text and attach it."""
        self.last = ComplianceControlNode(req_title, self.root, self.last)

    def walk(self, hook):
        """Call ``hook(node)`` for every node, depth first, root included."""
        node = self.root
        # A set keeps the "already visited" test constant-time.
        visited = set()
        while node is not None:
            hook(node)
            visited.add(node)
            if node is self.root and not node.controls:
                # Empty tree: the root has no parent to continue from.
                break
            node = node.choose_next(visited)
# Module-level tree that receives every requirement parsed from the PDF.
controlTree = ComplianceControlTree()
def extract_spec_tables(doc):
    """Return all tables of ``doc`` as one flat list, in page order.

    ``doc`` must expose ``pages``; each page must provide
    ``extract_tables()`` returning a list of tables. Pages without tables
    contribute nothing to the result.
    """
    # one list of tables per page, pages without tables left out
    tables_per_page = [
        page_tables
        for page_tables in (page.extract_tables() for page in doc.pages)
        if page_tables != []
    ]
    # merge the per-page lists into a single list of tables
    return list(itertools.chain.from_iterable(tables_per_page))
# every cell below the column header counts as description unless it starts with
# 'Customized Approach Objective', 'Applicability Notes' or 'Defined Approach Requirements'
def element_sub_column_name(doc_element_str):
    """Return True when the cell text starts with a sub-column heading.

    Empty or missing text yields False.
    """
    if doc_element_str:
        sub_column_heading = re.compile('(?:Customized Approach Objective|Applicability Notes|Defined Approach Requirements)')
        return sub_column_heading.match(doc_element_str) is not None
    return False
# detect the start of a requirement: text that begins with a requirement index
def element_startw_with_req_idx(doc_element_str):
    """Return True when the text begins with a match of REQ_INDEX_REGEX.

    Empty or missing text yields False.
    """
    if doc_element_str:
        return re.match(REQ_INDEX_REGEX, doc_element_str) is not None
    return False
# add new requirement to the list of already parsed ones
def update_list_req(list_req, req_index, doc_element):
    """Fold one table cell into the list of requirement texts.

    Args:
        list_req: list of requirement texts collected so far; modified in
            place.
        req_index: index in ``list_req`` of the requirement currently being
            assembled, or a negative value when none was started yet.
        doc_element: content of the table cell; falsy cells are ignored.

    Returns:
        The index of the requirement that is current after this cell.

    A cell starting with a requirement index opens a new entry. Any other
    cell is appended to the current entry, except sub-column headings and
    text that shows up before the first requirement, which are skipped.
    """
    if not doc_element:
        return req_index
    text = str(doc_element)
    if element_sub_column_name(text):
        return req_index
    if element_startw_with_req_idx(text):
        list_req.append(text)
        return len(list_req) - 1
    if req_index < 0 or req_index >= len(list_req):
        # Continuation text with no requirement to attach it to.
        return req_index
    current = list_req[req_index]
    # Keep cells apart so a sentence does not start right after a period.
    if current and text and not current[-1].isspace() and not text[0].isspace():
        current += " "
    list_req[req_index] = current + text
    return req_index
# sys.argv[0] is the script itself, so a usable path needs a second entry
if len(sys.argv) < 2:
    sys.exit("Please specify path to the PDF file to be processed!")
pci_dss_pdf = os.path.realpath(sys.argv[1])
# open the input
try:
    pdfdox = pdfplumber.open(pci_dss_pdf)
except Exception:
    # Whatever the reason (missing file, not a PDF, ...), the input is unusable.
    sys.exit("Not valid PDF passed as input!")
# the extracted tables are plain lists, so the document can be closed right away
with pdfdox:
    spec_tbls = extract_spec_tables(pdfdox)
# for every table with column 'Requirements and Testing Procedures', get the column index and
# consider title the first field of same column starting with decimal-dotted number
list_req = []
tbl_count = 0
req_index = -1
for tbl in spec_tbls:
    index = -1
    for el in tbl:
        if index == -1:
            # still looking for the header row of this table
            try:
                index = el.index('Requirements and Testing Procedures')
            except ValueError:
                pass
        else:
            req_index = update_list_req(list_req, req_index, el[index])
    tbl_count += 1
# build the tree in document order, so each parent precedes its children
for reqs in list_req:
    controlTree.add_node(reqs)
def print_node(node):
    """Walk hook: print the file header for an id-less node, else the node itself."""
    if not node.get_id():
        print(HEADER_TEMPLATE)
        return
    node.print()
# Traverse the tree starting at the root; print_node emits the header for the
# id-less root and a control entry for every other node.
controlTree.walk(print_node)
@ggbecker
Copy link

In
https://gist.github.com/teacup-on-rockingchair/70831a9aeddf20ed5bb44e6b99bcdd3e#file-parse_pci_dss_standard-py-L58

you could replace \n with a space instead

self.title = re.sub("\n", " ", title_candidate.strip())

Also, the extra content in the title needs to be appended after a space; otherwise, phrases start right after a period, for example:

9.2.4 Access to consoles in sensitive areas is restricted via locking when not in use.Physical consoles within sensitive areas cannot be used by unauthorized personnel.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment