Skip to content

Instantly share code, notes, and snippets.

@urimerhav
Created October 28, 2024 19:50
Show Gist options
  • Save urimerhav/4cf7bd066178170d58f78ee6b6dc752a to your computer and use it in GitHub Desktop.
Save urimerhav/4cf7bd066178170d58f78ee6b6dc752a to your computer and use it in GitHub Desktop.
Using the DocuPanda Review API to find PII in a document and redact every occurrence that comes up
import os
import dotenv
import requests
dotenv.load_dotenv()
class PiiBoundingBox:
    """Rectangle marking one occurrence of PII on a single document page.

    Coordinates are fractional, so they must be multiplied by the rendered
    page width/height to obtain pixel positions.

    Attributes:
        x1, y1: upper-left corner (fractional coordinates).
        x2, y2: lower-right corner (fractional coordinates).
        page: 0-indexed page number; the constructor argument is decremented
            by one before it is stored.
        pii_type: type of PII (account number, etc.).
    """

    def __init__(self, x1: float, y1: float, x2: float, y2: float, page: int, pii_type: str):
        self.x1 = x1  # upper left (fractional coordinate)
        self.y1 = y1  # upper left (fractional coordinate)
        self.x2 = x2  # lower right (fractional coordinate)
        self.y2 = y2  # lower right (fractional coordinate)
        self.page = page - 1  # make page 0 indexed
        self.pii_type = pii_type  # type of PII (account number, etc.)

    def __repr__(self) -> str:
        # Angle-bracket form on purpose: ``page`` is shown as stored
        # (0-indexed), so this is not a round-trippable constructor call.
        return (
            f"<{type(self).__name__} pii_type={self.pii_type!r} "
            f"page={self.page!r} "
            f"box=({self.x1!r}, {self.y1!r}, {self.x2!r}, {self.y2!r})>"
        )
def get_redactions(review_id, timeout: float = 30.0) -> list[PiiBoundingBox]:
    """Fetch a DocuPanda review and return one bounding box per PII region.

    Args:
        review_id: Identifier of the review to fetch; sent as the
            ``review_id`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of ``PiiBoundingBox``, one for every bounding box of every
        occurrence of every PII entry in the response.

    Raises:
        KeyError: If ``DOCUPANDA_API_KEY`` is not set in the environment, or
            the response JSON lacks one of the keys read below.
        requests.HTTPError: If the service answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    api_key = os.environ['DOCUPANDA_API_KEY']
    response = requests.get(
        "https://app.docupanda.io/review",
        # Let requests build the query string so review_id is URL-encoded.
        params={"review_id": review_id},
        headers={
            "accept": "application/json",
            "X-API-Key": api_key,
        },
        # Without a timeout requests may block forever on a stalled server.
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of hitting a KeyError on an error body.
    response.raise_for_status()
    payload = response.json()

    redaction_list = []
    for pii_payload in payload['data']['pii']:
        for occurrence in pii_payload['occurrences']:
            review_span = occurrence['textSpan']['review']
            # NOTE(review): page is presumably 1-indexed in the API response;
            # PiiBoundingBox subtracts one when storing it.
            page = review_span['page']
            for bbox in review_span['boundingBoxes']:
                # Each bbox is read as [x1, y1, x2, y2].
                redaction_list.append(PiiBoundingBox(
                    x1=bbox[0],
                    y1=bbox[1],
                    x2=bbox[2],
                    y2=bbox[3],
                    page=page,
                    pii_type=pii_payload['type']['value'],
                ))
    return redaction_list
def redact_pdf(pdf_content: bytes, redactions: list["PiiBoundingBox"]) -> bytes:
    """Black out the given regions of a PDF and return the new PDF bytes.

    Every page is rendered to an image, black rectangles are painted over
    the redacted regions, and the images are written back out as a PDF.
    The output pages are therefore images, not the original page content.

    Args:
        pdf_content: Raw bytes of the source PDF.
        redactions: Regions to black out. ``page`` is 0-indexed and the
            coordinates are fractions of the page width/height. Redactions
            whose page index does not match a rendered page are ignored.

    Returns:
        ``pdf_content`` unchanged when ``redactions`` is empty; otherwise the
        bytes of the redacted PDF, or ``b''`` if no pages were rendered.
    """
    # Nothing to redact: return the input untouched, before pulling in the
    # rendering dependencies.
    if not redactions:
        return pdf_content

    import io
    import math
    from collections import defaultdict

    from pdf2image import convert_from_bytes
    from PIL import ImageDraw

    # Render and save at the same resolution so the output pages keep the
    # physical size of the input pages (200 is pdf2image's default dpi,
    # while Pillow's PDF writer would otherwise assume 72).
    dpi = 200

    # Convert PDF to images
    images = convert_from_bytes(pdf_content, dpi=dpi)

    # Group redactions by page
    redactions_by_page = defaultdict(list)
    for redaction in redactions:
        redactions_by_page[redaction.page].append(redaction)

    # Paint the redactions onto each page image (in place)
    for page_number, image in enumerate(images):
        page_redactions = redactions_by_page.get(page_number)
        if not page_redactions:
            continue
        draw = ImageDraw.Draw(image)
        width, height = image.size
        for redaction in page_redactions:
            # Coordinates are fractional; scale to pixels. Sorting tolerates
            # corners supplied in reverse order, which recent Pillow
            # releases would otherwise reject.
            left, right = sorted((redaction.x1 * width, redaction.x2 * width))
            top, bottom = sorted((redaction.y1 * height, redaction.y2 * height))
            # Round outwards so the rectangle never covers less than the
            # requested region.
            draw.rectangle(
                [math.floor(left), math.floor(top), math.ceil(right), math.ceil(bottom)],
                fill='black',
            )

    if not images:
        return b''

    # Convert images back to a single multi-page PDF
    with io.BytesIO() as output_stream:
        images[0].save(
            output_stream,
            format='PDF',
            save_all=True,
            append_images=images[1:],
            resolution=dpi,
        )
        return output_stream.getvalue()
def main(
    file_path: str = '/var/data/docupanda/data/PII (uri)/bofa_statement.pdf',
    review_id: str = '3ec69589',
    output_path: str = 'redacted.pdf',
) -> None:
    """Redact the PII found by a DocuPanda review from a PDF on disk.

    The defaults reproduce the original hard-coded example, so calling
    ``main()`` with no arguments behaves as before.

    Args:
        file_path: Path of the PDF to redact.
        review_id: DocuPanda review whose PII findings are applied.
        output_path: Where the redacted PDF is written (overwritten if it
            already exists).
    """
    redactions = get_redactions(review_id=review_id)
    with open(file_path, 'rb') as file:
        pdf_content = file.read()
    redacted_pdf = redact_pdf(pdf_content, redactions)
    with open(output_path, 'wb') as file:
        file.write(redacted_pdf)
# Run the redaction example only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment