Created
October 28, 2024 19:50
-
-
Save urimerhav/4cf7bd066178170d58f78ee6b6dc752a to your computer and use it in GitHub Desktop.
Using DocuPanda Review and redacting everything that comes up
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import dotenv | |
import requests | |
dotenv.load_dotenv() | |
class PiiBoundingBox:
    """Rectangle covering one piece of PII on a single document page.

    The corner coordinates are stored exactly as given; the rest of this file
    treats them as fractions of the page width/height. ``page`` is accepted
    as a 1-based page number and stored 0-based.

    Args:
        x1: Horizontal position of the upper-left corner.
        y1: Vertical position of the upper-left corner.
        x2: Horizontal position of the lower-right corner.
        y2: Vertical position of the lower-right corner.
        page: 1-based page number the box belongs to.
        pii_type: Label for the kind of PII (account number, etc.).
    """

    def __init__(self, x1: float, y1: float, x2: float, y2: float, page: int, pii_type: str):
        self.x1 = x1  # upper left (fractional coordinate)
        self.y1 = y1  # upper left (fractional coordinate)
        self.x2 = x2  # lower right (fractional coordinate)
        self.y2 = y2  # lower right (fractional coordinate)
        self.page = page - 1  # make page 0 indexed
        self.pii_type = pii_type  # type of PII (account number, etc.)

    def __repr__(self) -> str:
        # Show the constructor arguments (so ``page`` is printed 1-based
        # again); evaluating the repr rebuilds an equivalent box.
        return (
            f"{type(self).__name__}("
            f"x1={self.x1!r}, y1={self.y1!r}, x2={self.x2!r}, y2={self.y2!r}, "
            f"page={self.page + 1!r}, pii_type={self.pii_type!r})"
        )
def get_redactions(review_id) -> "list[PiiBoundingBox]":
    """Fetch a DocuPanda review and flatten its PII hits into bounding boxes.

    Args:
        review_id: Identifier of the review; sent as the ``review_id`` query
            parameter.

    Returns:
        One ``PiiBoundingBox`` per bounding box of every occurrence of every
        PII entry listed under ``payload['data']['pii']``.

    Raises:
        KeyError: If ``DOCUPANDA_API_KEY`` is not set in the environment, or
            the response JSON lacks one of the keys read below.
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
    """
    api_key = os.environ['DOCUPANDA_API_KEY']
    response = requests.get(
        "https://app.docupanda.io/review",
        # Let requests build the query string so the id is URL-encoded.
        params={"review_id": review_id},
        headers={
            "accept": "application/json",
            "X-API-Key": api_key,
        },
        # Without a timeout requests can block forever on a stalled server.
        timeout=30,
    )
    # Fail here on an HTTP error instead of with a confusing KeyError below.
    response.raise_for_status()
    payload = response.json()

    redaction_list = []
    for pii_payload in payload['data']['pii']:
        pii_type = pii_payload['type']['value']
        for occurrence in pii_payload['occurrences']:
            review_span = occurrence['textSpan']['review']
            page = review_span['page']
            for bbox in review_span['boundingBoxes']:
                # Each bbox is indexed as [x1, y1, x2, y2].
                redaction_list.append(PiiBoundingBox(
                    x1=bbox[0],
                    y1=bbox[1],
                    x2=bbox[2],
                    y2=bbox[3],
                    page=page,
                    pii_type=pii_type,
                ))
    return redaction_list
def _to_pixel_box(redaction, width: int, height: int) -> list:
    """Scale a fractional box to pixel corners, rounded outward and ordered."""
    import math

    # Sort each axis so an inverted box still yields a valid rectangle.
    left, right = sorted((redaction.x1 * width, redaction.x2 * width))
    top, bottom = sorted((redaction.y1 * height, redaction.y2 * height))
    # Floor the near edge and ceil the far edge: the painted box may be up to
    # one pixel larger than requested but never smaller.
    return [math.floor(left), math.floor(top), math.ceil(right), math.ceil(bottom)]


def redact_pdf(pdf_content: bytes, redactions: "list[PiiBoundingBox]", *, dpi: int = 200) -> bytes:
    """Return a rasterised copy of a PDF with black boxes over each redaction.

    Every page is rendered to an image, the boxes for that page are painted
    black, and the images are written back out as a PDF. The output therefore
    contains page images only.

    Args:
        pdf_content: Raw bytes of the source PDF.
        redactions: Boxes to black out; ``page`` is the 0-based page index and
            the coordinates are fractions of the page width/height.
        dpi: Resolution used both to render the pages and to write the output
            PDF, so the physical page size is preserved.

    Returns:
        ``pdf_content`` unchanged when ``redactions`` is empty, the redacted
        PDF bytes otherwise, or ``b''`` if the PDF rendered to no pages.
    """
    if not redactions:
        return pdf_content

    # Imported after the guard so the no-op path needs no third-party packages.
    import io
    from pdf2image import convert_from_bytes
    from PIL import ImageDraw

    # Convert PDF to images
    images = convert_from_bytes(pdf_content, dpi=dpi)

    # Group redactions by page
    redactions_by_page = {}
    for redaction in redactions:
        redactions_by_page.setdefault(redaction.page, []).append(redaction)

    # NOTE(review): redactions whose page index is beyond the rendered pages
    # are ignored, as before — confirm that is acceptable for callers.
    for page_number, image in enumerate(images):
        page_redactions = redactions_by_page.get(page_number)
        if not page_redactions:
            continue
        draw = ImageDraw.Draw(image)
        width, height = image.size
        for redaction in page_redactions:
            # Draw black rectangle
            draw.rectangle(_to_pixel_box(redaction, width, height), fill='black')

    if not images:
        return b''

    # Convert images back to PDF
    with io.BytesIO() as output_stream:
        images[0].save(
            output_stream,
            format='PDF',
            save_all=True,
            append_images=images[1:],
            # Match the render DPI; Pillow otherwise assumes 72 DPI and the
            # pages come out physically larger than the originals.
            resolution=dpi,
        )
        return output_stream.getvalue()
def main(
    file_path: str = '/var/data/docupanda/data/PII (uri)/bofa_statement.pdf',
    review_id: str = '3ec69589',
    output_path: str = 'redacted.pdf',
) -> None:
    """Redact one PDF using the PII found by a DocuPanda review.

    Fetches the redactions for ``review_id``, reads the PDF at ``file_path``,
    blacks out the flagged regions and writes the result to ``output_path``.
    The defaults reproduce the values this script was originally hard-coded
    with.

    Args:
        file_path: Path of the PDF to redact.
        review_id: DocuPanda review whose PII hits should be redacted.
        output_path: Where the redacted PDF is written.
    """
    redactions = get_redactions(review_id=review_id)
    with open(file_path, 'rb') as file:
        pdf_content = file.read()
    redacted_pdf = redact_pdf(pdf_content, redactions)
    with open(output_path, 'wb') as file:
        file.write(redacted_pdf)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment