Skip to content

Instantly share code, notes, and snippets.

@pdavis20008
Last active January 13, 2025 18:19
Show Gist options
  • Select an option

  • Save pdavis20008/796d66f8221ec9b7e729276fc7d5dd0a to your computer and use it in GitHub Desktop.

Select an option

Save pdavis20008/796d66f8221ec9b7e729276fc7d5dd0a to your computer and use it in GitHub Desktop.
Detecting and Protecting PII in AWS
# If you use this example, be sure to update the path you store it in in the Lambda module configuration
import boto3
import os
# Module-level clients are created once per Lambda execution environment
# and reused across warm invocations (cheaper than per-invoke creation).
s3 = boto3.client('s3')
comprehend = boto3.client('comprehend')
def lambda_handler(event, context):
    """Redact PII from an uploaded S3 object and write the result to S3.

    Triggered by an S3 ObjectCreated event. Reads the uploaded object,
    detects PII entities with Amazon Comprehend, replaces each detected
    span with '[REDACTED]', and writes the result to the bucket named by
    the OUTPUT_BUCKET environment variable under a 'redacted/' prefix.

    Returns:
        dict with 'statusCode' (200) and a 'body' message naming the
        s3:// location of the redacted file.
    """
    # Object keys in S3 event notifications are URL-encoded (spaces
    # arrive as '+', special characters percent-encoded). Decode before
    # calling the S3 API, or get_object fails on such keys.
    from urllib.parse import unquote_plus

    record = event['Records'][0]['s3']
    input_bucket = record['bucket']['name']
    input_key = unquote_plus(record['object']['key'])
    output_bucket = os.environ['OUTPUT_BUCKET']

    # Read the whole object into memory; assumes UTF-8 text within the
    # Lambda's memory and Comprehend's document-size limits -- TODO confirm.
    s3_object = s3.get_object(Bucket=input_bucket, Key=input_key)
    text = s3_object['Body'].read().decode('utf-8')

    # Each detected entity carries Type, BeginOffset, EndOffset.
    detected = comprehend.detect_pii_entities(Text=text, LanguageCode='en')

    # Redact from the end of the text backwards so earlier offsets remain
    # valid as the string length changes with each replacement.
    redacted_text = text
    for entity in sorted(detected['Entities'], key=lambda x: x['BeginOffset'], reverse=True):
        # NOTE(review): 'NAME' entities are deliberately left unredacted
        # here -- confirm this matches the intended redaction policy.
        if entity['Type'] != 'NAME':
            start = entity['BeginOffset']
            end = entity['EndOffset']
            redacted_text = redacted_text[:start] + '[REDACTED]' + redacted_text[end:]

    # Write under 'redacted/'; the S3 trigger filters on 'unredacted/',
    # so this write cannot re-trigger the function in a loop.
    output_file = f"redacted/{input_key.split('/')[-1]}"
    s3.put_object(
        Bucket=output_bucket,
        Key=output_file,
        Body=redacted_text,
        ContentType='text/csv',  # assumes inputs are CSV -- TODO confirm
    )
    return {
        'statusCode': 200,
        'body': f'Redacted file saved at s3://{output_bucket}/{output_file}'
    }
# Require the official AWS provider; any 4.x-or-later release is accepted.
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = ">= 4.0.0"
}
}
}
# All resources in this configuration are created in us-east-1.
provider "aws" {
region = "us-east-1"
}
# Common tags applied to every taggable resource below.
locals {
tags = {
Purpose = "patrick-data-blog"
}
}
# Identity of the calling account; declared but not referenced anywhere in
# the visible configuration -- presumably retained for convenience when
# building ARNs or policies. Safe to remove if truly unused -- TODO confirm.
data "aws_caller_identity" "current" {}
# In a real-world application, apply least-privilege permissions with a bucket policy, and also preferably store the
# raw and processed data in separate buckets.
# Single S3 bucket holding both the raw uploads ("unredacted/" prefix) and
# the Lambda's output ("redacted/" prefix).
module "pii_bucket" {
source = "terraform-aws-modules/s3-bucket/aws"
version = "4.3.0"
bucket = "<bucket name>" # Replace the placeholder with your desired bucket name. Must be globally unique.
# Deny any request not made over TLS.
attach_deny_insecure_transport_policy = true
# Lets `terraform destroy` delete the bucket even when non-empty --
# convenient for a demo, dangerous for real data.
force_destroy = true
server_side_encryption_configuration = {
rule = {
apply_server_side_encryption_by_default = {
sse_algorithm = "AES256" # In a real-world application, use a KMS Customer-Managed Key with least-privilege permissions.
}
}
}
versioning = {
mfa_delete = false
enabled = true
}
tags = local.tags
}
# Invoke the redaction Lambda whenever an object lands under "unredacted/".
# The Lambda writes its output under "redacted/", so the notification
# cannot re-trigger it in a loop.
resource "aws_s3_bucket_notification" "pii_bucket" {
bucket = module.pii_bucket.s3_bucket_id
lambda_function {
lambda_function_arn = module.comprehend_lambda.lambda_function_arn
events = ["s3:ObjectCreated:*"]
filter_prefix = "unredacted/"
}
# Ensure the function and its s3.amazonaws.com invoke permission (created
# by the Lambda module via allowed_triggers) exist before the notification.
depends_on = [ module.comprehend_lambda ]
}
# Packages and deploys the PII-redaction Lambda via the community
# terraform-aws-modules/lambda module.
module "comprehend_lambda" {
source = "terraform-aws-modules/lambda/aws"
version = "7.20.0"
function_name = "comprehend_lambda"
description = "Lambda function to detect PII in S3 objects"
runtime = "python3.12"
handler = "index.lambda_handler"
# Attach the custom S3/KMS/Comprehend policy document defined below.
attach_policy_json = true
policy_json = data.aws_iam_policy_document.comprehend_lambda_policy.json
create_current_version_allowed_triggers = false
source_path = "${path.module}/src/comprehend_detect_lambda" # Update based on your own requirements.
# The handler reads OUTPUT_BUCKET to decide where redacted files go.
environment_variables = {
OUTPUT_BUCKET = module.pii_bucket.s3_bucket_id
}
# Grants s3.amazonaws.com permission to invoke this function for events
# originating from the PII bucket.
allowed_triggers = {
S3 = {
principal = "s3.amazonaws.com"
source_arn = module.pii_bucket.s3_bucket_arn
}
}
tags = local.tags
}
# IAM policy for the redaction Lambda: read/write objects in the PII
# bucket, decrypt/encrypt via KMS, and call Comprehend PII detection.
data "aws_iam_policy_document" "comprehend_lambda_policy" {
  # s3:GetObject / s3:PutObject are object-level actions; only the object
  # ARN pattern is needed. (The bare bucket ARN grants nothing for these
  # actions, so including it only widens the statement needlessly.)
  statement {
    effect = "Allow"
    actions = [
      "s3:GetObject",
      "s3:PutObject"
    ]
    resources = [
      "${module.pii_bucket.s3_bucket_arn}/*"
    ]
  }
  statement {
    effect    = "Allow"
    actions   = ["kms:*"] # In a real-world application, allow only necessary key actions.
    resources = ["*"]     # In a real-world application, specify the KMS key ARN.
  }
  # comprehend:DetectPiiEntities does not operate on specific resources,
  # so "*" is required here.
  statement {
    effect = "Allow"
    actions = [
      "comprehend:DetectPiiEntities"
    ]
    resources = ["*"]
  }
}
# Expose the Lambda execution role ARN (useful for auditing or attaching
# additional permissions outside this configuration).
output "comprehend_lambda_role" {
value = module.comprehend_lambda.lambda_role_arn
}
# Sample PII Generation
import csv
import os
from faker import Faker
from faker.providers import bank,address
# US-English Faker instance; the bank and address providers are added
# explicitly for bban() and address() below.
fake = Faker('en_US')
fake.add_provider(bank)
fake.add_provider(address)
# Directory to save CSV files (created here if it does not already exist)
output_dir = './pii_data/'
os.makedirs(output_dir, exist_ok=True)
# Number of CSV files to generate
num_files = 100
# Number of records per CSV file
num_records = 100
def generate_pii_data():
    """Return one fake PII record as a dict keyed by CSV column name.

    See the Faker documentation for other providers to fit your use case.
    """
    record = dict(
        name=fake.name(),
        ssn=fake.ssn(),
        bank_account=fake.bban(),
        birthday=fake.date_of_birth().strftime('%Y-%m-%d'),
        address=fake.address(),
    )
    return record
# Write num_files CSV files, each containing num_records fake PII rows.
for file_index in range(1, num_files + 1):
    file_path = os.path.join(output_dir, f'pii_data_{file_index}.csv')
    with open(file_path, mode='w', newline='') as handle:
        writer = csv.DictWriter(handle, fieldnames=['name', 'ssn', 'bank_account', 'birthday', 'address'])
        writer.writeheader()
        # writerows consumes the generator lazily, one record at a time.
        writer.writerows(generate_pii_data() for _ in range(num_records))
print(f'{num_files} CSV files with fake PII data have been generated in {output_dir}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment