-
-
Save pdavis20008/796d66f8221ec9b7e729276fc7d5dd0a to your computer and use it in GitHub Desktop.
Detecting and Protecting PII in AWS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # If you use this example, update source_path in the Lambda module configuration to point at the directory where you store this file. | |
| import boto3 | |
| import os | |
| s3 = boto3.client('s3') | |
| comprehend = boto3.client('comprehend') | |
def lambda_handler(event, context):
    """Redact PII from a newly uploaded S3 text object and store the result.

    Reads the object named by the first record of the S3 event, asks Amazon
    Comprehend for PII entities, replaces every detected entity except those
    of type ``NAME`` with ``[REDACTED]``, and writes the redacted text to
    ``redacted/<file name>`` in the bucket named by the ``OUTPUT_BUCKET``
    environment variable.

    Args:
        event: S3 event notification payload; only ``Records[0]`` is read.
        context: Lambda context object (unused).

    Returns:
        dict with ``statusCode`` 200 and a ``body`` naming the output object.

    Raises:
        KeyError: if ``OUTPUT_BUCKET`` is unset or the event lacks the
            expected S3 record fields.
        UnicodeDecodeError: if the object body is not valid UTF-8.
        Errors raised by the S3 / Comprehend clients are not caught here.
    """
    # Function-scope import: the only stdlib helper needed beyond `os`.
    from urllib.parse import unquote_plus

    output_bucket = os.environ['OUTPUT_BUCKET']

    # Get input file details from S3 event
    s3_record = event['Records'][0]['s3']
    input_bucket = s3_record['bucket']['name']
    # S3 event notifications URL-encode the object key (a space arrives as
    # '+'), so decode it before using it as a real key.
    input_key = unquote_plus(s3_record['object']['key'])

    # Read input file from S3
    s3_object = s3.get_object(Bucket=input_bucket, Key=input_key)
    text = s3_object['Body'].read().decode('utf-8')

    # Detect PII using Comprehend. The API rejects an empty Text value, so an
    # empty upload is passed through with nothing to redact.
    # NOTE(review): DetectPiiEntities also caps the input size; larger files
    # would need to be split into chunks -- confirm the limit in the API docs.
    if text:
        detected = comprehend.detect_pii_entities(Text=text, LanguageCode='en')
        entities = detected['Entities']
    else:
        entities = []

    # Redaction: walk entities from the end of the text backwards so that
    # replacing one span does not shift the offsets of spans still to come.
    redacted_text = text
    for entity in sorted(entities, key=lambda e: e['BeginOffset'], reverse=True):
        if entity['Type'] == 'NAME':
            # Names are deliberately left readable.
            continue
        start = entity['BeginOffset']
        end = entity['EndOffset']
        redacted_text = redacted_text[:start] + '[REDACTED]' + redacted_text[end:]

    # Save redacted file to output folder (file name only, prefix dropped)
    output_file = f"redacted/{input_key.split('/')[-1]}"
    s3.put_object(
        Bucket=output_bucket,
        Key=output_file,
        # Encode explicitly so the stored bytes match the UTF-8 read above.
        Body=redacted_text.encode('utf-8'),
        ContentType='text/csv',
    )

    return {
        'statusCode': 200,
        'body': f'Redacted file saved at s3://{output_bucket}/{output_file}'
    }
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Provider requirements: the AWS provider from the HashiCorp registry, 4.0.0 or newer.
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 4.0.0"
    }
  }
}

# Everything in this example is created in us-east-1.
provider "aws" {
  region = "us-east-1"
}

# Tags applied to the modules through local.tags.
locals {
  tags = {
    Purpose = "patrick-data-blog"
  }
}

# Looks up the account / identity of the credentials Terraform is running with.
data "aws_caller_identity" "current" {}
# In a real-world application, apply least-privilege permissions with a bucket policy, and preferably store the
# raw and processed data in separate buckets.
module "pii_bucket" {
  source  = "terraform-aws-modules/s3-bucket/aws"
  version = "4.3.0"

  # Replace the placeholder with your desired bucket name. Must be globally unique.
  bucket = "<bucket name>"

  # Demo convenience: allows the bucket to be destroyed even while it still contains objects.
  force_destroy = true

  # Have the module attach its policy that denies requests made over insecure transport.
  attach_deny_insecure_transport_policy = true

  versioning = {
    enabled    = true
    mfa_delete = false
  }

  server_side_encryption_configuration = {
    rule = {
      apply_server_side_encryption_by_default = {
        # In a real-world application, use a KMS Customer-Managed Key with least-privilege permissions.
        sse_algorithm = "AES256"
      }
    }
  }

  tags = local.tags
}
# Invoke the redaction Lambda for every object created under the unredacted/ prefix.
resource "aws_s3_bucket_notification" "pii_bucket" {
  bucket = module.pii_bucket.s3_bucket_id

  lambda_function {
    lambda_function_arn = module.comprehend_lambda.lambda_function_arn
    filter_prefix       = "unredacted/"
    events              = ["s3:ObjectCreated:*"]
  }

  # Create the Lambda module (function plus the S3 trigger declared in its
  # allowed_triggers) before wiring up the notification.
  depends_on = [module.comprehend_lambda]
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Lambda function that reads an uploaded object, redacts PII found by Comprehend, and writes the result back to S3.
module "comprehend_lambda" {
  source  = "terraform-aws-modules/lambda/aws"
  version = "7.20.0"

  function_name = "comprehend_lambda"
  description   = "Lambda function to detect PII in S3 objects"
  runtime       = "python3.12"
  handler       = "index.lambda_handler" # expects index.py exposing lambda_handler in source_path
  source_path   = "${path.module}/src/comprehend_detect_lambda" # Update based on your own requirements.

  # NOTE(review): no timeout is set, so the module default applies; confirm it is long enough for the
  # S3 read + Comprehend call + S3 write on your largest input.

  # Extra IAM permissions for the function's execution role.
  attach_policy_json = true
  policy_json        = data.aws_iam_policy_document.comprehend_lambda_policy.json

  environment_variables = {
    # The handler writes redacted output to this bucket (same bucket as the input in this example).
    OUTPUT_BUCKET = module.pii_bucket.s3_bucket_id
  }

  # Attach the trigger permission to the unqualified function rather than a published version.
  create_current_version_allowed_triggers = false
  allowed_triggers = {
    S3 = {
      principal  = "s3.amazonaws.com"
      source_arn = module.pii_bucket.s3_bucket_arn
      # Bucket ARNs carry no account ID, so also pin the owning account; otherwise a bucket of the
      # same name re-created in another account could invoke this function.
      source_account = data.aws_caller_identity.current.account_id
    }
  }

  tags = local.tags
}
# Permissions for the redaction Lambda's execution role.
data "aws_iam_policy_document" "comprehend_lambda_policy" {
  # Read raw uploads. s3:GetObject is an object-level action, so it is granted on object ARNs only
  # (granting it on the bucket ARN has no effect) and limited to the prefix that triggers the function.
  statement {
    effect    = "Allow"
    actions   = ["s3:GetObject"]
    resources = ["${module.pii_bucket.s3_bucket_arn}/unredacted/*"]
  }

  # Write redacted copies. Limited to the prefix the handler writes to.
  statement {
    effect    = "Allow"
    actions   = ["s3:PutObject"]
    resources = ["${module.pii_bucket.s3_bucket_arn}/redacted/*"]
  }

  # Only relevant if the bucket is switched from AES256 to SSE-KMS: the key operations S3 needs to
  # decrypt objects on read and encrypt them on write.
  statement {
    effect = "Allow"
    actions = [
      "kms:Decrypt",
      "kms:GenerateDataKey"
    ]
    resources = ["*"] # In a real-world application, specify the KMS key ARN.
  }

  # DetectPiiEntities operates on request text rather than a named resource, hence the wildcard.
  statement {
    effect = "Allow"
    actions = [
      "comprehend:DetectPiiEntities"
    ]
    resources = ["*"]
  }
}
# Exposes the ARN of the IAM role created for the Lambda function.
output "comprehend_lambda_role" {
  value = module.comprehend_lambda.lambda_role_arn
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Sample PII Generation | |
| import csv | |
| import os | |
| from faker import Faker | |
| from faker.providers import bank,address | |
# Where the CSV files go, and how much data to produce.
output_dir = './pii_data/'
num_files = 100    # number of CSV files to generate
num_records = 100  # number of records per CSV file

# Make sure the target directory exists; an existing one is left as is.
os.makedirs(output_dir, exist_ok=True)

# US-English Faker instance with the bank and address providers registered.
fake = Faker('en_US')
fake.add_provider(bank)
fake.add_provider(address)
# See the Faker documentation for other providers to fit your use case.
def generate_pii_data():
    """Return one fake PII record as a dict.

    Keys, in insertion order: ``name``, ``ssn``, ``bank_account``,
    ``birthday`` (formatted ``YYYY-MM-DD``) and ``address``. All values
    come from the module-level ``fake`` generator.
    """
    record = {}
    record['name'] = fake.name()
    record['ssn'] = fake.ssn()
    record['bank_account'] = fake.bban()
    birth_date = fake.date_of_birth()
    record['birthday'] = birth_date.strftime('%Y-%m-%d')
    record['address'] = fake.address()
    return record
# Generate CSV files: num_files files of num_records rows each, named
# pii_data_1.csv .. pii_data_<num_files>.csv inside output_dir.
fieldnames = ['name', 'ssn', 'bank_account', 'birthday', 'address']
for i in range(num_files):
    file_path = os.path.join(output_dir, f'pii_data_{i+1}.csv')
    # newline='' lets the csv module control line endings (addresses contain
    # embedded newlines); encoding is pinned to UTF-8 so the files do not
    # depend on the platform's default locale encoding.
    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(generate_pii_data() for _ in range(num_records))

print(f'{num_files} CSV files with fake PII data have been generated in {output_dir}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment