Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anand086/411e83f2f78aa2514cb7a817ca0ce30d to your computer and use it in GitHub Desktop.
Save anand086/411e83f2f78aa2514cb7a817ca0ce30d to your computer and use it in GitHub Desktop.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglueml.transforms import EntityDetector
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import *
# --- Job bootstrap ---
# Resolve the command-line arguments handed to the job; only JOB_NAME is
# requested here.
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
# getOrCreate() reuses an already-running SparkContext and only creates one
# when none exists. A bare SparkContext() raises if a context is already
# active (e.g. notebooks or interactive sessions).
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Initialise the job with its name and the resolved arguments; the matching
# job.commit() is the last statement of the script.
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Script generated for node Amazon S3
# Source node: load the CSV objects under the raw patient-data prefix into a
# DynamicFrame. The option dictionaries are named so the reader call stays short.
_source_format_options = {
    "quoteChar": '"',
    "withHeader": True,
    "separator": ",",
    "optimizePerformance": False,
}
_source_connection_options = {
    "paths": ["s3://learn-share-repeat-us-west-2/pii-glue/patients/raw_data/"],
    "recurse": True,
}
AmazonS3_node1662315984254 = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    format="csv",
    format_options=_source_format_options,
    connection_options=_source_connection_options,
    transformation_ctx="AmazonS3_node1662315984254",
)
# Script generated for node Detect PII
# Entity types the detector is asked to look for in the source frame.
_pii_entity_types = [
    "PERSON_NAME",
    "EMAIL",
    "CREDIT_CARD",
    "IP_ADDRESS",
    "MAC_ADDRESS",
    "PHONE_NUMBER",
    "USA_PASSPORT_NUMBER",
    "USA_SSN",
    "USA_ITIN",
    "BANK_ACCOUNT",
    "USA_DRIVING_LICENSE",
    "USA_HCPCS_CODE",
    "USA_NATIONAL_DRUG_CODE",
    "USA_NATIONAL_PROVIDER_IDENTIFIER",
    "USA_DEA_NUMBER",
    "USA_HEALTH_INSURANCE_CLAIM_NUMBER",
    "USA_MEDICARE_BENEFICIARY_IDENTIFIER",
]
# NOTE(review): the two trailing positional arguments are presumably the
# sample fraction and the threshold fraction, in that order - confirm against
# the AWS Glue EntityDetector documentation before tuning them.
_sample_fraction = 0.3
_threshold_fraction = 0.2

entity_detector = EntityDetector()
# The keys of the returned mapping are later used as the column names to mask.
classified_map = entity_detector.classify_columns(
    AmazonS3_node1662315984254,
    _pii_entity_types,
    _sample_fraction,
    _threshold_fraction,
)
def maskDf(df, keys, mask_value="**************"):
    """Return ``df`` with every column named in ``keys`` overwritten by a mask.

    Relies on the module-level ``glueContext`` to wrap the result back into a
    DynamicFrame.

    Args:
        df: Glue DynamicFrame to redact.
        keys: Column names to mask. When empty or ``None`` nothing is masked.
        mask_value: Literal written into every row of each masked column.
            Defaults to the fourteen-asterisk string the script has always
            used, so existing two-argument callers are unaffected.

    Returns:
        The very same ``df`` object when there is nothing to mask; otherwise a
        new DynamicFrame named ``"updated_masked_df"`` built from the masked
        Spark DataFrame.
    """
    # Nothing flagged: skip the DynamicFrame -> DataFrame round trip entirely.
    if not keys:
        return df
    masked = df.toDF()
    for key in keys:
        # withColumn replaces an existing column of the same name, so the
        # original values are discarded rather than kept alongside the mask.
        masked = masked.withColumn(key, lit(mask_value))
    return DynamicFrame.fromDF(masked, glueContext, "updated_masked_df")
# Mask every column the detector flagged: the keys of classified_map are
# passed to maskDf as the column names to overwrite.
_flagged_columns = list(classified_map.keys())
DetectPII_node1662317206797 = maskDf(AmazonS3_node1662315984254, _flagged_columns)
# Script generated for node Amazon S3
# Sink node: write the masked frame to the redacted-output prefix in
# "glueparquet" format with snappy compression and no partition keys.
_sink_connection_options = {
    "path": "s3://learn-share-repeat-us-west-2/pii-glue/patients/glue-pii/glue_find_column_sensitive_data_redacted/",
    "partitionKeys": [],
}
_sink_format_options = {"compression": "snappy"}
AmazonS3_node1662317241498 = glueContext.write_dynamic_frame.from_options(
    frame=DetectPII_node1662317206797,
    connection_type="s3",
    format="glueparquet",
    connection_options=_sink_connection_options,
    format_options=_sink_format_options,
    transformation_ctx="AmazonS3_node1662317241498",
)
# Close out the job that job.init() opened at the top of the script.
job.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment