Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anand086/219e75e688e184827936ab8ad09059b7 to your computer and use it in GitHub Desktop.
Save anand086/219e75e688e184827936ab8ad09059b7 to your computer and use it in GitHub Desktop.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglueml.transforms import EntityDetector
# Job bootstrap: resolve the job arguments and build the Spark/Glue contexts.
# Only the JOB_NAME argument is requested from the command line.
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
# getOrCreate() reuses an already-active SparkContext instead of raising
# "Cannot run multiple SparkContexts at once" (e.g. when the script is run in
# a process that has already started Spark); on a fresh run it simply creates
# a new context, exactly like SparkContext() did.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
# Start the job under the resolved name; job.commit() is called at the end
# of the script.
job.init(args["JOB_NAME"], args)
# Script generated for node Amazon S3 (source)
# Reads the CSV input under the raw_data/ prefix into a DynamicFrame.
# CSV parsing options: double-quote quoting, header row present, comma separator.
raw_data_csv_options = {"quoteChar": '"', "withHeader": True, "separator": ","}
# S3 location to read from; "recurse" is enabled for this prefix.
raw_data_s3_options = {
    "paths": ["s3://learn-share-repeat-us-west-2/pii-glue/patients/raw_data/"],
    "recurse": True,
}
AmazonS3_node1662315984254 = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options=raw_data_s3_options,
    format="csv",
    format_options=raw_data_csv_options,
    transformation_ctx="AmazonS3_node1662315984254",
)
# Script generated for node Detect PII
# Entity types handed to the detector for the frame read from S3.
pii_entity_types = [
    "PERSON_NAME",
    "EMAIL",
    "CREDIT_CARD",
    "IP_ADDRESS",
    "MAC_ADDRESS",
    "PHONE_NUMBER",
    "USA_PASSPORT_NUMBER",
    "USA_SSN",
    "USA_ITIN",
    "BANK_ACCOUNT",
    "USA_DRIVING_LICENSE",
    "USA_HCPCS_CODE",
    "USA_NATIONAL_DRUG_CODE",
    "USA_NATIONAL_PROVIDER_IDENTIFIER",
    "USA_DEA_NUMBER",
    "USA_HEALTH_INSURANCE_CLAIM_NUMBER",
    "USA_MEDICARE_BENEFICIARY_IDENTIFIER",
]
# NOTE(review): presumably the name of the column that receives the
# detection results - confirm against the EntityDetector documentation.
detected_entities_column = "DetectedEntities"

entity_detector = EntityDetector()
DetectPII_node1662317206797 = entity_detector.detect(
    AmazonS3_node1662315984254,
    pii_entity_types,
    detected_entities_column,
)
# Script generated for node Amazon S3 (target)
# Writes the Detect PII output frame to S3 in "glueparquet" format with
# snappy compression; no partition keys are used.
pii_output_s3_options = {
    "path": "s3://learn-share-repeat-us-west-2/pii-glue/patients/glue-pii/glue_find_sensitive_data_each_row/",
    "partitionKeys": [],
}
AmazonS3_node1662317241498 = glueContext.write_dynamic_frame.from_options(
    frame=DetectPII_node1662317206797,
    connection_type="s3",
    connection_options=pii_output_s3_options,
    format="glueparquet",
    format_options={"compression": "snappy"},
    transformation_ctx="AmazonS3_node1662317241498",
)
# Close out the job that was started with job.init(...).
job.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment