import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglueml.transforms import EntityDetector
from pyspark.sql.types import MapType, StringType, StructType, StructField
from awsglue.dynamicframe import DynamicFrame
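
# Standard Glue job setup: resolve the JOB_NAME argument and initialize the
# Spark and Glue contexts and the job.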
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Script generated for node Amazon S3
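# Read the raw patient CSV files (comma-separated, with a header row) from S3
# into a DynamicFrame.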
AmazonS3_node1662315984254 = glueContext.create_dynamic_frame.from_options(
    format_options={"quoteChar": '"', "withHeader": True, "separator": ","},
    connection_type="s3",
    format="csv",
    connection_options={
        "paths": ["s3://learn-share-repeat-us-west-2/pii-glue/patients/raw_data/"],
        "recurse": True,
    },
    transformation_ctx="AmazonS3_node1662315984254",
)
# Script generated for node Detect PII
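# classify_columns samples the source data and checks each column against the
# entity types listed below, returning a map of column name -> detected entity
# types. The two trailing numeric arguments (0.3 and 0.2 here) control how much
# data is sampled and how strict detection is; see the EntityDetector
# documentation for their exact semantics.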
entity_detector = EntityDetector()
classified_map = entity_detector.classify_columns(
    AmazonS3_node1662315984254,
    [
        "PERSON_NAME",
        "EMAIL",
        "CREDIT_CARD",
        "IP_ADDRESS",
        "MAC_ADDRESS",
        "PHONE_NUMBER",
        "USA_PASSPORT_NUMBER",
        "USA_SSN",
        "USA_ITIN",
        "BANK_ACCOUNT",
        "USA_DRIVING_LICENSE",
        "USA_HCPCS_CODE",
        "USA_NATIONAL_DRUG_CODE",
        "USA_NATIONAL_PROVIDER_IDENTIFIER",
        "USA_DEA_NUMBER",
        "USA_HEALTH_INSURANCE_CLAIM_NUMBER",
        "USA_MEDICARE_BENEFICIARY_IDENTIFIER",
    ],
    0.3,
    0.2,
)
items = classified_map.items()
schema = StructType(
    [
        StructField("columnName", StringType(), True),
        StructField(
            "entityTypes", StructType([StructField("entityType", StringType(), True)])
        ),
    ]
)
data_frame = spark.createDataFrame(data=items, schema=schema)
DetectPII_node1662317206797 = DynamicFrame.fromDF(data_frame, glueContext, "df_for_pii")
# Script generated for node Amazon S3
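# Write the detection results to S3 as snappy-compressed Parquet.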
AmazonS3_node1662317241498 = glueContext.write_dynamic_frame.from_options(
    frame=DetectPII_node1662317206797,
    connection_type="s3",
    format="glueparquet",
    connection_options={
        "path": "s3://learn-share-repeat-us-west-2/pii-glue/patients/glue-pii/glue_find_column_sensitive_data_output_detection/",
        "partitionKeys": [],
    },
    format_options={"compression": "snappy"},
    transformation_ctx="AmazonS3_node1662317241498",
)
job.commit()
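
# ---------------------------------------------------------------------------
# Optional follow-up (not part of the generated job): a minimal sketch of how
# the detection output written above could be inspected with plain Spark. The
# path is the same output prefix used in the sink; run this in a separate
# session or notebook after the job finishes.
#
# results = spark.read.parquet(
#     "s3://learn-share-repeat-us-west-2/pii-glue/patients/glue-pii/"
#     "glue_find_column_sensitive_data_output_detection/"
# )
# results.show(truncate=False)
# ---------------------------------------------------------------------------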