Sensitive Data Lake Blog
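This AWS Glue PySpark job reads a card table from the Glue Data Catalog, masks each card_number down to its last four digits with a Spark UDF, and writes the cleaned records back to S3 as CSV.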
import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
glueContext = GlueContext(SparkContext.getOrCreate())
# Data Catalog: database and table name
db_name = "db1"
tbl_name = "cards"
# S3 location for output
# Swap "<yourbucket>" with the name of your bucket
output_dir = "s3://<yourbucket>/clean_cards"
# Read data into a DynamicFrame using the Data Catalog metadata
cards_dyf = glueContext.create_dynamic_frame.from_catalog(database = db_name, table_name = tbl_name)
cards_dataframe = cards_dyf.toDF()
# UDF: normalize spacing by stripping whitespace, then keep only the
# last four digits of the card number; nulls pass through unchanged.
@udf(returnType=StringType())
def last_four(card):
    if card is None:
        return None
    card = card.replace(" ", "")
    return card[-4:]
cards_dataframe = cards_dataframe.withColumn("card_number", last_four(cards_dataframe["card_number"]))
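# Quick sanity check of the masking logic in plain Python (no Spark needed).
# The card number below is fabricated sample data, not a real card.
assert "4111 1111 1111 1111".replace(" ", "")[-4:] == "1111"
# Convert the cleaned DataFrame back to a DynamicFrame so Glue can write it out.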
clean_tmp_dyf = DynamicFrame.fromDF(cards_dataframe, glueContext, "clean")
glueContext.write_dynamic_frame.from_options(
    frame = clean_tmp_dyf,
    connection_type = "s3",
    connection_options = {"path": output_dir},
    format = "csv")
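# Optional spot check: read the cleaned CSV back from S3 and show a few rows.
# This verification step is illustrative only; glueContext.spark_session is the
# SparkSession attached to the Glue job.
spark = glueContext.spark_session
spark.read.csv(output_dir).show(5)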