Skip to content

Instantly share code, notes, and snippets.

@PauloMigAlmeida
Created March 25, 2024 03:26
Show Gist options
  • Save PauloMigAlmeida/5cebf3efcd0f105d73646a6a9e8cc2f3 to your computer and use it in GitHub Desktop.
Save PauloMigAlmeida/5cebf3efcd0f105d73646a6a9e8cc2f3 to your computer and use it in GitHub Desktop.
EMR Serverless Terraform provisioning
# Assuming that you have AWS cli installed and configured.
# To run:
# terraform init
# terraform plan -out terraform.tfplan
# terraform apply terraform.tfplan
# AWS provider configuration. Credentials are taken from the standard AWS
# CLI/SDK chain (environment variables, shared credentials file, or profile),
# as noted in the header comment above.
provider "aws" {
region = "ap-southeast-2"
}
# Caller identity (account id) and active region, used below to build the
# CloudWatch Logs ARN without hard-coding account or region values.
data "aws_caller_identity" "current" {}
data "aws_region" "current" {}
# EMR Serverless application running Spark jobs.
# NOTE: arm64 (Graviton) is only supported on EMR release 6.8.0 and later,
# so the original pairing of architecture = "ARM64" with emr-6.6.0 is
# invalid; the release label is bumped to the minimum arm64-capable release.
resource "aws_emrserverless_application" "cluster" {
  name          = "demo-cluster"
  architecture  = "ARM64"
  release_label = "emr-6.8.0"
  type          = "spark"
}
# Log group that job runs write to; its name is referenced verbatim in the
# start-job-run cloudWatchLoggingConfiguration shown at the bottom of this
# gist. Short retention keeps demo costs low.
resource "aws_cloudwatch_log_group" "emr" {
name = "/aws/emr-serverless/demo-cluster-logs"
retention_in_days = 7
}
# Job-execution role assumed by the EMR Serverless service when it runs jobs
# (passed via --execution-role-arn). The aws:SourceAccount condition guards
# against the cross-service "confused deputy" problem: only EMR Serverless
# activity originating from this account may assume the role.
resource "aws_iam_role" "emr_exec_role" {
  name = "emr-exec-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "emr-serverless.amazonaws.com"
        }
        Condition = {
          StringEquals = {
            "aws:SourceAccount" = data.aws_caller_identity.current.account_id
          }
        }
      },
    ]
  })
}
# Inline policy granting the execution role access to the data/scripts
# buckets and to the CloudWatch log group used for job logs. Statements are
# split by resource level: S3 bucket-level actions (ListBucket) apply to the
# bucket ARN, object-level actions apply only to keys under it — the
# original mixed both, which granted nothing extra but obscured intent.
resource "aws_iam_role_policy" "emr_exec_role" {
  name = "emr-exec-policy"
  role = aws_iam_role.emr_exec_role.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        # Listing requires the bucket ARN itself, not the /* object ARN.
        Sid    = "ListBuckets"
        Action = ["s3:ListBucket"]
        Effect = "Allow"
        Resource = [
          aws_s3_bucket.data.arn,
          aws_s3_bucket.scripts.arn,
        ]
      },
      {
        # Object-level read/write on keys inside both buckets.
        Sid = "ReadWriteObjects"
        Action = [
          "s3:PutObject",
          "s3:GetObject",
          "s3:DeleteObject",
        ]
        Effect = "Allow"
        Resource = [
          "${aws_s3_bucket.data.arn}/*",
          "${aws_s3_bucket.scripts.arn}/*",
        ]
      },
      {
        # DescribeLogGroups cannot be scoped to a single log group.
        Sid    = "DescribeLogGroups"
        Action = ["logs:DescribeLogGroups"]
        Effect = "Allow"
        Resource = [
          "arn:aws:logs:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:*"
        ]
      },
      {
        # Write access limited to streams of the dedicated EMR log group.
        Sid = "WriteJobLogs"
        Action = [
          "logs:PutLogEvents",
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:DescribeLogStreams",
        ]
        Effect = "Allow"
        Resource = [
          "${aws_cloudwatch_log_group.emr.arn}:*",
        ]
      }
    ]
  })
}
# Bucket holding the data the Spark job reads/writes.
# NOTE(review): S3 bucket names are globally unique — this hard-coded name
# may already be taken by another account; confirm/rename before applying.
# force_destroy lets `terraform destroy` delete a non-empty bucket.
resource "aws_s3_bucket" "data" {
bucket = "kyuubi-issue-4458-data"
force_destroy = true
}
# Bucket holding the job scripts referenced by start-job-run's entryPoint.
# NOTE(review): globally-unique name — same collision caveat as the data
# bucket; confirm/rename before applying.
resource "aws_s3_bucket" "scripts" {
bucket = "kyuubi-issue-4458-scripts"
force_destroy = true
}
# Surfaced so the id can be copied into the start-job-run command below.
output "emr_cluster_application_id" {
  description = "EMR Serverless application id (pass to `aws emr-serverless start-job-run --application-id`)"
  value       = aws_emrserverless_application.cluster.id
}
Assumptions:
* Your scripts (manually generated or not) will be uploaded to "s3://kyuubi-issue-4458-scripts/my_cool_etl_script.py"
* Replace the angle-bracket placeholders in the command below (e.g. <youraccountid>, <content_from_emr_cluster_application_id_output>) with your own values.
* You can add data to be manipulated to bucket "kyuubi-issue-4458-data"
To submit the job to EMR Serverless, run:
$ aws emr-serverless start-job-run \
--application-id <content_from_emr_cluster_application_id_output> \
--execution-role-arn arn:aws:iam::<youraccountid>:role/emr-exec-role \
--job-driver 'sparkSubmit={entryPoint=s3://kyuubi-issue-4458-scripts/my_cool_etl_script.py}' \
--configuration-overrides '{"monitoringConfiguration": {"managedPersistenceMonitoringConfiguration": {"enabled": true}, "cloudWatchLoggingConfiguration": {"enabled": true, "logGroupName": "/aws/emr-serverless/demo-cluster-logs"}}}'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment