Skip to content

Instantly share code, notes, and snippets.

@geoHeil
Last active September 12, 2023 16:05
Show Gist options
  • Save geoHeil/ffa54ed441590f62b4c457ca64b4fc7b to your computer and use it in GitHub Desktop.
pulumi databricks
"""An AWS Python Pulumi program to set up all the infrastructure which is required
- Databricks
- S3 buckets
We use the default Databricks managed VPC
"""
import json
from pathlib import Path
import pulumi
from jinja2 import Environment, FileSystemLoader
from pulumi_aws_native import iam, s3
from pulumi_databricks import MwsCredentials, MwsStorageConfigurations, MwsWorkspaces
# --- Stack configuration -----------------------------------------------------
# Provider-level config ("aws-native") plus this project's own config namespace.
aws_config = pulumi.Config("aws-native")
config = pulumi.Config()
# Name prefix shared by all resources, and the deployment environment name.
prefix = config.require("prefix")
our_env = config.require("environment")
# Two S3 buckets: one for results, one as the Databricks workspace root bucket.
bucket_results = s3.Bucket(f"{prefix}-results")
bucket_databricks_root = s3.Bucket(f"{prefix}-root")
# Databricks E2 account id, and the AWS account id owned by Databricks that the
# cross-account role must trust.
db_account_id = config.require("db_account_id")
databricks_aws_account_id = config.require("databricks_aws_account_id")
region = aws_config.require("region")

# --- IAM policy documents rendered from Jinja2 templates in ./files ----------
tpl_path = Path("files")
env = Environment(loader=FileSystemLoader(tpl_path))
# Trust (assume-role) policy for the cross-account role; presumably wires the
# Databricks AWS account as principal and the account id as ExternalId — the
# template itself is not shown here, verify against files/iam-db-cross-account-role.json.tpl.
cross_account_role_tpl = env.get_template("iam-db-cross-account-role.json.tpl")
variables = {
    "db_official_databricks_aws_account": databricks_aws_account_id,
    "db_account_id": db_account_id,
}
cross_account_role_tpl_r = cross_account_role_tpl.render(**variables)
# Default Databricks deployment policy (static JSON — rendered with no
# variables), parsed into a dict for use as a policy document.
access_policy = json.loads(
    env.get_template("databricks_deployment_policy.json").render()
)
# Cross-account role that Databricks assumes to manage workspace infrastructure.
cross_account_role = iam.Role(
    f"{prefix}-cross-account-role",
    assume_role_policy_document=cross_account_role_tpl_r,
)
# Attach the deployment policy to the role as an inline policy.
databricks_default_policy = iam.RolePolicy(
    'databricks-deployment-p',
    role_name=cross_account_role.role_name,
    policy_document=access_policy,
    policy_name="databricks-deployment-p"
)
def add_s3_policy(role_arn, bucket_name):
    """Render the templated S3 access policy for the workspace root bucket.

    Parameters
    ----------
    role_arn : str
        Resolved ARN of the cross-account role, interpolated into the
        template's ``{{ role_arn }}`` variable (ignored if the template no
        longer uses it).
    bucket_name : str
        Resolved name of the Databricks root bucket.

    Returns the rendered policy document as a ``dict``. Returning the
    *document* (instead of creating the ``iam.RolePolicy`` in here) lets the
    resource be declared exactly once at the top level: creating Pulumi
    resources inside ``.apply()`` hides them from ``pulumi preview`` and was
    the cause of the "role policy databricks-s3-access cannot be found" error.
    """
    template_vars = {
        "role_arn": role_arn,
        "bucket_name": bucket_name,
    }
    rendered = env.get_template("databricks_s3_permissions.json.tpl").render(
        **template_vars
    )
    return json.loads(rendered)


# Pulumi resource inputs accept Outputs directly: render the document lazily
# once both values are known, and declare the RolePolicy at the top level —
# never inside .apply().
# NOTE(review): the original passed cross_account_role.role_name where the
# template expects an ARN; .arn is presumably the intended value — confirm
# against files/databricks_s3_permissions.json.tpl.
s3_policy_document = pulumi.Output.all(
    cross_account_role.arn, bucket_databricks_root.bucket_name
).apply(lambda args: add_s3_policy(args[0], args[1]))
s3_applied_policy = iam.RolePolicy(
    'databricks-s3-access',
    role_name=cross_account_role.role_name,
    policy_document=s3_policy_document,
    policy_name="databricks-s3-access",
)
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "s3:ListBucket",
                "s3:GetObject",
                "s3:PutObject",
                "s3:DeleteObject",
                "s3:PutBucketOwnerControl"
            ],
            "Resource": [
                "arn:aws:s3:::{{ bucket_name }}",
                "arn:aws:s3:::{{ bucket_name }}/*"
            ]
        }
    ]
}
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "Stmt1403287045000",
"Effect": "Allow",
"Action": [
"ec2:AllocateAddress",
"ec2:AssignPrivateIpAddresses",
"ec2:AssociateDhcpOptions",
"ec2:AssociateIamInstanceProfile",
"ec2:AssociateRouteTable",
"ec2:AttachInternetGateway",
"ec2:AttachVolume",
"ec2:AuthorizeSecurityGroupEgress",
"ec2:AuthorizeSecurityGroupIngress",
"ec2:CancelSpotInstanceRequests",
"ec2:CreateDhcpOptions",
"ec2:CreateFleet",
"ec2:CreateInternetGateway",
"ec2:CreateLaunchTemplate",
"ec2:CreateLaunchTemplateVersion",
"ec2:CreateNatGateway",
"ec2:CreateRoute",
"ec2:CreateRouteTable",
"ec2:CreateSecurityGroup",
"ec2:CreateSubnet",
"ec2:CreateTags",
"ec2:CreateVolume",
"ec2:CreateVpc",
"ec2:CreateVpcEndpoint",
"ec2:DeleteDhcpOptions",
"ec2:DeleteFleets",
"ec2:DeleteInternetGateway",
"ec2:DeleteLaunchTemplate",
"ec2:DeleteLaunchTemplateVersions",
"ec2:DeleteNatGateway",
"ec2:DeleteRoute",
"ec2:DeleteRouteTable",
"ec2:DeleteSecurityGroup",
"ec2:DeleteSubnet",
"ec2:DeleteTags",
"ec2:DeleteVolume",
"ec2:DeleteVpc",
"ec2:DeleteVpcEndpoints",
"ec2:DescribeAvailabilityZones",
"ec2:DescribeFleetHistory",
"ec2:DescribeFleetInstances",
"ec2:DescribeFleets",
"ec2:DescribeIamInstanceProfileAssociations",
"ec2:DescribeInstanceStatus",
"ec2:DescribeInstances",
"ec2:DescribeInternetGateways",
"ec2:DescribeLaunchTemplates",
"ec2:DescribeLaunchTemplateVersions",
"ec2:DescribeNatGateways",
"ec2:DescribePrefixLists",
"ec2:DescribeReservedInstancesOfferings",
"ec2:DescribeRouteTables",
"ec2:DescribeSecurityGroups",
"ec2:DescribeSpotInstanceRequests",
"ec2:DescribeSpotPriceHistory",
"ec2:DescribeSubnets",
"ec2:DescribeVolumes",
"ec2:DescribeVpcs",
"ec2:DetachInternetGateway",
"ec2:DisassociateIamInstanceProfile",
"ec2:DisassociateRouteTable",
"ec2:GetLaunchTemplateData",
"ec2:GetSpotPlacementScores",
"ec2:ModifyFleet",
"ec2:ModifyLaunchTemplate",
"ec2:ModifyVpcAttribute",
"ec2:ReleaseAddress",
"ec2:ReplaceIamInstanceProfileAssociation",
"ec2:RequestSpotInstances",
"ec2:RevokeSecurityGroupEgress",
"ec2:RevokeSecurityGroupIngress",
"ec2:RunInstances",
"ec2:TerminateInstances"
],
"Resource": [
"*"
]
},
{
"Effect": "Allow",
"Action": [
"iam:CreateServiceLinkedRole",
"iam:PutRolePolicy"
],
"Resource": "arn:aws:iam::*:role/aws-service-role/spot.amazonaws.com/AWSServiceRoleForEC2Spot",
"Condition": {
"StringLike": {
"iam:AWSServiceName": "spot.amazonaws.com"
}
}
}
]
}
@geoHeil
Copy link
Author

geoHeil commented Sep 12, 2023

The file `files/databricks_deployment_policy.json` is sourced directly from the Databricks documentation (https://docs.databricks.com/en/administration-guide/account-settings-e2/credentials.html#option-1-default-deployment-policy) — apparently their default deployment policy for the SaaS Databricks-managed VPC.

It does not contain any S3 references.

The file `files/databricks_s3_permissions.json.tpl` contains the reference to a specific bucket (which is created by Pulumi), but I do not know how to properly feed the existing bucket in via `apply`. Currently, the script is failing with:

The role policy with the name databricks-s3-access cannot be found.

as I am somehow mis-attaching the policies during the `apply` chain.

@geoHeil
Copy link
Author

geoHeil commented Sep 12, 2023

I was able to feed:

{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "s3:ListBucket",
                "s3:GetObject",
                "s3:PutObject",
                "s3:DeleteObject",
                "s3:PutBucketOwnerControl"
            ],
            "Resource": [
                "arn:aws:s3:::{{ bucket_name }}",
                "arn:aws:s3:::{{ bucket_name }}/*"
            ]
        }
    ]
}

I still am very curious to learn how it works in a better way.

However, ideally, I can also figure out how:

cannot create mws workspaces: MALFORMED_REQUEST: Failed storage configuration validation checks: List,Put,PutWithBucketOwnerFullControl,Delete

is fixed. I had hoped that feeding the policy would solve this as well.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment