GitHub Actions for databricks

# .github/actions/databricks-cli-install/action.yaml
name: 'databricks-cli-install'
description: 'Install latest version of the databricks CLI'
runs:
  using: "composite"
  steps:
    # this is the new CLI, distributed as a standalone executable that doesn't need Python: https://docs.databricks.com/dev-tools/cli/databricks-cli.html
    - shell: bash
      run: |
        curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh
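
To fail fast when the download is broken, a verification step could follow the install. This is a sketch, assuming the CLI reports its version via a --version flag:

    - shell: bash
      run: |
        # confirm the CLI is on the PATH and report the installed version (assumed flag)
        databricks --version
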
# .github/actions/databricks-job-run/action.yaml
name: 'databricks-job-run'
description: 'Run a databricks job given its name'
inputs:
  jobName:
    description: 'The name of the databricks job that will be run'
    required: true
  wait:
    description: 'Whether the action should wait for the job to complete or not'
    default: 'false'
  waitTimeout:
    description: 'How long to wait before timing out when waiting for the job to finish'
    default: '20m0s'
runs:
  using: "composite"
  steps:
    # We use the "databricks jobs list" command of the CLI, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html
    # to get the job ID, then the "databricks jobs run-now" command to trigger it
    - shell: bash
      env:
        # Databricks host/token are set as "env" variables when invoking this action
        # NOTE: These take precedence for the CLI authentication: https://docs.databricks.com/dev-tools/cli/databricks-cli.html#auth-eval
        DATABRICKS_HOST: ${{ env.DATABRICKS_HOST }}
        DATABRICKS_TOKEN: ${{ env.DATABRICKS_TOKEN }}
        JOB_NAME: ${{ inputs.jobName }}
        WAIT: ${{ inputs.wait }}
        WAIT_TIMEOUT: ${{ inputs.waitTimeout }}
      run: |
        JOB_ID=$(databricks jobs list | grep -w "$JOB_NAME" | awk '{print $1}')
        if [[ $WAIT == "true" ]]; then
          databricks jobs run-now "$JOB_ID" --timeout "$WAIT_TIMEOUT"
        else
          databricks jobs run-now "$JOB_ID" --no-wait
        fi
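
Note the grep-based lookup matches any line of the "databricks jobs list" output that contains the job name, so an ambiguous name could pick up the wrong ID. If that proves fragile, a JSON-based lookup along these lines could replace it. This is a sketch, assuming an --output json flag and a jobs list schema exposing job_id and settings.name:

    JOB_ID=$(databricks jobs list --output json \
      | jq -r ".[] | select(.settings.name==\"$JOB_NAME\") | .job_id")
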
# .github/actions/databricks-pipeline-full-refresh/action.yaml
name: 'databricks-pipeline-full-refresh'
description: 'Runs a full-refresh of a DLT pipeline given its name'
inputs:
  pipelineName:
    description: 'The name of the databricks DLT pipeline that will be reset, like "my_pipeline"'
    required: true
  wait:
    # NOTE: the databricks CLI command to reset a DLT pipeline does not wait for the pipeline to finish!
    # You can only wait until the pipeline reaches the "running" stage! After that, we would need to implement
    # some active wait by repeatedly invoking a command like "databricks pipelines get-update" and checking its
    # status (see the sketch after this file)
    description: 'Whether the action should wait for the pipeline to get started or not'
    default: 'false'
  waitTimeout:
    description: 'How long to wait before timing out when waiting for the full-refresh to reach the running stage'
    default: '20m0s'
runs:
  using: "composite"
  steps:
    # We use the "databricks pipelines list-pipelines" command of the CLI, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html
    # to get the pipeline ID, then the "databricks pipelines reset" command to reset it
    - shell: bash
      env:
        # Databricks host/token are set as "env" variables when invoking this action
        # NOTE: These take precedence for the CLI authentication: https://docs.databricks.com/dev-tools/cli/databricks-cli.html#auth-eval
        DATABRICKS_HOST: ${{ env.DATABRICKS_HOST }}
        DATABRICKS_TOKEN: ${{ env.DATABRICKS_TOKEN }}
        PIPELINE_NAME: ${{ inputs.pipelineName }}
        WAIT: ${{ inputs.wait }}
        WAIT_TIMEOUT: ${{ inputs.waitTimeout }}
      run: |
        PIPELINE_ID=$(databricks pipelines list-pipelines | jq -r ".[] | select(.name==\"$PIPELINE_NAME\") | .pipeline_id")
        if [[ $WAIT == "true" ]]; then
          databricks pipelines reset "$PIPELINE_ID" --timeout "$WAIT_TIMEOUT"
        else
          databricks pipelines reset "$PIPELINE_ID" --no-wait
        fi
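
The active wait mentioned in the "wait" input could be sketched roughly as below, appended to the run script when $WAIT is true. The update-ID lookup and the state names are assumptions based on the Databricks pipelines API, not verified CLI behavior:

    # fetch the ID of the most recent update (assumes list-updates returns the newest first under .updates)
    UPDATE_ID=$(databricks pipelines list-updates "$PIPELINE_ID" | jq -r '.updates[0].update_id')
    # poll until the update reaches a terminal state (assumed states: COMPLETED, FAILED, CANCELED)
    while true; do
      STATE=$(databricks pipelines get-update "$PIPELINE_ID" "$UPDATE_ID" | jq -r '.update.state')
      case "$STATE" in
        COMPLETED) break ;;
        FAILED|CANCELED) echo "Pipeline update ended in state $STATE" >&2; exit 1 ;;
        *) sleep 60 ;;
      esac
    done
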
# .github/actions/databricks-repo-update/action.yaml
name: 'databricks-repo-update'
description: 'Update a databricks repo given its path'
inputs:
  databricksRepoPath:
    description: 'The path of the databricks repo within the databricks workspace'
    default: '/Repos/main/my-repo'
  branch:
    description: 'The git branch the databricks repo should be updated to'
    default: 'main'
runs:
  using: "composite"
  steps:
    # We use the "databricks repos update" command of the CLI, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html
    # NOTE: If the databricks repo has dirty uncommitted files, the update will likely fail!
    # (in the same way a git pull fails on your machine when it conflicts with modified files)
    - shell: bash
      env:
        # Databricks host/token are set as "env" variables when invoking this action
        # NOTE: These take precedence for the CLI authentication: https://docs.databricks.com/dev-tools/cli/databricks-cli.html#auth-eval
        DATABRICKS_HOST: ${{ env.DATABRICKS_HOST }}
        DATABRICKS_TOKEN: ${{ env.DATABRICKS_TOKEN }}
        REPO_PATH: ${{ inputs.databricksRepoPath }}
        BRANCH: ${{ inputs.branch }}
      run: |
        databricks repos update "$REPO_PATH" --branch "$BRANCH"
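
Since both inputs declare defaults, callers only need a "with" block to override them, for example (the values here are just the placeholder defaults from above):

    - name: Update databricks main repo
      uses: ./.github/actions/databricks-repo-update
      env:
        DATABRICKS_HOST: ${{ vars.DATABRICKS_HOST }}
        DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
      with:
        databricksRepoPath: /Repos/main/my-repo
        branch: main
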
# .github/workflows/sample-databricks-workflow.yaml
name: databricks-sample-workflow
on:
  # Run after changes are pushed to main
  push:
    branches:
      - main
  # Replace the above with the following to run the workflow when pushing/updating your branch (useful when testing the workflow)
  # push: {}
  # Run a full refresh of the DLT pipeline at midnight (UTC)
  schedule:
    - cron: '0 0 * * *' # full-refresh run at midnight
    - cron: '0 5 * * *' # early in the morning, re-run the reconcile job
  # Allow the workflow to be manually triggered
  workflow_dispatch:
    inputs:
      fullRefresh:
        description: 'Run a full refresh of the pipeline'
        required: false
        type: boolean
# Only allow 1 workflow running at the same time for the same workspace-pipeline combination,
# since databricks does not allow running a DLT pipeline if an update is already in progress
concurrency: my_environment-my_pipeline
jobs:
  databricks-ci-cd:
    runs-on: ubuntu-latest
    # This is a github environment with a secret named DATABRICKS_TOKEN and an env variable named DATABRICKS_HOST
    environment: my_environment
    steps:
      #----------------------------------------------
      # check-out repo
      #----------------------------------------------
      - name: Check out repository
        uses: actions/checkout@v3
      #----------------------------------------------
      # install databricks CLI
      #----------------------------------------------
      - name: Install databricks CLI
        uses: ./.github/actions/databricks-cli-install
      #----------------------------------------------
      # update databricks repo named "main"
      #----------------------------------------------
      - name: Update databricks main repo
        uses: ./.github/actions/databricks-repo-update
        env:
          # Databricks host/token come from the Github environment under which this action runs
          # NOTE: These take precedence for the CLI authentication: https://docs.databricks.com/dev-tools/cli/databricks-cli.html#auth-eval
          DATABRICKS_HOST: ${{ vars.DATABRICKS_HOST }}
          DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
      #----------------------------------------------
      # Run a full-refresh of a DLT pipeline named "my_pipeline"
      # This only runs on the midnight cron schedule, or when manually triggered with the fullRefresh option
      #----------------------------------------------
      - name: Full-refresh reconcile pipeline
        if: github.event.schedule == '0 0 * * *' || inputs.fullRefresh
        uses: ./.github/actions/databricks-pipeline-full-refresh
        env:
          DATABRICKS_HOST: ${{ vars.DATABRICKS_HOST }}
          DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
        with:
          pipelineName: my_pipeline
      #----------------------------------------------
      # trigger a databricks job named "my_job"
      # This is skipped if running the previous full-refresh, see note below
      #----------------------------------------------
      # NOTE: The job internally runs the DLT pipeline "my_pipeline" and will fail if there is already an update in progress.
      # However, when running a full-refresh, the "wait" parameter of the databricks CLI only waits until
      # the pipeline reaches the "RUNNING" state, and doesn't wait until it completes.
      # Therefore if we tried to run the job, it would fail since the full-refresh would still be running.
      # As a workaround, we trigger a full-refresh at midnight and skip the job. A few hours later, the github
      # workflow runs again (on the 5am cron), but this time only runs the job.
      - name: Run databricks job
        uses: ./.github/actions/databricks-job-run
        if: github.event.schedule != '0 0 * * *' && inputs.fullRefresh == false
        env:
          DATABRICKS_HOST: ${{ vars.DATABRICKS_HOST }}
          DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
        with:
          jobName: my_job
          # wait for the job to finish. Combined with the github concurrency control, this effectively
          # queues the pipeline updates when multiple PRs are merged within a short timeframe:
          # - The github action doesn't finish until the databricks job has finished
          # - While one github action is running, the next one has to wait
          wait: "true"
          waitTimeout: 40m0s

DaniJG commented Aug 7, 2023

Sample CI/CD pipeline for a databricks workspace, demonstrating how to implement a CI/CD workflow using GitHub Actions and the databricks CLI.

Folder structure would look like:

/.github
  /actions
      /databricks-cli-install
          action.yaml
      /databricks-job-run
          action.yaml 
      /databricks-pipeline-full-refresh
          action.yaml  
      /databricks-repo-update
          action.yaml  
  /workflows
      sample-databricks-workflow.yaml 
