Last active
December 11, 2023 21:04
-
-
Save DaniJG/b21170482545ad8c93874d164fd97a90 to your computer and use it in GitHub Desktop.
GitHub Actions for Databricks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: 'databricks-cli-install'
description: 'Install latest version of the databricks CLI'
runs:
  using: "composite"
  steps:
    # this is the new CLI, distributed as a standalone executable that doesn't need Python: https://docs.databricks.com/dev-tools/cli/databricks-cli.html
    # NOTE: the install script drops the "databricks" binary into /usr/local/bin so later steps can call it directly
    - shell: bash
      run: |
        curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: 'databricks-job-run'
description: 'Run a databricks job given its name'
inputs:
  jobName:
    description: 'The name of the databricks job that will be run'
    required: true
  wait:
    description: 'Whether the action should wait for the job to complete or not'
    default: 'false'
  waitTimeout:
    description: 'How long to wait before timing out when waiting for the job to finish'
    default: '20m0s'
runs:
  using: "composite"
  steps:
    # We use the "databricks jobs list" command of the CLI, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html
    # in order to get the job ID, then the "databricks jobs run-now" command to trigger it
    - shell: bash
      env:
        # Databricks host/token are set as "env" variables when invoking this action
        # NOTE: These take precedence for the CLI authentication: https://docs.databricks.com/dev-tools/cli/databricks-cli.html#auth-eval
        DATABRICKS_HOST: ${{ env.DATABRICKS_HOST }}
        DATABRICKS_TOKEN: ${{ env.DATABRICKS_TOKEN }}
        JOB_NAME: ${{ inputs.jobName }}
        WAIT: ${{ inputs.wait }}
        WAIT_TIMEOUT: ${{ inputs.waitTimeout }}
      run: |
        # Quote the job name so names containing spaces or glob characters are matched literally.
        # With the default pipefail of "shell: bash", a job name with no match fails this step
        # instead of silently running with an empty JOB_ID.
        JOB_ID=$(databricks jobs list | grep -w "$JOB_NAME" | awk '{print $1}')
        if [[ "$WAIT" == "true" ]]; then
          databricks jobs run-now "$JOB_ID" --timeout "$WAIT_TIMEOUT"
        else
          databricks jobs run-now "$JOB_ID" --no-wait
        fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: 'databricks-pipeline-full-refresh'
description: 'Runs a full-refresh of a DLT pipeline given its name'
inputs:
  pipelineName:
    description: 'The name of the databricks DLT pipeline that will be reset, like "my_pipeline"'
    required: true
  wait:
    # NOTE: the databricks CLI command to reset a DLT pipeline does not wait for the pipeline to finish!
    # You can only wait until the pipeline reaches the "running" stage! After that, we would need to implement
    # some active wait by repeatedly invoking a command like "databricks pipelines get-update" and checking its status
    description: 'Whether the action should wait for the pipeline to get started or not'
    default: 'false'
  waitTimeout:
    description: 'How long to wait before timing out when waiting for the full-refresh to reach the running stage'
    default: '20m0s'
runs:
  using: "composite"
  steps:
    # We use the "databricks pipelines list-pipelines" command of the CLI, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html
    # in order to get the pipeline ID, then the "databricks pipelines reset" command to reset it
    - shell: bash
      env:
        # Databricks host/token are set as "env" variables when invoking this action
        # NOTE: These take precedence for the CLI authentication: https://docs.databricks.com/dev-tools/cli/databricks-cli.html#auth-eval
        DATABRICKS_HOST: ${{ env.DATABRICKS_HOST }}
        DATABRICKS_TOKEN: ${{ env.DATABRICKS_TOKEN }}
        PIPELINE_NAME: ${{ inputs.pipelineName }}
        WAIT: ${{ inputs.wait }}
        WAIT_TIMEOUT: ${{ inputs.waitTimeout }}
      run: |
        # Pass the pipeline name via "jq --arg" instead of interpolating it into the filter string,
        # so names containing quotes or other jq-special characters cannot break the filter
        PIPELINE_ID=$(databricks pipelines list-pipelines | jq -r --arg name "$PIPELINE_NAME" '.[] | select(.name==$name) | .pipeline_id')
        if [[ "$WAIT" == "true" ]]; then
          databricks pipelines reset "$PIPELINE_ID" --timeout "$WAIT_TIMEOUT"
        else
          databricks pipelines reset "$PIPELINE_ID" --no-wait
        fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: 'databricks-repo-update'
description: 'Update a databricks repo given its path'
inputs:
  databricksRepoPath:
    description: 'The path of the databricks repo within the databricks workspace'
    default: '/Repos/main/my-repo'
  branch:
    description: 'The git branch the databricks repo should be updated to'
    default: 'main'
runs:
  using: "composite"
  steps:
    # We use the "databricks repos update" command of the CLI, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html
    # NOTE: If the databricks repo has dirty uncommitted files, the update will likely fail!
    # (In the same way if you try to git pull and have a conflict with modified files in your machine)
    - shell: bash
      env:
        # Databricks host/token are set as "env" variables when invoking this action
        # NOTE: These take precedence for the CLI authentication: https://docs.databricks.com/dev-tools/cli/databricks-cli.html#auth-eval
        DATABRICKS_HOST: ${{ env.DATABRICKS_HOST }}
        DATABRICKS_TOKEN: ${{ env.DATABRICKS_TOKEN }}
        REPO_PATH: ${{ inputs.databricksRepoPath }}
        BRANCH: ${{ inputs.branch }}
      run: |
        # Quote both values so repo paths/branches containing spaces do not word-split
        databricks repos update "$REPO_PATH" --branch "$BRANCH"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: databricks-sample-workflow
on:
  # Run after changes are pushed to main
  push:
    branches:
      - main
  # Replace above with following to run the workflow when pushing/updating your branch (useful when testing the workflow)
  # push: {}
  # Run at midnight (UTC) a full refresh of the DLT pipeline
  schedule:
    - cron: '0 0 * * *' # full-refresh run at midnight
    - cron: '0 5 * * *' # early in the morning, re-run the reconcile job.
  # Allow the workflow to be manually triggered
  workflow_dispatch:
    inputs:
      fullRefresh:
        description: 'Run a full refresh of the pipeline'
        required: false
        type: boolean
# Only allow 1 workflow running at the same time for the same workspace-pipeline combination
# since databricks does not allow running a DLT pipeline if an update is already in progress
concurrency: my_environment-my_pipeline
jobs:
  databricks-ci-cd:
    runs-on: ubuntu-latest
    # This is a github environment with a secret named DATABRICKS_TOKEN and an env variable named DATABRICKS_HOST
    environment: my_environment
    steps:
      #----------------------------------------------
      # check-out repo
      #----------------------------------------------
      - name: Check out repository
        uses: actions/checkout@v3
      #----------------------------------------------
      # install databricks CLI
      #----------------------------------------------
      - name: Install databricks CLI
        uses: ./.github/actions/databricks-cli-install
      #----------------------------------------------
      # update databricks repo named "main"
      #----------------------------------------------
      - name: Update databricks main repo
        uses: ./.github/actions/databricks-repo-update
        env:
          # Databricks host/token comes from the Github environment under which this action runs
          # NOTE: These take precedence for the CLI authentication: https://docs.databricks.com/dev-tools/cli/databricks-cli.html#auth-eval
          DATABRICKS_HOST: ${{ vars.DATABRICKS_HOST }}
          DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
      #----------------------------------------------
      # Run a full-refresh of a DLT pipeline named "my_pipeline"
      # This only runs on the midnight cron schedule, or when manually triggered with the fullRefresh option
      #----------------------------------------------
      - name: Full-refresh reconcile pipeline
        if: github.event.schedule == '0 0 * * *' || inputs.fullRefresh
        uses: ./.github/actions/databricks-pipeline-full-refresh
        env:
          DATABRICKS_HOST: ${{ vars.DATABRICKS_HOST }}
          DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
        with:
          pipelineName: my_pipeline
      #----------------------------------------------
      # trigger a databricks job named "my_job"
      # This is skipped if running the previous full-refresh, see note below
      #----------------------------------------------
      # NOTE: The job internally runs the DLT pipeline "my_pipeline" and will fail if there is already an update in progress
      # However when running a full-refresh, the "wait" parameter of the databricks CLI only waits until
      # the pipeline reaches the "RUNNING" state, and doesn't wait until it completes.
      # Therefore if we tried to run the job, it would fail since the full-refresh would still be running
      # As a workaround, we trigger a full-refresh at midnight and skip the job. A few hours later, the github workflow runs,
      # but this time only runs the job.
      - name: Run databricks job
        uses: ./.github/actions/databricks-job-run
        # "!inputs.fullRefresh" is also true when the event is not workflow_dispatch (inputs is empty then),
        # so this step runs on push and on the 5am schedule, and is skipped only for an explicit full refresh
        if: github.event.schedule != '0 0 * * *' && !inputs.fullRefresh
        env:
          DATABRICKS_HOST: ${{ vars.DATABRICKS_HOST }}
          DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
        with:
          jobName: my_job
          # wait for the job to be finished. Combined with the github concurrency control, effectively
          # queues the pipeline updates when multiple PRs are merged within a short timeframe.
          # - The github action doesn't finish until the databricks job has finished
          # - While one github action is running, the next one has to wait.
          wait: "true"
          waitTimeout: 40m0s
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sample CI/CD pipeline for a Databricks workspace, demonstrating how to implement a CI/CD workflow using GitHub Actions and the Databricks CLI.
Folder structure would look like: