OCI Data Flow Tutorial Example 1 using the OCI CLI
#!/usr/bin/env bash
# Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
#
# This script executes Example 1 of the Data Flow Tutorial
# https://docs.cloud.oracle.com/en-us/iaas/data-flow/data-flow-tutorial/tutorial/dfs_tut_etl_java.htm#etl_with_java
#
# For more help with specific Data Flow commands, see:
# oci data-flow -h
#
# Requirements for running this script:
# - OCI CLI v2.9.1 or later (you can check this by running oci --version)
# - jq (https://stedolan.github.io/jq/) for querying and manipulating the CLI's JSON output. jq is a useful
#   utility in general and covers scenarios that the CLI's --query option cannot fully address.
#
# The following environment variables must be set:
# - COMPARTMENT_ID - The OCID of the compartment where you want the Data Flow Application and Run created.
# - OUTPUT_PATH - The OCI Object Storage path for the ETLed data. The bucket must already exist; the script will not create it.
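#
# Example settings (placeholder values; substitute your own compartment OCID,
# bucket, and Object Storage namespace):
#   export COMPARTMENT_ID=ocid1.compartment.oc1..<unique-id>
#   export OUTPUT_PATH=oci://<bucket>@<namespace>/output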
set -e
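
# A minimal prerequisite check (an addition to the tutorial script, assuming
# 'oci' and 'jq' should be on the PATH): fail fast if either tool is missing.
if ! command -v oci > /dev/null 2>&1 || ! command -v jq > /dev/null 2>&1; then
    echo "Both the OCI CLI and jq must be installed and on your PATH"
    exit 1
fi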
if [[ -z "$COMPARTMENT_ID" ]]; then
echo "COMPARTMENT_ID must be defined in your environment"
exit 1
fi
if [[ -z "$OUTPUT_PATH" ]]; then
echo "OUTPUT_PATH must be defined in your environment"
exit 1
fi
echo "Creating the Data Flow Application"
{
    # Data Flow substitutes the values supplied via --parameters for the
    # ${input} and ${output} tokens in --arguments at run time.
    result=$(
        oci data-flow application create \
            --compartment-id "$COMPARTMENT_ID" \
            --display-name 'Data Flow Tutorial App 1 CLI' \
            --driver-shape VM.Standard2.1 \
            --executor-shape VM.Standard2.1 \
            --num-executors 1 \
            --spark-version 2.4.4 \
            --file-uri oci://oow_2019_dataflow_lab@bigdatadatasciencelarge/usercontent/oow-lab-2019-java-etl-1.0-SNAPSHOT.jar \
            --language Java \
            --class-name convert.Convert \
            --arguments '${input} ${output}' \
            --parameters "input=oci://oow_2019_dataflow_lab@bigdatadatasciencelarge/usercontent/kaggle_berlin_airbnb_listings_summary.csv output=\"$OUTPUT_PATH\""
    ) &&
        APPLICATION_ID=$(echo "$result" | jq -r '.data.id')
} || {
    echo "Could not create the Data Flow Application or get its ID."
    echo "$result"
    exit 1
}
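
# Optionally inspect the new Application (an optional extra, not part of the
# original tutorial flow; uncomment to use):
# oci data-flow application get --application-id "$APPLICATION_ID"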
echo "Running the Data Flow Application (ID = $APPLICATION_ID)"
{
    result=$(
        oci data-flow run create \
            --compartment-id "$COMPARTMENT_ID" \
            --application-id "$APPLICATION_ID" \
            --display-name 'Data Flow Tutorial App 1 CLI'
    ) &&
        RUN_ID=$(echo "$result" | jq -r '.data.id')
} || {
    echo "Could not run the Data Flow Application"
    echo "$result"
    exit 1
}
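
# Poll the Run until it reaches a terminal state, checking every INTERVAL
# seconds and giving up after 100 probes (about 25 minutes at 15-second intervals).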
STATUS=1
INTERVAL=15
PROBE_COUNT=0
while [ $STATUS -ne 0 ]; do
    echo "Waiting $INTERVAL seconds for the Data Flow Run to complete"
    sleep $INTERVAL
    {
        result=$(oci data-flow run get --run-id "$RUN_ID")
        STATE=$(echo "$result" | jq -r '.data."lifecycle-state"')
        echo "Run state of run ID $RUN_ID is $STATE"
        PROBE_COUNT=$((PROBE_COUNT + 1))
        if [ "$PROBE_COUNT" -gt 100 ]; then
            echo "Run is taking too long, exiting"
            exit 1
        fi
        if [ "$STATE" = "ACCEPTED" ] || [ "$STATE" = "IN_PROGRESS" ]; then
            echo "Run is in progress, waiting"
        elif [ "$STATE" = "SUCCEEDED" ]; then
            echo "Run is finished"
            STATUS=0
        elif [ "$STATE" = "FAILED" ]; then
            echo "Run failed, more information:"
            echo "$result" | jq -r '.data."lifecycle-details"'
            exit 1
        else
            echo "Unexpected state '$STATE', stopping"
            exit 1
        fi
    } || {
        echo "Could not get status of the Data Flow run"
        echo "$result"
        exit 1
    }
done
echo "Output of the Data Flow Run follows:"
oci data-flow run get-log --run-id $RUN_ID --name spark_application_stdout.log.gz --file -
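
# If the run misbehaves, the driver's stderr can be fetched the same way, e.g.:
#   oci data-flow run get-log --run-id "$RUN_ID" --name spark_application_stderr.log.gz --file -
# To clean up the tutorial resources afterwards (this permanently deletes the
# Application; the CLI prompts for confirmation unless --force is passed):
#   oci data-flow application delete --application-id "$APPLICATION_ID"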