OCI Data Flow Tutorial Example 1 using the OCI CLI
#!/usr/bin/env bash
# Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
#
# This script executes Example 1 of the Data Flow Tutorial
# https://docs.cloud.oracle.com/en-us/iaas/data-flow/data-flow-tutorial/tutorial/dfs_tut_etl_java.htm#etl_with_java
#
# For more help with specific Data Flow commands, see:
# oci data-flow -h
#
# Requirements for running this script:
# - OCI CLI v2.9.1 or later (you can check this by running oci --version)
# - jq (https://stedolan.github.io/jq/) for querying and manipulating the CLI's JSON output. jq is a useful
#   utility in general and covers scenarios that the CLI's --query option cannot fully address.
#
# The following environment variables must be set:
# - COMPARTMENT_ID - The OCID of the compartment where you want the Data Flow Application and Run created.
# - OUTPUT_PATH - The OCI Object Storage path for the ETLed data. The bucket must already exist; the script will not create it.
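#
# Example settings (placeholder values; substitute your own compartment OCID,
# bucket, and Object Storage namespace):
#   export COMPARTMENT_ID=ocid1.compartment.oc1..<unique-id>
#   export OUTPUT_PATH=oci://<bucket>@<namespace>/output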
set -e
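
# A minimal prerequisite check (an addition to the tutorial script, assuming
# 'oci' and 'jq' should be on the PATH): fail fast if either tool is missing.
if ! command -v oci > /dev/null 2>&1 || ! command -v jq > /dev/null 2>&1; then
    echo "Both the OCI CLI and jq must be installed and on your PATH"
    exit 1
fi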
if [[ -z "$COMPARTMENT_ID" ]]; then
echo "COMPARTMENT_ID must be defined in your environment"
exit 1
fi
if [[ -z "$OUTPUT_PATH" ]]; then
echo "OUTPUT_PATH must be defined in your environment"
exit 1
fi
echo "Creating the Data Flow Application"
{
    # Data Flow substitutes the values supplied via --parameters for the
    # ${input} and ${output} tokens in --arguments at run time.
    result=$(
        oci data-flow application create \
            --compartment-id "$COMPARTMENT_ID" \
            --display-name 'Data Flow Tutorial App 1 CLI' \
            --driver-shape VM.Standard2.1 \
            --executor-shape VM.Standard2.1 \
            --num-executors 1 \
            --spark-version 2.4.4 \
            --file-uri oci://oow_2019_dataflow_lab@bigdatadatasciencelarge/usercontent/oow-lab-2019-java-etl-1.0-SNAPSHOT.jar \
            --language Java \
            --class-name convert.Convert \
            --arguments '${input} ${output}' \
            --parameters "input=oci://oow_2019_dataflow_lab@bigdatadatasciencelarge/usercontent/kaggle_berlin_airbnb_listings_summary.csv output=\"$OUTPUT_PATH\""
    ) &&
        APPLICATION_ID=$(echo "$result" | jq -r '.data.id')
} || {
    echo "Could not create the Data Flow Application or get its ID."
    echo "$result"
    exit 1
}
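
# Optionally inspect the new Application (an optional extra, not part of the
# original tutorial flow; uncomment to use):
# oci data-flow application get --application-id "$APPLICATION_ID"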
echo "Running the Data Flow Application (ID = $APPLICATION_ID)"
{
    result=$(
        oci data-flow run create \
            --compartment-id "$COMPARTMENT_ID" \
            --application-id "$APPLICATION_ID" \
            --display-name 'Data Flow Tutorial App 1 CLI'
    ) &&
        RUN_ID=$(echo "$result" | jq -r '.data.id')
} || {
    echo "Could not run the Data Flow Application"
    echo "$result"
    exit 1
}
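
# Poll the Run until it reaches a terminal state, checking every INTERVAL
# seconds and giving up after 100 probes (about 25 minutes at 15-second intervals).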
STATUS=1
INTERVAL=15
PROBE_COUNT=0
while [ $STATUS -ne 0 ]; do
    echo "Waiting $INTERVAL seconds for the Data Flow Run to complete"
    sleep $INTERVAL
    {
        result=$(oci data-flow run get --run-id "$RUN_ID")
        STATE=$(echo "$result" | jq -r '.data."lifecycle-state"')
        echo "Run state of run ID $RUN_ID is $STATE"
        PROBE_COUNT=$((PROBE_COUNT + 1))
        if [ "$PROBE_COUNT" -gt 100 ]; then
            echo "Run is taking too long, exiting"
            exit 1
        fi
        if [ "$STATE" = "ACCEPTED" ] || [ "$STATE" = "IN_PROGRESS" ]; then
            echo "Run is in progress, waiting"
        elif [ "$STATE" = "SUCCEEDED" ]; then
            echo "Run is finished"
            STATUS=0
        elif [ "$STATE" = "FAILED" ]; then
            echo "Run failed, more information:"
            echo "$result" | jq -r '.data."lifecycle-details"'
            exit 1
        else
            echo "Unexpected state '$STATE', stopping"
            exit 1
        fi
    } || {
        echo "Could not get status of the Data Flow run"
        echo "$result"
        exit 1
    }
done
echo "Output of the Data Flow Run follows:"
oci data-flow run get-log --run-id $RUN_ID --name spark_application_stdout.log.gz --file -
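
# If the run misbehaves, the driver's stderr can be fetched the same way, e.g.:
#   oci data-flow run get-log --run-id "$RUN_ID" --name spark_application_stderr.log.gz --file -
# To clean up the tutorial resources afterwards (this permanently deletes the
# Application; the CLI prompts for confirmation unless --force is passed):
#   oci data-flow application delete --application-id "$APPLICATION_ID"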