Run a script as a specific service account on Google AI Platform
FROM centos:centos7
RUN yum install -y python3 wget
# GCloud Access
RUN wget -nv \
        https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz && \
    mkdir /root/tools && \
    tar xvzf google-cloud-sdk.tar.gz -C /root/tools && \
    rm google-cloud-sdk.tar.gz && \
    /root/tools/google-cloud-sdk/install.sh --usage-reporting=false \
        --path-update=false --bash-completion=false \
        --disable-installation-options && \
    rm -rf /root/.config/* && \
    ln -s /root/.config /config && \
# Remove the backup directory that gcloud creates
    rm -rf /root/tools/google-cloud-sdk/.install/.backup
# Path configuration
ENV PATH $PATH:/root/tools/google-cloud-sdk/bin
# Make sure gsutil will use the default service account
RUN echo -e '[GoogleCompute]\nservice_account = default' > /etc/boto.cfg
COPY ./entrypoint /root/entrypoint
ENTRYPOINT ["/root/entrypoint"]
#! /usr/bin/env bash
# If the first parameter looks like:
#   gs://some-bucket-name/foo_script
# then this entrypoint script will copy that script to /run/cmd, make it executable, and run it.
# The bucket will need to be in the project you're using to launch the training job; otherwise
# you'll need to give the default AI training service account access to the bucket
# (https://stackoverflow.com/questions/58478478/how-can-i-mount-a-gcs-bucket-in-a-custom-docker-image-on-ai-platform#comment103289576_58478549)
# A local smoke-test sketch follows this script.
set -euo pipefail
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
1>&2 echo "Entered Foo Container (container id: $(cat /etc/hostname))"
# if the caller has staged a script in /run/cmd, run it
runcmd() {
    1>&2 echo "examining /run/cmd:"
    SHEBANG='^#!.*$'
    FIRST_LINE="$(head -1 /run/cmd)"
    if [[ $FIRST_LINE =~ $SHEBANG ]]
    then
        1>&2 printf " It has a shebang, execution via:\n\t$FIRST_LINE\n"
        chmod +x /run/cmd
        /run/cmd
    else
        1>&2 echo " No shebang detected, sourcing /run/cmd..."
        source /run/cmd
    fi
}
# this function is for runtime setup stuff
initcontainer() {
    source "$DIR/init.sh"
    1>&2 echo "Foo Container Initialization Complete"
}
# if the user volume-mounted a gcloud directory, assume their identity
if [[ -d /root/.config/gcloud ]]
then
    initcontainer
fi
if [[ "$#" == 0 ]]
then # no arguments were supplied
if [ ! -t 1 ]
then
# stdin is not a tty, try /run/cmd
1>&2 echo "No command supplied"
runcmd
else
# stdin is a tty
1>&2 echo "No command supplied, going interactive..."
bash -i
fi
else # arguments were supplied
if [[ "$1" =~ ^gs://.*/.*script$ ]]
then
1>&2 echo "Fetching command from google cloud storage bucket into /run/cmd"
gsutil cp "$1" /run/cmd
runcmd
else
1>&2 echo "Running: \`$@\`"
eval $@
fi
fi
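The entrypoint can only fetch /run/cmd from a bucket it is allowed to read. A rough local smoke test, with placeholder bucket, image, and AI Platform service-agent names (the real service agent address comes from your project, not from this gist):

# placeholder names -- substitute your own
IMAGE=gcr.io/my-project/ai-platform-job-runner:latest
BUCKET=gs://my-project_job_smoketest
gsutil mb "$BUCKET"

# if the bucket lives outside the project that launches the job, grant the
# AI Platform service agent read access (see the Stack Overflow link above)
gsutil iam ch \
    "serviceAccount:service-123456789012@cloud-ml.google.com.iam.gserviceaccount.com:objectViewer" \
    "$BUCKET"

# stage a trivial script (its name must end in "script" to match the entrypoint's regex)
echo 'echo hello from /run/cmd' | gsutil cp - "$BUCKET/hello_script"

# run the image locally, reusing your own gcloud identity via the volume mount
docker run --rm -it \
    -v "$HOME/.config/gcloud:/root/.config/gcloud" \
    "$IMAGE" "$BUCKET/hello_script"

The Python module below (job.py, referenced by the last script in this gist) wraps the bucket and key handling into a Job class.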
# not my code
from sh import rm, gcloud, gsutil
import json
import sys
# my code
from env import PROJECT, REGION
from show import service_account
# Given:
# - An id for this job
# - A service account you want to run as
# - A script you want to run
# - A timeout for the job (so it doesn't get stuck and cost you $60 like it did for me)
# This function will:
# - Create a temporary service account key for this job
# - Create a storage bucket for this job
# - In it, put a launch script that:
#   - activates the job-runner service account using the temporary key
#   - runs the given script with a timeout
#   - stashes stdout, stderr, and the return code from the script's execution into the bucket
#   - deactivates the temporary key
# It returns the bucket id; the entrypoint fetches and runs {bucket_id}/run_script from it
def make_job_bucket(job_id, timeout, inner_script):
    # some local filenames specific to this job
    key_file = f'/dev/shm/{job_id}_key.json'
    lifecycle_file = f'/dev/shm/{job_id}_lifecycle.json'
    launch_script = f'/dev/shm/{job_id}_script'
    rm(['-f', key_file, lifecycle_file, launch_script])

    # create the storage bucket for this job
    print("## Make a bucket for the pending job", file=sys.stderr)
    bucket_id = f'gs://{PROJECT}_job_{job_id}'
    gsutil(['mb', '-l', REGION, bucket_id])
    # define a lifecycle rule for this bucket (so it self-deletes eventually)
    with open(lifecycle_file, 'w') as file:
        file.write(json.dumps({
            "lifecycle": {
                "rule": [
                    {
                        "action": {"type": "Delete"},
                        "condition": {"age": 5},
                    }
                ]
            }
        }))
    gsutil(['lifecycle', 'set', lifecycle_file, bucket_id])
    rm(['-f', lifecycle_file])
print("## Make a key for the service account to use for pending job", file=sys.stderr)
# get the service account that it will run as
sa = list(service_account().keys())[0]
# make a new key for this job only
gcloud(['iam', 'service-accounts', 'keys', 'create', key_file,
'--iam-account', sa])
# read it
with open(key_file, 'r') as file:
key = file.read()
key_id = json.loads(key)['private_key_id']
rm(['-f', key_file])
print("## Generate a script for the job to run at startup", file=sys.stderr)
# this script will be placed in /run/cmd
# (where it is expected that /root/entrypoint looks for instructions)
with open(launch_script, 'w') as file:
file.write(f'''#!/usr/bin/env bash
set -euo pipefail
1>&2 echo "## Injected Script Started"
# assume gcloud and gsutil exist in remote image (otherwise how did it get this script?)
# activate gcloud with the job runner sa key
cat << EOF!! > /dev/shm/sa_key
{key}
EOF!!
1>&2 echo "## Authenticating With Supplied Key"
set -x
gcloud auth activate-service-account --key-file=/dev/shm/sa_key
gcloud config set project {PROJECT}
1>&2 echo "## Running The Job
# (which may rely on the activated account)
mkdir -p /run
cat << EOF!! > /run/inner_script
{inner_script}
EOF!!
chmod +x /run/inner_script
set +ex
printf '\\n\\n\\n'
timeout {timeout} /run/inner_script 1> >(tee /tmp/stdout ) 2> >(tee /tmp/stderr >&2 )
# capture the exit status before any other command overwrites it
CODE=$?
printf '\\n\\n\\n'
set -ex
# report what happened with the command
case $CODE in
    0)
        echo "Job Command Succeeded"
        ;;
    124)
        echo "Job Command Timed Out After {timeout}"
        ;;
    *)
        echo "Job Command Failed With Code $CODE"
        ;;
esac
# extract run results into the bucket
echo $CODE > /tmp/code
gsutil cp /tmp/code {bucket_id}/code
gsutil cp /tmp/stdout {bucket_id}/stdout
gsutil cp /tmp/stderr {bucket_id}/stderr
echo "Copied stdout, stderr, and the return code to {bucket_id}, which will self destruct in 5 days"
# deactivate the key once finished
gcloud iam service-accounts keys delete --iam-account {sa} {key_id} --quiet
echo "Job {job_id} is done"
exit $CODE
''')
    # put the script in the bucket (it's up to the image entrypoint to fetch and run it)
    gsutil(['cp', launch_script, f'{bucket_id}/run_script'])
    return bucket_id
class Job():
    def __init__(self, image, command, timeout='72h'):
        self.id = foo_id.get()  # foo_id: job-id generator, defined elsewhere (not included in this gist)
        self.job_name = f'job_{self.id}'
        self.image = image
        self.bucket = make_job_bucket(self.id, timeout, command)

    def _run_job(self):
        gcloud(['ai-platform', 'jobs', 'submit', 'training', self.job_name,
                '--region', REGION,
                '--master-image-uri', self.image,
                '--',
                f'{self.bucket}/run_script'])

    def run_synchronous(self):
        self._run_job()
        gcloud(['ai-platform', 'jobs', 'stream-logs', self.job_name])
        return self.bucket

    def run_asynchronous(self):
        self._run_job()
        gcloud(['ai-platform', 'jobs', 'describe', self.job_name])
        return self.bucket
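job.py assumes the account returned by show.service_account() already exists and can write the job's results into the bucket; neither env.py nor show.py is included in the gist. A hedged sketch of creating such an account and granting it storage access, with placeholder names throughout:

# placeholder names -- substitute your own
PROJECT=my-project
SA_NAME=job-runner
SA=${SA_NAME}@${PROJECT}.iam.gserviceaccount.com

gcloud iam service-accounts create "$SA_NAME" --project "$PROJECT"

# the generated launch script authenticates as this account and then uploads
# code/stdout/stderr into the job bucket, so it needs storage write access
gcloud projects add-iam-policy-binding "$PROJECT" \
    --member "serviceAccount:$SA" \
    --role roles/storage.objectAdmin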
#! /usr/bin/env python3
# this script injects the command below into an AI Platform training job
# the script will be run using the service account configured by job.py
# its output will be stored in a GCP storage bucket
from env import JOB_IMAGE
from job import Job
script = '''#! /usr/bin/env python3
from sh import gcloud
gcloud(['config', 'list', 'account', '--format', 'value(core.account)'])
'''
a_job = Job(JOB_IMAGE, script)
a_job.run_synchronous()
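When the job finishes, the injected launch script will have copied the results into the bucket returned by run_synchronous() (or run_asynchronous()). A small sketch of reading them back, assuming a placeholder bucket name; the bucket self-deletes after 5 days per the lifecycle rule set in job.py:

# placeholder bucket name -- the Job methods return the real one
BUCKET=gs://my-project_job_1234

gsutil cat "$BUCKET/code"     # exit code of the injected command (124 means it timed out)
gsutil cat "$BUCKET/stdout"
gsutil cat "$BUCKET/stderr"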