Skip to content

Instantly share code, notes, and snippets.

@melrom
Last active December 20, 2015 02:09
Show Gist options
  • Save melrom/6054356 to your computer and use it in GitHub Desktop.
Save melrom/6054356 to your computer and use it in GitHub Desktop.
Remote Job Submission to Stampede
__author__ = "Ole Weidner"
__copyright__ = "Copyright 2012-2013, The SAGA Project"
__license__ = "MIT"
""" This example shows how to run a job on a remote SLURM cluster
using the 'SLURM' job adaptor.
More information about the saga-python job API can be found at:
http://saga-project.github.com/saga-python/doc/library/job/index.html
"""
import os
import sys
import saga
# Local login name taken from the USER environment variable (None when it is
# unset); main() interpolates it into the remote working directory path.
USERNAME = os.getenv('USER')
def main():
    """Submit a short '/bin/touch' job to Stampede through the SLURM adaptor.

    Sets up an SSH context and session, opens a job service on the Stampede
    login node, submits the job, lists the jobs known to the service,
    re-attaches to the submitted job by its id and waits for it to finish.

    Returns:
        0 after the job has been waited on, or -1 if a saga.SagaException
        was raised at any step (its type, message and traceback are printed).
    """
    # NOTE: every print() call below passes one pre-formatted string, so the
    # output is the same under Python 2's print statement and Python 3's
    # print function.
    try:
        # Your ssh identity on the remote machine.
        ctx = saga.Context("ssh")
        # Change this if you have a different username on the remote machine.
        ctx.user_id = "sagatut"

        session = saga.Session()
        session.add_context(ctx)

        # Create a job service object that represents a remote SLURM cluster.
        # The 'slurm' keyword in the URL scheme selects the SLURM adaptor
        # and '+ssh' enables remote access via SSH.
        js = saga.job.Service("slurm+ssh://login1.stampede.tacc.utexas.edu",
                              session=session)

        # Next, we describe the job we want to run. A complete set of job
        # description attributes can be found in the API documentation.
        jd = saga.job.Description()
        jd.environment = {'FILENAME': 'testfile'}
        jd.wall_time_limit = 1  # minutes
        jd.executable = '/bin/touch'
        jd.arguments = ['$FILENAME']
        jd.queue = "normal"
        jd.working_directory = "/home1/02554/sagatut/XSEDETutorial/%s/SAGA" % USERNAME
        jd.output = "examplejob.out"
        jd.error = "examplejob.err"

        # Create a new job from the job description. The initial state of
        # the job is 'New'.
        touchjob = js.create_job(jd)

        # Check our job's id and state.
        print("Job ID : %s" % (touchjob.id))
        print("Job State : %s" % (touchjob.state))

        # Now we can start our job.
        print("\n...starting job...\n")
        touchjob.run()

        print("Job ID : %s" % (touchjob.id))
        print("Job State : %s" % (touchjob.state))

        # List all jobs that are known by the adaptor.
        # This should show our job as well.
        print("\nListing active jobs: ")
        for job in js.list():
            print(" * %s" % job)

        # Now we disconnect and reconnect to our job by using the get_job()
        # method and our job's id. While this doesn't make a lot of sense
        # here, disconnect / reconnect can become very important for
        # long-running jobs.
        touchjob_clone = js.get_job(touchjob.id)

        # Wait for our job to complete.
        print("\n...waiting for job...\n")
        touchjob_clone.wait()

        print("Job State : %s" % (touchjob_clone.state))
        print("Exitcode : %s" % (touchjob_clone.exit_code))
        # print "Exec. hosts : %s" % (touchjob_clone.execution_hosts) # not impl.
        # print "Create time : %s" % (touchjob_clone.created)
        # print "Start time : %s" % (touchjob_clone.started)
        # print "End time : %s" % (touchjob_clone.finished)
        return 0

    except saga.SagaException as ex:
        # Catch all saga exceptions.
        print("An exception occured: (%s) %s " % (ex.type, (str(ex))))
        # Get the whole traceback in case of an exception -
        # this can be helpful for debugging the problem.
        print(" \n*** Backtrace:\n %s" % ex.traceback)
        return -1
if __name__ == "__main__":
    # Run the example and hand main()'s return value to the shell as the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment