Last active
December 20, 2015 02:09
-
-
Save melrom/6054356 to your computer and use it in GitHub Desktop.
Remote Job Submission to Stampede
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" This example shows how to run a job on a remote SLURM cluster
    using the 'SLURM' job adaptor.

    More information about the saga-python job API can be found at:
    http://saga-project.github.com/saga-python/doc/library/job/index.html
"""

# Module metadata. Kept below the docstring so that the string above is the
# first statement of the module and is therefore picked up as its __doc__.
__author__ = "Ole Weidner"
__copyright__ = "Copyright 2012-2013, The SAGA Project"
__license__ = "MIT"
import os | |
import sys | |
import saga | |
# Local login name; main() uses it to build the per-user working directory on
# the remote machine. USER is not set on every platform, so fall back to the
# other common login-name variables instead of ending up with None.
USERNAME = (os.environ.get('USER')
            or os.environ.get('LOGNAME')
            or os.environ.get('USERNAME'))
def main():
    """Submit a small '/bin/touch' job to Stampede via the SLURM adaptor.

    The job is created, started, listed, re-attached through its job id and
    waited for; its id, state and exit code are printed along the way.

    Returns:
        0 if the whole sequence ran without a SAGA error; -1 if the local
        user name is unknown or a saga.SagaException was caught.
    """
    # The remote working directory is built from the local user name; without
    # it the path would silently become ".../None/SAGA".
    if not USERNAME:
        print("Could not determine the local user name (is $USER set?)")
        return -1

    js = None
    try:
        # Your ssh identity on the remote machine.
        ctx = saga.Context("ssh")

        # Change e.g., if you have a different username on the remote machine
        ctx.user_id = "sagatut"

        session = saga.Session()
        session.add_context(ctx)

        # Create a job service object that represents a remote SLURM cluster.
        # The keyword 'slurm' in the url scheme selects the SLURM adaptor
        # and '+ssh' enables remote access via SSH.
        js = saga.job.Service("slurm+ssh://login1.stampede.tacc.utexas.edu",
                              session=session)

        # Next, we describe the job we want to run. A complete set of job
        # description attributes can be found in the API documentation.
        jd = saga.job.Description()
        jd.environment = {'FILENAME': 'testfile'}
        jd.wall_time_limit = 1  # minutes
        jd.executable = '/bin/touch'
        jd.arguments = ['$FILENAME']
        jd.queue = "normal"
        jd.working_directory = "/home1/02554/sagatut/XSEDETutorial/%s/SAGA" % USERNAME
        jd.output = "examplejob.out"
        jd.error = "examplejob.err"

        # Create a new job from the job description. The initial state of
        # the job is 'New'.
        touchjob = js.create_job(jd)

        # Check our job's id and state
        print("Job ID : %s" % (touchjob.id))
        print("Job State : %s" % (touchjob.state))

        # Now we can start our job.
        print("\n...starting job...\n")
        touchjob.run()

        print("Job ID : %s" % (touchjob.id))
        print("Job State : %s" % (touchjob.state))

        # List all jobs that are known by the adaptor.
        # This should show our job as well.
        print("\nListing active jobs: ")
        for job in js.list():
            print(" * %s" % job)

        # Now we disconnect and reconnect to our job by using the get_job()
        # method and our job's id. While this doesn't make a lot of sense
        # here, disconnect / reconnect can become very important for
        # long-running jobs.
        touchjob_clone = js.get_job(touchjob.id)

        # wait for our job to complete
        print("\n...waiting for job...\n")
        touchjob_clone.wait()

        print("Job State : %s" % (touchjob_clone.state))
        print("Exitcode : %s" % (touchjob_clone.exit_code))
        # print("Exec. hosts : %s" % (touchjob_clone.execution_hosts))  # not impl.
        # print("Create time : %s" % (touchjob_clone.created))
        # print("Start time : %s" % (touchjob_clone.started))
        # print("End time : %s" % (touchjob_clone.finished))

        return 0

    except saga.SagaException as ex:
        # Catch all saga exceptions
        print("An exception occured: (%s) %s " % (ex.type, (str(ex))))
        # Get the whole traceback in case of an exception -
        # this can be helpful for debugging the problem
        print(" \n*** Backtrace:\n %s" % ex.traceback)
        return -1

    finally:
        # Release the connection to the remote job service on every exit
        # path; js is still None if the service could not be created.
        if js is not None:
            js.close()
# Script entry point: run the example and pass main()'s return value to
# sys.exit() so it becomes the exit status of the process.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment