Skip to content

Instantly share code, notes, and snippets.

@LincolnBryant
Created January 11, 2016 19:38
Show Gist options
  • Save LincolnBryant/03f0af2f291611704565 to your computer and use it in GitHub Desktop.
Save LincolnBryant/03f0af2f291611704565 to your computer and use it in GitHub Desktop.

The "Campus Factory" script basically monitors schedds for idle jobs and submits as appropriate to the remote cluster. A typical submission looks like this:

condor_submit /home/ruc.clemson/bosco/libexec/campus_factory/share/glidein_jobs/job.submit.template  -a REMOTE_SCHEDULER="pbs" -a WN_TMP="/local_scratch" -a GLIDEIN_Site="osgconnect@user.palmetto.clemson.edu/pbs" -a GLIDEIN_DIR="/home/ruc.clemson/bosco/libexec/campus_factory/share/glidein_jobs" -a REMOTE_FACTORY="/scratch1/osgconnect/bosco/clemson/campus_factory" -a PASSWDFILE_LOCATION="/home/ruc.clemson/bosco/local.bosco/passwdfile" -a BOSCOCluster="osgconnect@user.palmetto.clemson.edu/pbs" -a GLIDEIN_HOST="rccf-osg.ci-connect.net:11012?sock=collector" -a REMOTE_CLUSTER="osgconnect@user.palmetto.clemson.edu"

I suspect APF can completely replace this component.

The job submit template looks like this:

# Glidein Job submission


# Define the default variables that will be used below
# NOTE - These will be automatically overwritten by configuration options
BOSCOCluster = None				# From Cluster Hosts in campus_factory.conf
GLIDEIN_HOST = $(BOSCOCluster)          # From COLLECTOR_HOST from condor_config
WN_TMP = /local_scratch                # From worker_tmp from campus_factory.conf


Universe = Grid

Executable = $(GLIDEIN_DIR)/glidein_wrapper.sh

Arguments = -dyn -f

Environment =	_condor_CONDOR_HOST=$(GLIDEIN_HOST);			\
		_condor_COLLECTOR_HOST=$(GLIDEIN_HOST);			\
		_condor_GLIDEIN_HOST=$(GLIDEIN_HOST);			\
		_condor_CONDOR_ADMIN=condor@$(GLIDEIN_HOST);		\
		_condor_NUM_CPUS=1;			\
		_condor_UID_DOMAIN=$(GLIDEIN_HOST);			\
		_condor_FILESYSTEM_DOMAIN=$(GLIDEIN_HOST);		\
		_condor_MAIL=/bin/mail;					\
		_condor_GLIDEIN_Site="$(GLIDEIN_Site)";			\
		_condor_BOSCOCluster="$(BOSCOCluster)";			\
		_campusfactory_wntmp=$(WN_TMP);				\
		_campusfactory_CAMPUSFACTORY_LOCATION=$(REMOTE_FACTORY)

transfer_input_files =	$(GLIDEIN_DIR)/glidein_condor_config,		\
			$(GLIDEIN_DIR)/user_job_wrapper.sh,		\
			$(GLIDEIN_DIR)/functions.sh,			\
			$(GLIDEIN_DIR)/glidein_startup,			\
			$(GLIDEIN_DIR)/lockfile,			\
			$(GLIDEIN_DIR)/exec_wrapper.sh,			\
			$(GLIDEIN_DIR)/SlotIsHealthy.sh,		\
			$(GLIDEIN_DIR)/glideinExec.tar.gz,		\
			$(GLIDEIN_DIR)/connect.tar.gz,			\
			$(PASSWDFILE_LOCATION)

should_transfer_files = YES
when_to_transfer_output = ON_EXIT
output = output
error = error

# Remove the job if it stays held for too long (threshold below is 300 seconds, i.e. 5 minutes)
PeriodicRemove = (JobStatus == 5 && time() - EnteredCurrentStatus > 300*1*1)
GlobusRSL =

Grid_Resource = batch $(REMOTE_SCHEDULER) $(REMOTE_CLUSTER)
+GlideinJob=TRUE
+BOSCOCluster="$(BOSCOCluster)"

Notification = Never
Queue

Once the job is submitted, the condor_gridmanager appears to invoke whatever program the REMOTE_GAHP configuration variable points to. The lookup seems to happen at roughly line 237 of htcondor/src/condor_gridmanager/infnbatchjob.cpp:

  if ( gahp_args.Count() > 0 ) {
    gahp_path = param( "REMOTE_GAHP" );
    if ( gahp_path == NULL ) {
      formatstr( error_string, "REMOTE_GAHP not defined" );
      goto error_exit;
    }
  } else {
    // CRUFT: BATCH_GAHP was added in 7.7.6.
    //   Checking <batch-type>_GAHP should be removed at some
    //   point in the future.
    if ( strcasecmp( batchType, "condor" ) ) {
      formatstr( buff, "%s_GAHP", batchType );
      gahp_path = param(buff.c_str());
    }
    if ( gahp_path == NULL ) {
      gahp_path = param( "BATCH_GAHP" );
      if ( gahp_path == NULL ) {
        formatstr( error_string, "Neither %s nor %s defined", buff.c_str(),
             "BATCH_GAHP" );
        goto error_exit;
      }
    }
  }

For us, $(REMOTE_GAHP) is defined to be

$ condor_config_val REMOTE_GAHP
/home/ruc.clemson/bosco/sbin/remote_gahp

which is a shim script that sets up the SSH tunnel between our factory and the remote clusters and runs $REMOTE_GLITE/bin/batch_gahp.

On the remote side in this case, $REMOTE_GLITE resolves to: /home/osgconnect/bosco/rccf-osg.ci-connect.net/clemson/glite

which is a directory created on the remote side during the "bosco_setup" phase, containing the BLAH scripts and batch_gahp:

$ ./batch_gahp
sh: /etc/batch_gahp.config: No such file or directory
$GahpVersion: 1.16.5 Mar 31 2008 INFN\ blahpd\ (poly,new_esc_format) $

I don't know much about GAHP, but I assume this is where the conversion magic actually happens on the remote side.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment