The "Campus Factory" script monitors the schedds for idle jobs and, when it finds any, submits glidein jobs to the remote cluster. A typical submission looks like this:
condor_submit /home/ruc.clemson/bosco/libexec/campus_factory/share/glidein_jobs/job.submit.template -a REMOTE_SCHEDULER="pbs" -a WN_TMP="/local_scratch" -a GLIDEIN_Site="osgconnect@user.palmetto.clemson.edu/pbs" -a GLIDEIN_DIR="/home/ruc.clemson/bosco/libexec/campus_factory/share/glidein_jobs" -a REMOTE_FACTORY="/scratch1/osgconnect/bosco/clemson/campus_factory" -a PASSWDFILE_LOCATION="/home/ruc.clemson/bosco/local.bosco/passwdfile" -a BOSCOCluster="osgconnect@user.palmetto.clemson.edu/pbs" -a GLIDEIN_HOST="rccf-osg.ci-connect.net:11012?sock=collector" -a REMOTE_CLUSTER="osgconnect@user.palmetto.clemson.edu"
I suspect that APF (AutoPyFactory) could completely replace this component.
The job submit template looks like this:
# Glidein Job submission
# Define the default variables that will be used below
# NOTE - These will be automatically overwritten by configuration options
BOSCOCluster = None # From Cluster Hosts in campus_factory.conf
GLIDEIN_HOST = $(BOSCOCluster) # From COLLECTOR_HOST from condor_config
WN_TMP = /local_scratch # From worker_tmp from campus_factory.conf
Universe = Grid
Executable = $(GLIDEIN_DIR)/glidein_wrapper.sh
Arguments = -dyn -f
Environment = _condor_CONDOR_HOST=$(GLIDEIN_HOST); \
_condor_COLLECTOR_HOST=$(GLIDEIN_HOST); \
_condor_GLIDEIN_HOST=$(GLIDEIN_HOST); \
_condor_CONDOR_ADMIN=condor@$(GLIDEIN_HOST); \
_condor_NUM_CPUS=1; \
_condor_UID_DOMAIN=$(GLIDEIN_HOST); \
_condor_FILESYSTEM_DOMAIN=$(GLIDEIN_HOST); \
_condor_MAIL=/bin/mail; \
_condor_GLIDEIN_Site="$(GLIDEIN_Site)"; \
_condor_BOSCOCluster="$(BOSCOCluster)"; \
_campusfactory_wntmp=$(WN_TMP); \
_campusfactory_CAMPUSFACTORY_LOCATION=$(REMOTE_FACTORY)
transfer_input_files = $(GLIDEIN_DIR)/glidein_condor_config, \
$(GLIDEIN_DIR)/user_job_wrapper.sh, \
$(GLIDEIN_DIR)/functions.sh, \
$(GLIDEIN_DIR)/glidein_startup, \
$(GLIDEIN_DIR)/lockfile, \
$(GLIDEIN_DIR)/exec_wrapper.sh, \
$(GLIDEIN_DIR)/SlotIsHealthy.sh, \
$(GLIDEIN_DIR)/glideinExec.tar.gz, \
$(GLIDEIN_DIR)/connect.tar.gz, \
$(PASSWDFILE_LOCATION)
should_transfer_files = YES
when_to_transfer_output = ON_EXIT
output = output
error = error
# Remove the job if it gets held for too long (NOTE: the expression below uses 300 seconds, i.e. 5 minutes, not 1 hour)
PeriodicRemove = (JobStatus == 5 && time() - EnteredCurrentStatus > 300*1*1)
GlobusRSL =
Grid_Resource = batch $(REMOTE_SCHEDULER) $(REMOTE_CLUSTER)
+GlideinJob=TRUE
+BOSCOCluster="$(BOSCOCluster)"
Notification = Never
Queue
The condor_gridmanager then appears to invoke whatever program $(REMOTE_GAHP) points to. The relevant code is roughly around line 237 of htcondor/src/condor_gridmanager/infnbatchjob.cpp:
if ( gahp_args.Count() > 0 ) {
gahp_path = param( "REMOTE_GAHP" );
if ( gahp_path == NULL ) {
formatstr( error_string, "REMOTE_GAHP not defined" );
goto error_exit;
}
} else {
// CRUFT: BATCH_GAHP was added in 7.7.6.
// Checking <batch-type>_GAHP should be removed at some
// point in the future.
if ( strcasecmp( batchType, "condor" ) ) {
formatstr( buff, "%s_GAHP", batchType );
gahp_path = param(buff.c_str());
}
if ( gahp_path == NULL ) {
gahp_path = param( "BATCH_GAHP" );
if ( gahp_path == NULL ) {
formatstr( error_string, "Neither %s nor %s defined", buff.c_str(),
"BATCH_GAHP" );
goto error_exit;
}
}
}
For us, $(REMOTE_GAHP) is defined as:
$ condor_config_val REMOTE_GAHP
/home/ruc.clemson/bosco/sbin/remote_gahp
which is a shim script that sets up the SSH tunnel between our factory and the remote clusters and runs $REMOTE_GLITE/bin/batch_gahp.
On the remote side in this case, $REMOTE_GLITE resolves to: /home/osgconnect/bosco/rccf-osg.ci-connect.net/clemson/glite
which is a directory created on the remote side during the "bosco_setup" phase, containing the BLAH scripts and batch_gahp:
$ ./batch_gahp
sh: /etc/batch_gahp.config: No such file or directory
I don't know much about GAHP (the Grid ASCII Helper Protocol), but I assume this is where job requests are actually translated into commands for the local batch system (PBS in this case) on the remote side.