@duncan-brown
Last active October 4, 2018 14:05
Modified version of the HTCondor MPI scripts for pycbc_inference. Put the file sshd.sh in ${HOME}/bin and adjust the SSHD_SH variable in openmpiscript to use an explicit path, since ${HOME} may not be expanded properly on the execute node. Better still, list sshd.sh as one of the input files to be transferred.
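For example (a sketch only; the paths shown are placeholders, not taken from this gist), the variable in openmpiscript could be set either to an explicit path, or to the scratch copy if sshd.sh is added to the job's transfer_input_files:

    SSHD_SH=/home/albert.einstein/bin/sshd.sh      # explicit path instead of ${HOME}/bin
    # or, when sshd.sh is listed in transfer_input_files in the submit file:
    SSHD_SH=$_CONDOR_SCRATCH_DIR/sshd.sh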
#! /bin/bash -v
set -x
pycbc_config_file=${1}
pycbc_output_file=${2}
echo "Using ${pycbc_config_file} as configuration file"
echo "Writing output to ${pycbc_output_file}"
pycbc_seed=11185
pycbc_walkers=200
frames="L1:/home/dbrown/projects/pycbc/pycbc-inference/model-selection/in-n-out-spiral/L1-SIMULATED_GW170817-1187008512-4096.gwf"
channels="L1:SIMULATED-GW170817"
export PYTHON_EGG_CACHE=$( mktemp -d )
echo $PYTHON_EGG_CACHE
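# The job ships a pre-built astropy cache (astropy.tar.gz, listed in the submit file's
# transfer_input_files) and unpacks it into a per-job XDG cache directory so astropy
# finds its data locally instead of downloading it on the execute node.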
XDG_CACHE_HOME=$(mktemp -d -p "$(pwd)")/xdg-cache
export XDG_CACHE_HOME
mkdir -p ${XDG_CACHE_HOME}/astropy
tar -C ${XDG_CACHE_HOME}/astropy -zxvf astropy.tar.gz &>/dev/null
echo "XDG_CACHE_HOME set to ${XDG_CACHE_HOME} which contains" `ls ${XDG_CACHE_HOME}`
astropy_cache=`python -c 'import astropy; print astropy.config.get_cache_dir()'`
echo "Astropy is using ${astropy_cache} which contains" `ls ${astropy_cache}`
echo "Fixing astropy cache to use condor cached data for timing"
python -c 'import os; import shelve; import astropy.utils.data ; datadir, shelveloc = astropy.utils.data._get_download_cache_locs(); db = shelve.open(shelveloc); hashname = os.path.basename(db["http://maia.usno.navy.mil/ser7/finals2000A.all"]); db["http://maia.usno.navy.mil/ser7/finals2000A.all"] = os.path.join(datadir,hashname); db.close()'
echo "Using $(( $_CONDOR_NPROCS * $_CONDOR_REQUEST_CPUS )) processors"
pycbc_inference --verbose \
--use-mpi \
--seed ${pycbc_seed} \
--instruments L1 \
--gps-start-time $((1187011840 - 300)) \
--gps-end-time $((1187011840 + 10)) \
--frame-files ${frames} \
--channel-name ${channels} \
--strain-high-pass 15 \
--pad-data 8 \
--psd-start-time $((1187008512 + 8)) \
--psd-end-time $((1187008512 + 3072)) \
--psd-estimation mean \
--psd-segment-length 16 \
--psd-segment-stride 8 \
--psd-inverse-length 8 \
--sample-rate 4096 \
--low-frequency-cutoff 20 \
--config-file ${pycbc_config_file} \
--output-file ${pycbc_output_file} \
--processing-scheme mkl \
--sampler emcee_pt \
--ntemps 30 \
--burn-in-function n_acl \
--likelihood-evaluator gaussian \
--nwalkers ${pycbc_walkers} \
--n-independent-samples 8000 \
--checkpoint-interval 2000 \
--nprocesses $(( $_CONDOR_NPROCS * $_CONDOR_REQUEST_CPUS )) \
--resume-from-checkpoint
exit $?
#!/usr/bin/env bash
##**************************************************************
##
## Copyright (C) 1990-2017, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
##
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License. You may
## obtain a copy of the License at
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************
# This is a script to run OpenMPI jobs under the HTCondor parallel universe.
# OpenMPI assumes that a full install is available on all execute nodes.
## sample submit script
#universe = parallel
#executable = openmpiscript
#arguments = actual_mpi_job arg1 arg2 arg3
#getenv = true
#
#should_transfer_files = yes
#transfer_input_files = actual_mpi_job
#when_to_transfer_output = on_exit_or_evict
#
#output = out.$(NODE)
#error = err.$(NODE)
#log = log
#
#notification = never
#machine_count = 8
#queue
##
# Trace each command as it executes (useful for debugging)
set -x
## configuration notes
# $MPDIR points to the location of the OpenMPI install
# You may set it manually (not recommended)
#MPDIR=/usr/lib64/openmpi
# The pool admin may set it via OPENMPI_INSTALL_PATH in the condor_config (recommended)
MPDIR=$(condor_config_val OPENMPI_INSTALL_PATH)
# $EXINT is a comma-delimited list of excluded network interfaces.
# If your mpi jobs are hanging, OpenMPI may be trying to use too many
# network interfaces to communicate between nodes.
# You may set it manually (not recommended)
#EXINT="docker0,virbr0"
# The pool admin may set it via OPENMPI_EXCLUDE_NETWORK_INTERFACES in the condor_config (recommended)
EXINT=$(condor_config_val OPENMPI_EXCLUDE_NETWORK_INTERFACES)
# We recommend that your pool admin use MOUNT_UNDER_SCRATCH = /tmp
# so that OpenMPI caches all data under the user's scratch directory.
# Not having /tmp mounted under scratch can also lead to unlink errors,
# which may hang mpi jobs.
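# For illustration (assumed values, not read from any particular pool's configuration),
# the admin-side knobs referenced above might look like this in condor_config:
#   OPENMPI_INSTALL_PATH = /usr/lib64/openmpi
#   OPENMPI_EXCLUDE_NETWORK_INTERFACES = docker0,virbr0
#   MOUNT_UNDER_SCRATCH = /tmp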
_USE_SCRATCH=$(condor_config_val MOUNT_UNDER_SCRATCH)
if [ -z "$_USE_SCRATCH" ]; then
echo "WARNING: MOUNT_UNDER_SCRATCH not set in condor_config"
elif test "${_USE_SCRATCH#*/tmp}" == "$_USE_SCRATCH"; then
echo "WARNING: /tmp not included in MOUNT_UNDER_SCRATCH"
fi
# If MPDIR is not set, then use a default value
if [ -z "$MPDIR" ]; then
echo "WARNING: Using default value for \$MPDIR in openmpiscript"
MPDIR=/usr/lib64/openmpi
fi
PATH=$MPDIR/bin:.:$PATH
export PATH
# If EXINT is not set, then use some default values
if [ -z "$EXINT" ]; then
echo "WARNING: Using default values for \$EXINT in openmpiscript"
EXINT="docker0,virbr0"
fi
# The condor_ssh helper script resides in $(LIBEXEC); this modified version looks for
# sshd.sh in ${HOME}/bin (see the note at the top of the gist)
CONDOR_SSH=$(condor_config_val libexec)
CONDOR_SSH=$CONDOR_SSH/condor_ssh
SSHD_SH=${HOME}/bin
SSHD_SH=$SSHD_SH/sshd.sh
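# As noted in the gist description, ${HOME} may not be expanded properly on the execute
# node, so replace the path above with an explicit one, or transfer sshd.sh as an input
# file and point SSHD_SH at $_CONDOR_SCRATCH_DIR/sshd.sh instead.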
##
# Set up SSHD on the node
. $SSHD_SH $_CONDOR_PROCNO $_CONDOR_NPROCS
# Set up mpirun cleanup function
_MPIRUN_PID=0
mpirun_cleanup() {
echo "Caught SIGTERM, cleaning up..."
if [ "$_MPIRUN_PID" -ne "0" ]; then
# Send SIGTERM to mpirun
echo "Sending SIGTERM to mpirun (${_MPIRUN_PID})..."
kill -s SIGTERM $_MPIRUN_PID
# Give mpirun 60 seconds to exit nicely before proceeding
echo "Waiting for mpirun to exit..."
for i in {1..12}; do
kill -0 $_MPIRUN_PID 2> /dev/null # Returns 0 if PID is running
if [ "$?" -ne "0" ]; then
break
fi
sleep 5
done
fi
# Cleanup sshd
echo "Cleaning up sshd files..."
sshd_cleanup
rm -f machines
echo "Exiting early."
exit 1
}
# If not the head node, just wait on the background sshd until the job is removed
if [ $_CONDOR_PROCNO -ne 0 ]
then
wait
sshd_cleanup
exit 0
# If the head node, then set the trap to cleanup mpirun (also does sshd_cleanup)
else
trap mpirun_cleanup SIGTERM
fi
EXECUTABLE=$1
shift
# The binary is copied but the executable flag may be cleared.
chmod +x $EXECUTABLE
# Set the location of the contact file
CONDOR_CONTACT_FILE=$_CONDOR_SCRATCH_DIR/contact
export CONDOR_CONTACT_FILE
# The first field in the contact file contains the node rank.
# mpirun will use a list of these node ranks,
# and condor_ssh will translate each of them into a hostname:port.
_CONDOR_JOB_AD=.job.ad
_CONDOR_REQUEST_CPUS=$(condor_q -jobads $_CONDOR_JOB_AD -af RequestCpus)
sort -n -k 1 < $CONDOR_CONTACT_FILE | awk '{print $1 " slots='$_CONDOR_REQUEST_CPUS'"}' > machines
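# For example (hypothetical values): a contact line written by sshd.sh such as
#   0 10.11.12.13 4444 dbrown /var/condor/execute/dir_12345 1538662000
# becomes, with RequestCpus = 36, the machines entry
#   0 slots=36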
# Check which MCA ssh agent parameter to use, since each has been
# deprecated in one OpenMPI version or another.
_MCA_FAIL=true
export LIGO_DATAFIND_SERVER=$LIGO_DATAFIND_SERVER
export X509_USER_PROXY=$X509_USER_PROXY
set +x
for mca_ssh_agent in orte_rsh_agent plm_rsh_agent
do
if $(ompi_info -a | grep \"${mca_ssh_agent}\" 1>/dev/null 2>&1)
then
if $(ompi_info -a | grep \"${mca_ssh_agent}\" | grep deprecated 1>/dev/null 2>&1); then continue; fi
_MCA_FAIL=false
# set MCA values for running on HTCondor
export OMPI_MCA_plm_rsh_no_tree_spawn="true" # disable ssh tree spawn
export OMPI_MCA_btl_tcp_if_exclude="lo,$EXINT" # exclude network interfaces
export LIGO_DATAFIND_SERVER=$LIGO_DATAFIND_SERVER
export X509_USER_PROXY=$X509_USER_PROXY
#Python Egg bug hack addition, May 17, 2018 SDR
PYTHON_EGG_CACHE=$( mktemp -d )
echo $PYTHON_EGG_CACHE
export PYTHON_EGG_CACHE
# optionally set MCA values for increasing mpirun verbosity
#export OMPI_MCA_plm_base_verbose=30
#export OMPI_MCA_btl_base_verbose=30
# run mpirun in the background and wait for it to exit
export _CONDOR_NPROCS _CONDOR_REQUEST_CPUS
mpirun -v --prefix $MPDIR --mca $mca_ssh_agent $CONDOR_SSH -n $(( $_CONDOR_NPROCS * $_CONDOR_REQUEST_CPUS )) -hostfile machines -x _CONDOR_NPROCS -x _CONDOR_REQUEST_CPUS -x PYTHONPATH -x PATH -x VIRTUAL_ENV -x LD_LIBRARY_PATH -x LAL_DATA_PATH -x LIGO_DATAFIND_SERVER -x X509_USER_PROXY $EXECUTABLE $@
mpi_return=$?
if [ $mpi_return -ne 0 ]; then
sshd_cleanup
rm -f machines
echo "ERROR: mpirun exited with code ${mpi_return}"
exit ${mpi_return}
fi
break
fi
done
if $_MCA_FAIL
then
echo "ERROR: could not find a suitable MCA ssh agent"
exit 255
fi
sshd_cleanup
rm -f machines
exit $mpi_return
universe = parallel
getenv = True
executable = /home/dbrown/projects/pycbc/pycbc-inference/model-selection/bin/openmpiscript
arguments = /home/dbrown/projects/pycbc/pycbc-inference/model-selection/bin/inference_wrapper.sh inspiral.ini /home/dbrown/projects/pycbc/pycbc-inference/model-selection/pe_runs/inspiral/results_inspiral.hdf
machine_count = 3
request_cpus = 36
request_memory = 35G
requirements = (CpuModelNumber == 85)
error = logs/inference_$(cluster).err
output = logs/inference_$(cluster).out
log = logs/inference_$(cluster).log
stream_error = True
stream_input = True
stream_output = True
should_transfer_files = yes
transfer_input_files = /home/dbrown/projects/pycbc/pycbc-inference/model-selection/pe_runs/inspiral/inspiral.ini,/home/dbrown/cache/astropy.tar.gz
queue
#!/bin/sh
##**************************************************************
##
## Copyright (C) 1990-2017, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
##
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License. You may
## obtain a copy of the License at
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************
sshd_cleanup() {
rm -f ${hostkey}.dsa ${hostkey}.rsa ${hostkey}.dsa.pub ${hostkey}.rsa.pub ${idkey} ${idkey}.pub $_CONDOR_SCRATCH_DIR/tmp/sshd.out $_CONDOR_SCRATCH_DIR/contact
}
trap sshd_cleanup SIGTERM
# note that sshd requires full paths
SSHD=`condor_config_val CONDOR_SSHD`
KEYGEN=`condor_config_val CONDOR_SSH_KEYGEN`
CONDOR_CHIRP=`condor_config_val libexec`
CONDOR_CHIRP=$CONDOR_CHIRP/condor_chirp
if [ -z "$SSHD" -o -z "$KEYGEN" ]
then
echo CONDOR_SSHD and/or CONDOR_SSH_KEYGEN are not configured, exiting
exit 255
fi
PORT=4444
_CONDOR_REMOTE_SPOOL_DIR=$_CONDOR_REMOTE_SPOOL_DIR
_CONDOR_PROCNO=$1
_CONDOR_NPROCS=$2
# make a tmp dir to store keys, etc., that
# won't get transferred back
if [ ! -d $_CONDOR_SCRATCH_DIR/tmp ] ; then
mkdir $_CONDOR_SCRATCH_DIR/tmp
fi
# Create the host keys
hostkey=$_CONDOR_SCRATCH_DIR/tmp/hostkey
for keytype in dsa rsa
do
rm -f ${hostkey}.${keytype} ${hostkey}.${keytype}.pub
$KEYGEN -q -f ${hostkey}.${keytype} -t $keytype -N ''
_TEST=$?
if [ $_TEST -ne 0 ]
then
echo "ssh key generator $KEYGEN returned error $_TEST, exiting"
exit 255
fi
done
idkey=$_CONDOR_SCRATCH_DIR/tmp/$_CONDOR_PROCNO.key
# Create the identity key
$KEYGEN -q -f $idkey -t rsa -N ''
_TEST=$?
if [ $_TEST -ne 0 ]
then
echo "ssh key generator $KEYGEN returned error $_TEST, exiting"
exit 255
fi
# Send the identity keys back home
$CONDOR_CHIRP put -perm 0700 $idkey $_CONDOR_REMOTE_SPOOL_DIR/$_CONDOR_PROCNO.key
_TEST=$?
if [ $_TEST -ne 0 ]
then
echo error $_TEST chirp putting identity keys back
exit 255
fi
# ssh needs full paths to all of its arguments
# Start up sshd
done=0
while [ $done -eq 0 ]
do
# Try to launch sshd on this port
$SSHD -p$PORT -oAuthorizedKeysFile=${idkey}.pub -oHostKey=${hostkey}.dsa -oHostKey=${hostkey}.rsa -De -f/dev/null -oStrictModes=no -oPidFile=/dev/null -oAcceptEnv=_CONDOR < /dev/null > $_CONDOR_SCRATCH_DIR/tmp/sshd.out 2>&1 &
pid=$!
# Give sshd some time
sleep 2
if grep "Server listening" $_CONDOR_SCRATCH_DIR/tmp/sshd.out > /dev/null 2>&1
then
done=1
else
# it is probably dead now
#kill -9 $pid > /dev/null 2>&1
PORT=`expr $PORT + 1`
fi
done
# Don't need this anymore
rm $_CONDOR_SCRATCH_DIR/tmp/sshd.out
# create contact file
hostname=`hostname -i`
currentDir=`pwd`
user=`whoami`
thisrun=`$CONDOR_CHIRP get_job_attr EnteredCurrentStatus`
echo "$_CONDOR_PROCNO $hostname $PORT $user $currentDir $thisrun" |
$CONDOR_CHIRP put -mode cwa - $_CONDOR_REMOTE_SPOOL_DIR/contact
_TEST=$?
if [ $_TEST -ne 0 ]
then
echo error $_TEST chirp putting contact info back to submit machine
exit 255
fi
# On the head node, poll for the contact file and fetch the keys
if [ $_CONDOR_PROCNO -eq 0 ]
then
done=0
count=0
# Need to poll the contact file until all nodes have reported in
while [ $done -eq 0 ]
do
rm -f contact
$CONDOR_CHIRP fetch $_CONDOR_REMOTE_SPOOL_DIR/contact $_CONDOR_SCRATCH_DIR/contact
lines=`grep -c $thisrun $_CONDOR_SCRATCH_DIR/contact`
if [ $lines -eq $_CONDOR_NPROCS ]
then
done=1
node=0
while [ $node -ne $_CONDOR_NPROCS ]
do
$CONDOR_CHIRP fetch $_CONDOR_REMOTE_SPOOL_DIR/$node.key $_CONDOR_SCRATCH_DIR/tmp/$node.key
# Now that we've got it, the submit side doesn't need it anymore
$CONDOR_CHIRP remove $_CONDOR_REMOTE_SPOOL_DIR/$node.key
node=`expr $node + 1`
done
chmod 0700 $_CONDOR_SCRATCH_DIR/tmp/*.key
# Erase the contact file from the spool directory, in case
# this job is held and rescheduled
$CONDOR_CHIRP remove $_CONDOR_REMOTE_SPOOL_DIR/contact
else
# Wait a second before polling again
sleep 1
fi
# Timeout after polling 1200 times (about 20 minutes)
count=`expr $count + 1`
if [ $count -eq 1200 ]
then
exit 1
fi
done
fi
# We'll source this file in the MPI startup scripts,
# so we can wait and run sshd_cleanup over there as needed
#wait
#sshd_cleanup