Last active
October 4, 2018 14:05
-
-
Save duncan-brown/074442912204a5576fa27a3ffc4db774 to your computer and use it in GitHub Desktop.
Modified version of HTCondor MPI scripts for pycbc_inference. Put the file sshd.sh in ${HOME}/bin and adjust the SSHD_SH variable in openmpiscript to give it the explicit path as ${HOME} may not be expanded properly. Even better, list it as one of the input files to be transferred.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/bash -v
# inference_wrapper.sh -- wrapper around pycbc_inference for the HTCondor
# parallel universe; handed to openmpiscript as the MPI executable (see the
# submit file's "arguments" line), so mpirun starts it on every rank.
#
# Arguments:
#   $1 - pycbc_inference configuration (.ini) file
#   $2 - path for the output HDF file
set -x
pycbc_config_file=${1}
pycbc_output_file=${2}
echo "Using ${pycbc_config_file} as configuration file"
echo "Writing output to ${pycbc_output_file}"
# Fixed sampler parameters for this analysis.
pycbc_seed=11185
pycbc_walkers=200
# Simulated GW170817-like frame file and channel to analyze (site-specific
# absolute path on the submit host's shared filesystem).
frames="L1:/home/dbrown/projects/pycbc/pycbc-inference/model-selection/in-n-out-spiral/L1-SIMULATED_GW170817-1187008512-4096.gwf"
channels="L1:SIMULATED-GW170817"
# Give this job a private Python egg cache so concurrent jobs on the same
# node do not collide over a shared ~/.python-eggs.
export PYTHON_EGG_CACHE=$(mktemp -d)
echo "$PYTHON_EGG_CACHE"

# Point XDG_CACHE_HOME at a job-local directory and unpack the pre-built
# astropy cache (transferred as astropy.tar.gz by the submit file) into it,
# so astropy never tries to download IERS/timing data on a worker node.
XDG_CACHE_HOME=$(mktemp -d -p "$(pwd)")/xdg-cache
export XDG_CACHE_HOME
mkdir -p "${XDG_CACHE_HOME}/astropy"
tar -C "${XDG_CACHE_HOME}/astropy" -zxvf astropy.tar.gz &>/dev/null
echo "XDG_CACHE_HOME set to ${XDG_CACHE_HOME} which contains" $(ls "${XDG_CACHE_HOME}")

# NOTE(review): the inline scripts below use Python 2 print syntax; this
# assumes the 'python' on PATH is the Python 2 that pycbc runs under.
astropy_cache=$(python -c 'import astropy; print astropy.config.get_cache_dir()')
echo "Astropy is using ${astropy_cache} which contains" $(ls ${astropy_cache})

# Rewrite astropy's download-cache shelve entry for the IERS timing-data URL
# so it points at the condor-transferred copy under the new cache dir
# instead of whatever absolute path was baked in when the tarball was made.
echo "Fixing astropy cache to use condor cached data for timing"
python -c 'import os; import shelve; import astropy.utils.data ; datadir, shelveloc = astropy.utils.data._get_download_cache_locs(); db = shelve.open(shelveloc); hashname = os.path.basename(db["http://maia.usno.navy.mil/ser7/finals2000A.all"]); db["http://maia.usno.navy.mil/ser7/finals2000A.all"] = os.path.join(datadir,hashname); db.close()'

# Bare variable names inside $(( )) evaluate as 0 when unset; the original
# "$VAR" form is an arithmetic syntax error outside a condor job.
echo "Using $(( _CONDOR_NPROCS * _CONDOR_REQUEST_CPUS )) processors"
# Run the sampler. Notes:
#  - --use-mpi: the MPI ranks are spawned by the surrounding mpirun
#    (openmpiscript), not by pycbc_inference itself
#  - the analysis segment is 310 s around GPS time 1187011840; the PSD is
#    estimated from ~3000 s of earlier data in the same frame
#  - --nprocesses must equal the total mpirun slot count, i.e.
#    _CONDOR_NPROCS * _CONDOR_REQUEST_CPUS (machine_count * request_cpus
#    from the submit file); both vars are exported by openmpiscript
pycbc_inference --verbose \
    --use-mpi \
    --seed ${pycbc_seed} \
    --instruments L1 \
    --gps-start-time $((1187011840 - 300)) \
    --gps-end-time $((1187011840 + 10)) \
    --frame-files ${frames} \
    --channel-name ${channels} \
    --strain-high-pass 15 \
    --pad-data 8 \
    --psd-start-time $((1187008512 + 8)) \
    --psd-end-time $((1187008512 + 3072)) \
    --psd-estimation mean \
    --psd-segment-length 16 \
    --psd-segment-stride 8 \
    --psd-inverse-length 8 \
    --sample-rate 4096 \
    --low-frequency-cutoff 20 \
    --config-file ${pycbc_config_file} \
    --output-file ${pycbc_output_file} \
    --processing-scheme mkl \
    --sampler emcee_pt \
    --ntemps 30 \
    --burn-in-function n_acl \
    --likelihood-evaluator gaussian \
    --nwalkers ${pycbc_walkers} \
    --n-independent-samples 8000 \
    --checkpoint-interval 2000 \
    --nprocesses $(( $_CONDOR_NPROCS * $_CONDOR_REQUEST_CPUS )) \
    --resume-from-checkpoint
# Propagate pycbc_inference's exit status back to HTCondor.
exit $?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
##************************************************************** | |
## | |
## Copyright (C) 1990-2017, Condor Team, Computer Sciences Department, | |
## University of Wisconsin-Madison, WI. | |
## | |
## Licensed under the Apache License, Version 2.0 (the "License"); you | |
## may not use this file except in compliance with the License. You may | |
## obtain a copy of the License at | |
## | |
## http://www.apache.org/licenses/LICENSE-2.0 | |
## | |
## Unless required by applicable law or agreed to in writing, software | |
## distributed under the License is distributed on an "AS IS" BASIS, | |
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
## See the License for the specific language governing permissions and | |
## limitations under the License. | |
## | |
##************************************************************** | |
# This is a script to run OpenMPI jobs under the HTCondor parallel universe. | |
# OpenMPI assumes that a full install is available on all execute nodes. | |
## sample submit script | |
#universe = parallel | |
#executable = openmpiscript | |
#arguments = actual_mpi_job arg1 arg2 arg3 | |
#getenv = true | |
# | |
#should_transfer_files = yes | |
#transfer_input_files = actual_mpi_job | |
#when_to_transfer_output = on_exit_or_evict | |
# | |
#output = out.$(NODE) | |
#error = err.$(NODE) | |
#log = log | |
# | |
#notification = never | |
#machine_count = 8 | |
#queue | |
## | |
# Echo each command as it runs, for debugging. (NB: this is tracing only;
# the upstream comment claimed fail-fast but no 'set -e' is in effect, and
# later steps deliberately tolerate non-zero statuses.)
set -x
## configuration notes
# $MPDIR points to the location of the OpenMPI install
# You may set it manually (not recommended)
#MPDIR=/usr/lib64/openmpi
# The pool admin may set it via OPENMPI_INSTALL_PATH in the condor_config (recommended)
MPDIR=$(condor_config_val OPENMPI_INSTALL_PATH)
# $EXINT is a comma-delimited list of excluded network interfaces.
# If your mpi jobs are hanging, OpenMPI may be trying to use too many
# network interfaces to communicate between nodes.
# You may set it manually (not recommended)
#EXINT="docker0,virbr0"
# The pool admin may set it via OPENMPI_EXCLUDE_NETWORK_INTERFACES in the condor_config (recommended)
EXINT=$(condor_config_val OPENMPI_EXCLUDE_NETWORK_INTERFACES)
# We recommend that your pool admin use MOUNT_UNDER_SCRATCH = /tmp
# so that OpenMPI caches all data under the user's scratch directory.
# Not having /tmp mounted under scratch can also lead to unlink errors,
# which may hang mpi jobs.
_USE_SCRATCH=$(condor_config_val MOUNT_UNDER_SCRATCH)
# All test operands are quoted: the original unquoted forms break (or
# mis-evaluate) when a config value is empty or contains whitespace.
if [ -z "$_USE_SCRATCH" ]; then
    echo "WARNING: MOUNT_UNDER_SCRATCH not set in condor_config"
elif test "${_USE_SCRATCH#*/tmp}" == "$_USE_SCRATCH"; then
    echo "WARNING: /tmp not included in MOUNT_UNDER_SCRATCH"
fi
# If MPDIR is not set, then use a default value
if [ -z "$MPDIR" ]; then
    echo "WARNING: Using default value for \$MPDIR in openmpiscript"
    MPDIR=/usr/lib64/openmpi
fi
PATH=$MPDIR/bin:.:$PATH
export PATH
# If EXINT is not set, then use some default values
if [ -z "$EXINT" ]; then
    echo "WARNING: Using default values for \$EXINT in openmpiscript"
    EXINT="docker0,virbr0"
fi
# condor_ssh lives in $(LIBEXEC); sshd.sh is expected in the user's ~/bin
# (per the gist notes: transfer it as an input file if ${HOME} does not
# resolve on the execute node).
CONDOR_SSH=$(condor_config_val libexec)
CONDOR_SSH=$CONDOR_SSH/condor_ssh
SSHD_SH=${HOME}/bin
SSHD_SH=$SSHD_SH/sshd.sh
# Source sshd.sh to start this node's sshd and define sshd_cleanup.
. $SSHD_SH $_CONDOR_PROCNO $_CONDOR_NPROCS
# Cleanup handler installed on the head node: tear down mpirun and the
# per-node sshd state when HTCondor sends SIGTERM.
_MPIRUN_PID=0
mpirun_cleanup() {
    echo "Caught SIGTERM, cleaning up..."
    if [ "$_MPIRUN_PID" -ne "0" ]; then
        # Ask mpirun to shut down gracefully.
        echo "Sending SIGTERM to mpirun (${_MPIRUN_PID})..."
        kill -s SIGTERM $_MPIRUN_PID
        # Poll up to 12 times at 5 s intervals (60 s total) for it to exit.
        echo "Waiting for mpirun to exit..."
        _polls=0
        while [ "$_polls" -lt 12 ]; do
            if ! kill -0 $_MPIRUN_PID 2> /dev/null; then
                # PID no longer running; stop waiting.
                break
            fi
            sleep 5
            _polls=$((_polls + 1))
        done
    fi
    # Remove sshd keys/contact files and the mpirun hostfile.
    echo "Cleaning up sshd files..."
    sshd_cleanup
    rm -f machines
    echo "Exiting early."
    exit 1
}
# Non-head nodes exist only to host sshd daemons for the head node's
# condor_ssh: wait for the backgrounded sshd, clean up, and exit.
if [ "$_CONDOR_PROCNO" -ne 0 ]
then
    wait
    sshd_cleanup
    exit 0
# Head node: install the trap so mpirun (and sshd state) is cleaned up
# if HTCondor sends SIGTERM.
else
    trap mpirun_cleanup SIGTERM
fi

# First argument is the MPI executable; the remainder are its arguments.
EXECUTABLE=$1
shift
# The binary is copied but the executable flag may be cleared.
chmod +x "$EXECUTABLE"

# The contact file (written by each node's sshd.sh via chirp) maps node
# ranks to hostname:port; condor_ssh translates ranks using it.
CONDOR_CONTACT_FILE=$_CONDOR_SCRATCH_DIR/contact
export CONDOR_CONTACT_FILE

# Build the mpirun hostfile: one line per node rank, each granted the
# job's RequestCpus slots. The slot count is passed to awk with -v
# instead of being spliced into the program text.
_CONDOR_JOB_AD=.job.ad
_CONDOR_REQUEST_CPUS=$(condor_q -jobads $_CONDOR_JOB_AD -af RequestCpus)
sort -n -k 1 < "$CONDOR_CONTACT_FILE" | awk -v slots="$_CONDOR_REQUEST_CPUS" '{print $1 " slots=" slots}' > machines
# Determine which MCA ssh-agent parameter this OpenMPI build understands,
# since orte_rsh_agent and plm_rsh_agent have each been deprecated at one
# OpenMPI version or another.
_MCA_FAIL=true
# Re-export grid credentials so they appear in mpirun's environment.
export LIGO_DATAFIND_SERVER=$LIGO_DATAFIND_SERVER
export X509_USER_PROXY=$X509_USER_PROXY
set +x
for mca_ssh_agent in orte_rsh_agent plm_rsh_agent
do
    # Test the pipeline status directly; the original "if $( ... )" form
    # only worked by accident of command-substitution status propagation.
    if ompi_info -a | grep "\"${mca_ssh_agent}\"" 1>/dev/null 2>&1
    then
        # Skip any agent this build marks as deprecated.
        if ompi_info -a | grep "\"${mca_ssh_agent}\"" | grep deprecated 1>/dev/null 2>&1; then continue; fi
        _MCA_FAIL=false
        # set MCA values for running on HTCondor
        export OMPI_MCA_plm_rsh_no_tree_spawn="true"    # disable ssh tree spawn
        export OMPI_MCA_btl_tcp_if_exclude="lo,$EXINT"  # exclude network interfaces
        export LIGO_DATAFIND_SERVER=$LIGO_DATAFIND_SERVER
        export X509_USER_PROXY=$X509_USER_PROXY
        # Python Egg bug hack addition, May 17, 2018 SDR: give the job a
        # private egg cache.
        PYTHON_EGG_CACHE=$(mktemp -d)
        echo "$PYTHON_EGG_CACHE"
        # BUG FIX: the original ran "export $PYTHON_EGG_CACHE", which tries
        # to export the *path value* as a variable name (invalid identifier)
        # and leaves PYTHON_EGG_CACHE unexported.
        export PYTHON_EGG_CACHE
        # optionally set MCA values for increasing mpirun verbosity
        #export OMPI_MCA_plm_base_verbose=30
        #export OMPI_MCA_btl_base_verbose=30
        # Launch one rank per requested CPU across all condor nodes; -x
        # forwards each named variable to the remote ranks. Bare names in
        # $(( )) avoid an arithmetic syntax error if a var is unset.
        export _CONDOR_NPROCS _CONDOR_REQUEST_CPUS
        mpirun -v --prefix "$MPDIR" --mca $mca_ssh_agent "$CONDOR_SSH" \
            -n $(( _CONDOR_NPROCS * _CONDOR_REQUEST_CPUS )) \
            -hostfile machines \
            -x _CONDOR_NPROCS -x _CONDOR_REQUEST_CPUS \
            -x PYTHONPATH -x PATH -x VIRTUAL_ENV -x LD_LIBRARY_PATH \
            -x LAL_DATA_PATH -x LIGO_DATAFIND_SERVER -x X509_USER_PROXY \
            "$EXECUTABLE" "$@"
        mpi_return=$?
        if [ $mpi_return -ne 0 ]; then
            sshd_cleanup
            rm -f machines
            echo "ERROR: mpirun exited with code ${mpi_return}"
            exit ${mpi_return}
        fi
        break
    fi
done
if $_MCA_FAIL
then
    echo could not find a suitable MCA ssh agent
    exit 255
fi
sshd_cleanup
rm -f machines
exit $mpi_return
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# HTCondor submit description: run pycbc_inference under MPI using
# openmpiscript in the parallel universe.
universe = parallel
getenv = True
# openmpiscript bootstraps sshd/mpirun; its first argument is the real
# program (inference_wrapper.sh), followed by that script's own arguments:
# the .ini config file and the output HDF path.
executable = /home/dbrown/projects/pycbc/pycbc-inference/model-selection/bin/openmpiscript
arguments = /home/dbrown/projects/pycbc/pycbc-inference/model-selection/bin/inference_wrapper.sh inspiral.ini /home/dbrown/projects/pycbc/pycbc-inference/model-selection/pe_runs/inspiral/results_inspiral.hdf
# 3 nodes x 36 cores = 108 MPI slots; the wrapper's --nprocesses is computed
# as _CONDOR_NPROCS * RequestCpus, so these two knobs set the rank count.
machine_count = 3
request_cpus = 36
request_memory = 35G
# Pins execution to one CPU model number — presumably for performance
# homogeneity across ranks (the wrapper uses the MKL processing scheme);
# confirm the intent before reusing on another pool.
requirements = (CpuModelNumber == 85)
error = logs/inference_$(cluster).err
output = logs/inference_$(cluster).out
log = logs/inference_$(cluster).log
stream_error = True
stream_input = True
stream_output = True
# inspiral.ini and the pre-built astropy cache tarball are transferred to
# the worker; the wrapper unpacks astropy.tar.gz into XDG_CACHE_HOME.
should_transfer_files = yes
transfer_input_files = /home/dbrown/projects/pycbc/pycbc-inference/model-selection/pe_runs/inspiral/inspiral.ini,/home/dbrown/cache/astropy.tar.gz
queue
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
##************************************************************** | |
## | |
## Copyright (C) 1990-2017, Condor Team, Computer Sciences Department, | |
## University of Wisconsin-Madison, WI. | |
## | |
## Licensed under the Apache License, Version 2.0 (the "License"); you | |
## may not use this file except in compliance with the License. You may | |
## obtain a copy of the License at | |
## | |
## http://www.apache.org/licenses/LICENSE-2.0 | |
## | |
## Unless required by applicable law or agreed to in writing, software | |
## distributed under the License is distributed on an "AS IS" BASIS, | |
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
## See the License for the specific language governing permissions and | |
## limitations under the License. | |
## | |
##************************************************************** | |
# Remove the ssh host keys, identity keys, sshd log, and contact file so
# nothing is transferred back (or left behind on eviction). Paths are
# quoted so a scratch directory containing whitespace cannot word-split.
sshd_cleanup() {
    rm -f "${hostkey}.dsa" "${hostkey}.rsa" "${hostkey}.dsa.pub" "${hostkey}.rsa.pub" \
        "${idkey}" "${idkey}.pub" \
        "$_CONDOR_SCRATCH_DIR/tmp/sshd.out" "$_CONDOR_SCRATCH_DIR/contact"
}
trap sshd_cleanup SIGTERM
# note the sshd requires full path
SSHD=$(condor_config_val CONDOR_SSHD)
KEYGEN=$(condor_config_val CONDOR_SSH_KEYGEN)
CONDOR_CHIRP=$(condor_config_val libexec)
CONDOR_CHIRP=$CONDOR_CHIRP/condor_chirp
# Both binaries are required; "[ a ] || [ b ]" replaces the obsolescent
# "[ a -o b ]" form.
if [ -z "$SSHD" ] || [ -z "$KEYGEN" ]
then
    echo CONDOR_SSHD and/or CONDOR_SSH_KEYGEN are not configured, exiting
    exit 255
fi
# First port to try for sshd; incremented by the launch loop below until
# one binds.
PORT=4444
# Kept from upstream: re-asserts the spool dir in this shell (no-op).
_CONDOR_REMOTE_SPOOL_DIR=$_CONDOR_REMOTE_SPOOL_DIR
# Arguments from openmpiscript: this node's rank and the node count.
_CONDOR_PROCNO=$1
_CONDOR_NPROCS=$2
# make a tmp dir to store keys, etc, that
# won't get transferred back
if [ ! -d "$_CONDOR_SCRATCH_DIR/tmp" ] ; then
    mkdir "$_CONDOR_SCRATCH_DIR/tmp"
fi
# Create fresh DSA and RSA host keys for this node's sshd.
hostkey=$_CONDOR_SCRATCH_DIR/tmp/hostkey
for keytype in dsa rsa
do
    rm -f "${hostkey}.${keytype}" "${hostkey}.${keytype}.pub"
    $KEYGEN -q -f "${hostkey}.${keytype}" -t $keytype -N ''
    _TEST=$?
    if [ $_TEST -ne 0 ]
    then
        echo ssh keygenerator $KEYGEN returned error $_TEST exiting
        exit 255
    fi
done
# Create the per-rank identity key used for passwordless condor_ssh logins.
idkey=$_CONDOR_SCRATCH_DIR/tmp/$_CONDOR_PROCNO.key
$KEYGEN -q -f "$idkey" -t rsa -N ''
_TEST=$?
if [ $_TEST -ne 0 ]
then
    echo ssh keygenerator $KEYGEN returned error $_TEST exiting
    exit 255
fi
# Send the identity key back to the submit machine's spool via chirp so
# the head node can fetch it later.
$CONDOR_CHIRP put -perm 0700 "$idkey" "$_CONDOR_REMOTE_SPOOL_DIR/$_CONDOR_PROCNO.key"
_TEST=$?
if [ $_TEST -ne 0 ]
then
    echo error $_TEST chirp putting identity keys back
    exit 255
fi
done=0 | |
while [ $done -eq 0 ] | |
do | |
# Try to launch sshd on this port | |
$SSHD -p$PORT -oAuthorizedKeysFile=${idkey}.pub -oHostKey=${hostkey}.dsa -oHostKey=${hostkey}.rsa -De -f/dev/null -oStrictModes=no -oPidFile=/dev/null -oAcceptEnv=_CONDOR < /dev/null > $_CONDOR_SCRATCH_DIR/tmp/sshd.out 2>&1 & | |
pid=$! | |
# Give sshd some time | |
sleep 2 | |
if grep "Server listening" $_CONDOR_SCRATCH_DIR/tmp/sshd.out > /dev/null 2>&1 | |
then | |
done=1 | |
else | |
# it is probably dead now | |
#kill -9 $pid > /dev/null 2>&1 | |
PORT=`expr $PORT + 1` | |
fi | |
done | |
# Don't need this anymore | |
rm $_CONDOR_SCRATCH_DIR/tmp/sshd.out | |
# Append this node's line to the contact file on the submit machine:
# "<rank> <ip> <sshd port> <user> <working dir> <run stamp>".
hostname=$(hostname -i)
currentDir=$(pwd)
user=$(whoami)
# EnteredCurrentStatus stamps this run, letting the head node distinguish
# fresh contact lines from leftovers of an evicted earlier run.
thisrun=$($CONDOR_CHIRP get_job_attr EnteredCurrentStatus)
echo "$_CONDOR_PROCNO $hostname $PORT $user $currentDir $thisrun" |
    $CONDOR_CHIRP put -mode cwa - "$_CONDOR_REMOTE_SPOOL_DIR/contact"
_TEST=$?
if [ $_TEST -ne 0 ]
then
    echo error $_TEST chirp putting contact info back to submit machine
    exit 255
fi
# On the head node, gather the contact file and every node's identity key
# before mpirun starts.
if [ "$_CONDOR_PROCNO" -eq 0 ]
then
    done=0
    count=0
    # Need to poll the contact file until all nodes have reported in
    while [ $done -eq 0 ]
    do
        # NOTE(review): this removes ./contact in the cwd while the fetch
        # below writes to $_CONDOR_SCRATCH_DIR/contact — likely a holdover;
        # harmless when the cwd is the scratch directory.
        rm -f contact
        $CONDOR_CHIRP fetch "$_CONDOR_REMOTE_SPOOL_DIR/contact" "$_CONDOR_SCRATCH_DIR/contact"
        # Count only lines stamped with this run's EnteredCurrentStatus so
        # leftovers from a previous (evicted) run are ignored.
        lines=$(grep -c "$thisrun" "$_CONDOR_SCRATCH_DIR/contact")
        if [ "$lines" -eq "$_CONDOR_NPROCS" ]
        then
            done=1
            # Fetch each node's identity key, then delete it from the
            # submit-side spool — it is no longer needed there.
            node=0
            while [ $node -ne $_CONDOR_NPROCS ]
            do
                $CONDOR_CHIRP fetch "$_CONDOR_REMOTE_SPOOL_DIR/$node.key" "$_CONDOR_SCRATCH_DIR/tmp/$node.key"
                $CONDOR_CHIRP remove "$_CONDOR_REMOTE_SPOOL_DIR/$node.key"
                node=$((node + 1))
            done
            chmod 0700 "$_CONDOR_SCRATCH_DIR"/tmp/*.key
            # Erase the contact file from the spool directory, in case
            # this job is held and rescheduled
            $CONDOR_CHIRP remove "$_CONDOR_REMOTE_SPOOL_DIR/contact"
        else
            # Wait a second before polling again
            sleep 1
        fi
        # Timeout after polling 1200 times (about 20 minutes)
        count=$((count + 1))
        if [ $count -eq 1200 ]
        then
            exit 1
        fi
    done
fi
# We'll source in this file in the MPI startup scripts,
# so we can wait and sshd_cleanup over there as needed
#wait
#sshd_cleanup
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment