Created
June 8, 2023 16:37
-
-
Save janxkoci/5655d37158d7a2f78177acce1e752bed to your computer and use it in GitHub Desktop.
rerun failed legofit jobs from job array at MetaCentrum (PBS Pro)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#PBS -l select=1:ncpus=64:mem=1gb,walltime=24:00:00
##
## Run a single legofit task of a PBS job array.
##
## Variables expected from `qsub -v`:
##   datdir  directory (relative to the submit dir) holding the tasklist(s)
##           and receiving the outputs under $datdir/$model/
##   model   model name; ${model}.lgo is read from the submit dir
##   stage   legofit stage, one of 1, 2, 3, 4
##   rerun   optional; name of a custom tasklist inside $datdir that lists
##           only the tasks to rerun (empty/unset -> use "tasklist")
## Provided by PBS: PBS_O_WORKDIR, PBS_ARRAY_INDEX.
##
## Example:
##   qsub -J 1-50 -v datdir=hcg2_atgc,model=fg03v,stage=1 <job script>

## SANITY CHECKS
# Fail early with a clear message instead of running with empty paths.
: "${PBS_O_WORKDIR:?not set - this script must run as a PBS job}"
: "${PBS_ARRAY_INDEX:?not set - submit as a job array (qsub -J)}"
: "${datdir:?not set - pass it with qsub -v datdir=...}"
: "${model:?not set - pass it with qsub -v model=...}"
: "${stage:?not set - pass it with qsub -v stage=...}"

## GO TO WORKDIR
cd "$PBS_O_WORKDIR" || exit

## EXPORT PATH
export PATH="/storage/brno2/home/jena/bin:$PATH"
export PATH="/storage/brno2/home/jena/miniconda3/bin:$PATH"

## LOAD GSL module
#ml GSL/2.7-intel-compilers-2021.4.0 # IT4I
module add gsl/gsl-2.5-intel-19.0.4-lnarr3s #gsl/2.5-intel-17.0.1 # MetaCentrum

## MODEL
# -p: every task of the array runs this line, so the directory usually
# exists already; plain mkdir would print an error for all but the first.
mkdir -p "$datdir/$model" || exit

## DATA / BOOTSTRAP
# get individual tasks from tasklist with index from PBS JOB ARRAY;
# a non-empty $rerun selects the custom tasklist of failed jobs instead
tasklist="$PBS_O_WORKDIR/$datdir/${rerun:-tasklist}"
TASK=$(sed -n "${PBS_ARRAY_INDEX}p" "$tasklist")
if [ -z "$TASK" ]; then
  echo "No task on line ${PBS_ARRAY_INDEX} of ${tasklist}" >&2
  exit 1
fi

boot=$(basename -s .opf "$TASK")
outpref=${datdir}/${model}/${model}_${stage}_${boot}

# copy input file and executable to scratch
# cp $PBS_O_WORKDIR/$TASK input ; cp $PBS_O_WORKDIR/myprog.x .

## LEGOFIT
# execute the calculation
# ./myprog.x < input > output
case "$stage" in
  ## STAGE 1 or 3
  1 | 3)
    legofit -t 64 -1 -S 5000@10000 -S 1000@100000 -S 1000@1000000 \
      --stateOut "${outpref}.state" \
      "${model}.lgo" "$datdir/$TASK" \
      > "${outpref}.legofit"
    # --stateOut ${model}/${model}_${stage}_${boot}.state \
    # > ${model}/${model}_${stage}_${boot}.legofit
    ;;

  ## STAGE 2 or 4
  ## FIXME
  # NOTE(review): unlike stages 1/3, the paths below are not prefixed with
  # $datdir, and -t 80 exceeds the ncpus=64 requested above - confirm
  # before relying on this branch.
  2 | 4)
    prevstage=$((stage - 1))
    stateInName=${model}/${model}_${prevstage}
    # One --stateIn per state file of the previous stage: the real data
    # first, then bootstrap replicates 0..49.
    state_args=(--stateIn "${stateInName}_data.state")
    for i in {0..49}; do
      state_args+=(--stateIn "${stateInName}_boot${i}.state")
    done
    legofit -t 80 -1 --tol 2e-5 -S 1000@2000000 \
      "${model}.lgo" "data/${boot}.opf" \
      "${state_args[@]}" \
      > "${model}/${model}_${stage}_${boot}.legofit"
    ;;

  *)
    echo "Wrong stage" >&2
    exit 255
    ;;
esac

# copy output file to submit directory
#cp output $PBS_O_WORKDIR/$TASK.out
# --stateIn ${stateInName}_boot1.state --stateIn ${stateInName}_boot2.state --stateIn ${stateInName}_boot3.state --stateIn ${stateInName}_boot4.state --stateIn ${stateInName}_boot5.state --stateIn ${stateInName}_boot6.state --stateIn ${stateInName}_boot7.state --stateIn ${stateInName}_boot8.state --stateIn ${stateInName}_boot9.state --stateIn ${stateInName}_boot10.state --stateIn ${stateInName}_boot11.state --stateIn ${stateInName}_boot12.state --stateIn ${stateInName}_boot13.state --stateIn ${stateInName}_boot14.state --stateIn ${stateInName}_boot15.state --stateIn ${stateInName}_boot16.state --stateIn ${stateInName}_boot17.state --stateIn ${stateInName}_boot18.state --stateIn ${stateInName}_boot19.state --stateIn ${stateInName}_boot20.state --stateIn ${stateInName}_boot21.state --stateIn ${stateInName}_boot22.state --stateIn ${stateInName}_boot23.state --stateIn ${stateInName}_boot24.state --stateIn ${stateInName}_boot25.state --stateIn ${stateInName}_boot26.state --stateIn ${stateInName}_boot27.state --stateIn ${stateInName}_boot28.state --stateIn ${stateInName}_boot29.state --stateIn ${stateInName}_boot30.state --stateIn ${stateInName}_boot31.state --stateIn ${stateInName}_boot32.state --stateIn ${stateInName}_boot33.state --stateIn ${stateInName}_boot34.state --stateIn ${stateInName}_boot35.state --stateIn ${stateInName}_boot36.state --stateIn ${stateInName}_boot37.state --stateIn ${stateInName}_boot38.state --stateIn ${stateInName}_boot39.state --stateIn ${stateInName}_boot40.state --stateIn ${stateInName}_boot41.state --stateIn ${stateInName}_boot42.state --stateIn ${stateInName}_boot43.state --stateIn ${stateInName}_boot44.state --stateIn ${stateInName}_boot45.state --stateIn ${stateInName}_boot46.state --stateIn ${stateInName}_boot47.state --stateIn ${stateInName}_boot48.state --stateIn ${stateInName}_boot49.state \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
## Find legofit outputs that did not finish and resubmit them as a job array.
##
## Usage: bash rerun_failed_legofit.sh hcg2_tv/1Bc
##
## The single argument is <datdir>/<model>, the directory holding the
## *.legofit outputs. An output with zero lines matching "DiffEv" is treated
## as a failed job. For each such file the matching input ("data/<boot>.opf")
## is written to <datdir>/<model>_<stage>.tasklist, and that tasklist is
## submitted as a new job array running legofit_jobarray.sh.
if [ $# -lt 1 ] || [ -z "$1" ]; then
  echo "usage: bash rerun_failed_legofit.sh <datdir>/<model>" >&2
  exit 2
fi
path=$1 # hcg2_tv/1Bc/ (data/model)

# Collect outputs in an array; nullglob makes a non-matching glob expand to
# nothing instead of staying a literal "*.legofit".
shopt -s nullglob
files=("$path"/*.legofit)
shopt -u nullglob
if [ "${#files[@]}" -eq 0 ]; then
  echo "no .legofit files found in ${path}" >&2
  exit 1
fi

## GET PARAMS
IFS=/ read -r datdir model _ <<< "$path"
if [ -z "$datdir" ] || [ -z "$model" ]; then
  echo "check your path!" >&2
  exit 1
fi
#echo $datdir $model

# grep -c prints "<file>:<count>"; -H forces the file name even when there is
# only one file (without it awk would see no second field at all).
# The gawk program writes the tasklist as a side effect and prints the stage
# parsed from the failed file names (the last one seen) on stdout.
# NOTE(review): file names are split on "_" as <model>_<stage>_<boot>.legofit,
# so model or bootstrap names containing "_" will be mis-parsed.
stage=$(grep -cH DiffEv -- "${files[@]}" \
  | gawk -F":" '
    $2 == 0 {
      nparts = split($1, arr, "/+") # make sure there really are 3 parts
      if (nparts != 3) {
        # stderr, so the message is not captured into $stage
        print "check your path!" > "/dev/stderr"
        bad = 1
        exit 1
      }
      data = arr[1]
      model = arr[2]
      file = arr[3]
      split(file, arr2, "_")
      stage = arr2[2]
      opf = gensub(/legofit/, "opf", 1, arr2[3])
      ## format: data/boot1.opf > hcg2_tv/1Bc_1.tasklist
      print "data/" opf > (data "/" model "_" stage ".tasklist")
    }
    END {
      # END still runs after "exit" in the rule above; stay silent then
      if (!bad)
        print stage
    }
  ') || exit 1

if [ -z "$stage" ]; then
  echo "no failed legofit jobs found in ${path}" >&2
  exit 0
fi

rerun=${model}_${stage}.tasklist
#echo $rerun

if [ -f "${datdir}/${rerun}" ]; then
  # number of lines in the tasklist = number of array tasks
  tasks=$(grep -c '' "${datdir}/${rerun}")
  ## SUBMIT LEGOFIT as job array
  ## qsub -A Project_ID -q queue -l select=x:ncpus=y,walltime=[[hh:]mm:]ss[.ms] jobscript
  qsub -N legofit \
    -J "1-${tasks}" \
    -o "$HOME/qlogs" \
    -e "$HOME/qlogs" \
    -v "datdir=${datdir},model=${model},stage=${stage},rerun=${rerun}" \
    legofit_jobarray.sh
  # -W depend=afterany:12623839.meta-pbs.metacentrum.cz \
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Story
These scripts are for rerunning failed legofit jobs at MetaCentrum. MetaCentrum is a Czech national grid of HPC clusters for scientific computing, and legofit is a package for demographic inference we use to study human evolutionary history.
Legofit is typically run with bootstrap replicates, and it's best to submit these as job arrays, using e.g. `qsub -J 1-50` in PBS Pro / OpenPBS, the scheduler at MetaCentrum. But due to technical issues at MetaCentrum, some of my jobs failed to finish. These scripts detect which jobs did not finish and resubmit them as new job arrays.

The scripts
- `legofit_jobscript.sh` - This was my usual script for running `legofit` as job arrays. The only new part is the `if [ "$rerun" ]` block that detects whether the current job should run as a full new job array from the standard tasklist or rerun failed jobs from a custom tasklist created by the second script.
- `rerun_failed_legofit.sh` - an absolute hack of a script that takes one argument - a partial path to input data and outputs - and figures out if there are any failed jobs, prepares a list of inputs (data and/or bootstrap replicates) and submits a new job array with the correct data, model, and stage of legofit analysis. The main work is done within a `gawk` script that is brutally grafted into the shell script. The rest of the script was mostly copied from my normal script for submitting such arrays.

Take home message