Skip to content

Instantly share code, notes, and snippets.

@ShaiberAlon
Last active February 1, 2018 18:50
Show Gist options
  • Save ShaiberAlon/2c698b63c35c492e8fa8d6e667cac6d4 to your computer and use it in GitHub Desktop.
Save ShaiberAlon/2c698b63c35c492e8fa8d6e667cac6d4 to your computer and use it in GitHub Desktop.
marker_gene_workflow.sh
#!/bin/bash
# CALL THIS SCRIPT WITH A PROJECT NAME PARAMETER
set -e
if [ "$#" -ne 1 ]; then
echo "You need to give a project name as an argument to this script (and it better be a single, short and descriptive word without funny characters)."
exit -1
fi
PROJECT_NAME=$1
if [ ! -f "samples.txt" ]
then
echo "
You must have a samples.txt in this directory. It should simply contain
all the sample names.
"
exit 1
fi
CHECK_FILE () {
if [ -s $1 ]
then
continue
else
echo "Error: Expected to find $1, but it is not there!"
exit 1
fi
}
RUN_MED () {
if [ -s "metadata.txt" ]
then
clusterize -n 10 decompose $PROJECT_NAME.fa -o $PROJECT_NAME-MED --number-of-threads 10 --sample-mapping metadata.txt
else
clusterize -n 10 decompose $PROJECT_NAME.fa -o $PROJECT_NAME-MED --number-of-threads 10
fi
/groups/merenlab/00_RESOURCES/wait_for_cluster.py decompose
CHECK_FILE $PROJECT_NAME-MED/NODE-REPRESENTATIVES.fasta
TMP_FILE="$PROJECT_NAME-MED/NODE-REPRESENTATIVES-TMP.fa"
cp $PROJECT_NAME-MED/NODE-REPRESENTATIVES.fasta $TMP_FILE
sed -i 's/|.*$//g' $TMP_FILE
sed -i 's/-//g' $TMP_FILE
clusterize -n 4 gast -in $TMP_FILE -ref /xraid2-2/g454/blastdbs/gast_distributions/refssu.fa -rtax /xraid2-2/g454/blastdbs/gast_distributions/refssu.tax -out $PROJECT_NAME-MED/NODE-REPRESENTATIVES.gast -termg
/groups/merenlab/00_RESOURCES/wait_for_cluster.py gast
CHECK_FILE $PROJECT_NAME-MED/NODE-REPRESENTATIVES.gast
rm -rf $TMP_FILE
python /groups/merenlab/00_RESOURCES/create_taxonomy_matrices.py $PROJECT_NAME-MED
CHECK_FILE $PROJECT_NAME-MED/TAXONOMY-GENUS.txt
}
LOOP_OVER_SAMPLES () {
# $1; command to be executed per sample
# $2; file name to be found once the cluster is done with the job
# $3; wait_for_cluster keyword
# run the command
for sample in `cat samples.txt`
do
eval $1
done
# wait for the cluster to be done
if [ -z $3 ]
then
continue
else
/groups/merenlab/00_RESOURCES/wait_for_cluster.py $3
fi
# if there is an expected output file, look for it
if [ -z $2 ]
then
continue
else
for sample in `cat samples.txt`
do
fname=$(eval "echo $2")
if [ -s $fname ]
then
continue
else
echo "Error: Expected to find $fname, but it is not there!"
exit 1
fi
done
fi
}
############### THINS START HERE ######################
LOOP_OVER_SAMPLES 'clusterize iu-merge-pairs $sample.ini --max-num-mismatches 2 --min-overlap-size 45' '"$sample"_MERGED' 'iu-merge-pairs'
LOOP_OVER_SAMPLES 'clusterize blastn -query "$sample"_MERGED -db /groups/merenlab/00_RESOURCES/phiX/phix.fa -outfmt 6 -out "$sample"_phix.b6 -perc_identity 90 -evalue 0.000001' '"$sample"_phix.b6' 'blastn'
LOOP_OVER_SAMPLES 'clusterize /groups/merenlab/00_RESOURCES/phiX/remove_phix.py "$sample"_MERGED "$sample"_phix.b6' '"$sample"_MERGED_FINAL' 'remove_phix.py'
LOOP_OVER_SAMPLES 'mv "$sample"_MERGED_FINAL $sample.fa'
for sample in `cat samples.txt`; do sed -i 's/|.*$//g' $sample.fa; done
#LOOP_OVER_SAMPLES 'clusterize -n 10 gast -in $sample.fa -ref /xraid2-2/g454/blastdbs/gast_distributions/refssu.fa -rtax /xraid2-2/g454/blastdbs/gast_distributions/refssu.tax -out $sample.gast -termg' '$sample.gast' 'gast'
#LOOP_OVER_SAMPLES 'clusterize /groups/merenlab/00_RESOURCES/annotate_fasta_with_gast.py -i $sample.fa -g $sample.gast --overwrite-input' '$sample.fa' 'annotate_fasta_with_gast.py'
rm -rf $PROJECT_NAME.fa
for sample in `cat samples.txt`; do cat $sample.fa >> $PROJECT_NAME.fa; done
o-get-sample-info-from-fasta $PROJECT_NAME.fa > $PROJECT_NAME.info
clusterize o-pad-with-gaps $PROJECT_NAME.fa -o $PROJECT_NAME-PADDED.fa
/groups/merenlab/00_RESOURCES/wait_for_cluster.py o-pad-with-gaps
mv $PROJECT_NAME-PADDED.fa $PROJECT_NAME.fa
RUN_MED
# prepare taxonomy files archive for easy access
rm -rf $PROJECT_NAME-MED-TAXONOMY
mkdir $PROJECT_NAME-MED-TAXONOMY
cp $PROJECT_NAME-MED/TAXONOMY* $PROJECT_NAME-MED-TAXONOMY
tar -zcf $PROJECT_NAME-MED-TAXONOMY.tar.gz $PROJECT_NAME-MED-TAXONOMY/
rm -rf $PROJECT_NAME-MED-TAXONOMY
# clean crap
rm -rf *_MERGED *_MERGED_FINAL *_FAILED* *_MISMATCHES_B* *_phix.b6
rm -rf mothur.* gast.log *.names *.unique.fa
## TOTAL RESET
#rm -rf *_MER* *_MIS* *_phix.b6* *_FAIL* *_STATS*
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment