Last active
February 1, 2018 18:50
-
-
Save ShaiberAlon/2c698b63c35c492e8fa8d6e667cac6d4 to your computer and use it in GitHub Desktop.
marker_gene_workflow.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# CALL THIS SCRIPT WITH A PROJECT NAME PARAMETER | |
set -e | |
if [ "$#" -ne 1 ]; then | |
echo "You need to give a project name as an argument to this script (and it better be a single, short and descriptive word without funny characters)." | |
exit -1 | |
fi | |
PROJECT_NAME=$1 | |
if [ ! -f "samples.txt" ] | |
then | |
echo " | |
You must have a samples.txt in this directory. It should simply contain | |
all the sample names. | |
" | |
exit 1 | |
fi | |
CHECK_FILE () { | |
if [ -s $1 ] | |
then | |
continue | |
else | |
echo "Error: Expected to find $1, but it is not there!" | |
exit 1 | |
fi | |
} | |
RUN_MED () { | |
if [ -s "metadata.txt" ] | |
then | |
clusterize -n 10 decompose $PROJECT_NAME.fa -o $PROJECT_NAME-MED --number-of-threads 10 --sample-mapping metadata.txt | |
else | |
clusterize -n 10 decompose $PROJECT_NAME.fa -o $PROJECT_NAME-MED --number-of-threads 10 | |
fi | |
/groups/merenlab/00_RESOURCES/wait_for_cluster.py decompose | |
CHECK_FILE $PROJECT_NAME-MED/NODE-REPRESENTATIVES.fasta | |
TMP_FILE="$PROJECT_NAME-MED/NODE-REPRESENTATIVES-TMP.fa" | |
cp $PROJECT_NAME-MED/NODE-REPRESENTATIVES.fasta $TMP_FILE | |
sed -i 's/|.*$//g' $TMP_FILE | |
sed -i 's/-//g' $TMP_FILE | |
clusterize -n 4 gast -in $TMP_FILE -ref /xraid2-2/g454/blastdbs/gast_distributions/refssu.fa -rtax /xraid2-2/g454/blastdbs/gast_distributions/refssu.tax -out $PROJECT_NAME-MED/NODE-REPRESENTATIVES.gast -termg | |
/groups/merenlab/00_RESOURCES/wait_for_cluster.py gast | |
CHECK_FILE $PROJECT_NAME-MED/NODE-REPRESENTATIVES.gast | |
rm -rf $TMP_FILE | |
python /groups/merenlab/00_RESOURCES/create_taxonomy_matrices.py $PROJECT_NAME-MED | |
CHECK_FILE $PROJECT_NAME-MED/TAXONOMY-GENUS.txt | |
} | |
LOOP_OVER_SAMPLES () { | |
# $1; command to be executed per sample | |
# $2; file name to be found once the cluster is done with the job | |
# $3; wait_for_cluster keyword | |
# run the command | |
for sample in `cat samples.txt` | |
do | |
eval $1 | |
done | |
# wait for the cluster to be done | |
if [ -z $3 ] | |
then | |
continue | |
else | |
/groups/merenlab/00_RESOURCES/wait_for_cluster.py $3 | |
fi | |
# if there is an expected output file, look for it | |
if [ -z $2 ] | |
then | |
continue | |
else | |
for sample in `cat samples.txt` | |
do | |
fname=$(eval "echo $2") | |
if [ -s $fname ] | |
then | |
continue | |
else | |
echo "Error: Expected to find $fname, but it is not there!" | |
exit 1 | |
fi | |
done | |
fi | |
} | |
############### THINS START HERE ###################### | |
LOOP_OVER_SAMPLES 'clusterize iu-merge-pairs $sample.ini --max-num-mismatches 2 --min-overlap-size 45' '"$sample"_MERGED' 'iu-merge-pairs' | |
LOOP_OVER_SAMPLES 'clusterize blastn -query "$sample"_MERGED -db /groups/merenlab/00_RESOURCES/phiX/phix.fa -outfmt 6 -out "$sample"_phix.b6 -perc_identity 90 -evalue 0.000001' '"$sample"_phix.b6' 'blastn' | |
LOOP_OVER_SAMPLES 'clusterize /groups/merenlab/00_RESOURCES/phiX/remove_phix.py "$sample"_MERGED "$sample"_phix.b6' '"$sample"_MERGED_FINAL' 'remove_phix.py' | |
LOOP_OVER_SAMPLES 'mv "$sample"_MERGED_FINAL $sample.fa' | |
for sample in `cat samples.txt`; do sed -i 's/|.*$//g' $sample.fa; done | |
#LOOP_OVER_SAMPLES 'clusterize -n 10 gast -in $sample.fa -ref /xraid2-2/g454/blastdbs/gast_distributions/refssu.fa -rtax /xraid2-2/g454/blastdbs/gast_distributions/refssu.tax -out $sample.gast -termg' '$sample.gast' 'gast' | |
#LOOP_OVER_SAMPLES 'clusterize /groups/merenlab/00_RESOURCES/annotate_fasta_with_gast.py -i $sample.fa -g $sample.gast --overwrite-input' '$sample.fa' 'annotate_fasta_with_gast.py' | |
rm -rf $PROJECT_NAME.fa | |
for sample in `cat samples.txt`; do cat $sample.fa >> $PROJECT_NAME.fa; done | |
o-get-sample-info-from-fasta $PROJECT_NAME.fa > $PROJECT_NAME.info | |
clusterize o-pad-with-gaps $PROJECT_NAME.fa -o $PROJECT_NAME-PADDED.fa | |
/groups/merenlab/00_RESOURCES/wait_for_cluster.py o-pad-with-gaps | |
mv $PROJECT_NAME-PADDED.fa $PROJECT_NAME.fa | |
RUN_MED | |
# prepare taxonomy files archive for easy access | |
rm -rf $PROJECT_NAME-MED-TAXONOMY | |
mkdir $PROJECT_NAME-MED-TAXONOMY | |
cp $PROJECT_NAME-MED/TAXONOMY* $PROJECT_NAME-MED-TAXONOMY | |
tar -zcf $PROJECT_NAME-MED-TAXONOMY.tar.gz $PROJECT_NAME-MED-TAXONOMY/ | |
rm -rf $PROJECT_NAME-MED-TAXONOMY | |
# clean crap | |
rm -rf *_MERGED *_MERGED_FINAL *_FAILED* *_MISMATCHES_B* *_phix.b6 | |
rm -rf mothur.* gast.log *.names *.unique.fa | |
## TOTAL RESET | |
#rm -rf *_MER* *_MIS* *_phix.b6* *_FAIL* *_STATS* |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment