ShaiberAlon/marker_gene_workflow.sh

## marker_gene_workflow.sh
#!/bin/bash

# CALL THIS SCRIPT WITH A PROJECT NAME PARAMETER

set -e

if [ "$#" -ne 1 ]; then
    echo "You need to give a project name as an argument to this script (and it better be a single, short and descriptive word without funny characters)."
    exit -1
fi

PROJECT_NAME=$1

if [ ! -f "samples.txt"  ]
then
    echo "
        You must have a samples.txt in this directory. It should simply contain
        all the sample names.
        "
    exit 1
fi

CHECK_FILE () {
    if [ -s $1 ]
    then
        continue
    else
        echo "Error: Expected to find $1, but it is not there!"
        exit 1
    fi
}

RUN_MED () {
    if [ -s "metadata.txt" ]
    then
        clusterize -n 10 decompose $PROJECT_NAME.fa -o $PROJECT_NAME-MED --number-of-threads 10 --sample-mapping metadata.txt
    else
        clusterize -n 10 decompose $PROJECT_NAME.fa -o $PROJECT_NAME-MED --number-of-threads 10
    fi
    /groups/merenlab/00_RESOURCES/wait_for_cluster.py decompose

    CHECK_FILE $PROJECT_NAME-MED/NODE-REPRESENTATIVES.fasta
    TMP_FILE="$PROJECT_NAME-MED/NODE-REPRESENTATIVES-TMP.fa"
    cp $PROJECT_NAME-MED/NODE-REPRESENTATIVES.fasta $TMP_FILE
    sed -i 's/|.*$//g' $TMP_FILE
    sed -i 's/-//g' $TMP_FILE
    clusterize -n 4 gast -in $TMP_FILE -ref /xraid2-2/g454/blastdbs/gast_distributions/refssu.fa -rtax /xraid2-2/g454/blastdbs/gast_distributions/refssu.tax -out $PROJECT_NAME-MED/NODE-REPRESENTATIVES.gast -termg
    /groups/merenlab/00_RESOURCES/wait_for_cluster.py gast
    CHECK_FILE $PROJECT_NAME-MED/NODE-REPRESENTATIVES.gast
    rm -rf $TMP_FILE

    python /groups/merenlab/00_RESOURCES/create_taxonomy_matrices.py $PROJECT_NAME-MED
    CHECK_FILE $PROJECT_NAME-MED/TAXONOMY-GENUS.txt
}

LOOP_OVER_SAMPLES () {
    # $1; command to be executed per sample
    # $2; file name to be found once the cluster is done with the job
    # $3; wait_for_cluster keyword

    # run the command
    for sample in `cat samples.txt`
    do
       eval $1
    done

    # wait for the cluster to be done
    if [ -z $3 ]
    then
        continue
    else
        /groups/merenlab/00_RESOURCES/wait_for_cluster.py $3
    fi

    # if there is an expected output file, look for it
    if [ -z $2 ]
    then
        continue
    else
        for sample in `cat samples.txt`
        do
            fname=$(eval "echo $2")
            if [ -s $fname ]
            then
                continue
            else
                echo "Error: Expected to find $fname, but it is not there!"
                exit 1
            fi
        done
    fi
}

############### THINS START HERE ######################

LOOP_OVER_SAMPLES 'clusterize iu-merge-pairs $sample.ini --max-num-mismatches 2 --min-overlap-size 45' '"$sample"_MERGED' 'iu-merge-pairs'
LOOP_OVER_SAMPLES 'clusterize blastn -query "$sample"_MERGED -db /groups/merenlab/00_RESOURCES/phiX/phix.fa -outfmt 6 -out "$sample"_phix.b6 -perc_identity 90 -evalue 0.000001' '"$sample"_phix.b6' 'blastn'
LOOP_OVER_SAMPLES 'clusterize /groups/merenlab/00_RESOURCES/phiX/remove_phix.py "$sample"_MERGED "$sample"_phix.b6' '"$sample"_MERGED_FINAL' 'remove_phix.py'
LOOP_OVER_SAMPLES 'mv "$sample"_MERGED_FINAL $sample.fa'

for sample in `cat samples.txt`; do sed -i 's/|.*$//g' $sample.fa; done

#LOOP_OVER_SAMPLES 'clusterize -n 10 gast -in $sample.fa -ref /xraid2-2/g454/blastdbs/gast_distributions/refssu.fa -rtax /xraid2-2/g454/blastdbs/gast_distributions/refssu.tax -out $sample.gast -termg' '$sample.gast' 'gast'
#LOOP_OVER_SAMPLES 'clusterize /groups/merenlab/00_RESOURCES/annotate_fasta_with_gast.py -i $sample.fa -g $sample.gast --overwrite-input' '$sample.fa' 'annotate_fasta_with_gast.py'

rm -rf $PROJECT_NAME.fa
for sample in `cat samples.txt`; do cat $sample.fa >> $PROJECT_NAME.fa; done
o-get-sample-info-from-fasta $PROJECT_NAME.fa > $PROJECT_NAME.info

clusterize o-pad-with-gaps $PROJECT_NAME.fa -o $PROJECT_NAME-PADDED.fa
/groups/merenlab/00_RESOURCES/wait_for_cluster.py o-pad-with-gaps
mv $PROJECT_NAME-PADDED.fa $PROJECT_NAME.fa

RUN_MED

# prepare taxonomy files archive for easy access
rm -rf $PROJECT_NAME-MED-TAXONOMY
mkdir $PROJECT_NAME-MED-TAXONOMY
cp $PROJECT_NAME-MED/TAXONOMY* $PROJECT_NAME-MED-TAXONOMY
tar -zcf $PROJECT_NAME-MED-TAXONOMY.tar.gz $PROJECT_NAME-MED-TAXONOMY/
rm -rf $PROJECT_NAME-MED-TAXONOMY


# clean crap
rm -rf *_MERGED *_MERGED_FINAL *_FAILED* *_MISMATCHES_B* *_phix.b6
rm -rf mothur.* gast.log *.names *.unique.fa

## TOTAL RESET
#rm -rf *_MER* *_MIS* *_phix.b6* *_FAIL* *_STATS*
	#!/bin/bash

	# CALL THIS SCRIPT WITH A PROJECT NAME PARAMETER

	set -e

	if [ "$#" -ne 1 ]; then
	echo "You need to give a project name as an argument to this script (and it better be a single, short and descriptive word without funny characters)."
	exit -1
	fi

	PROJECT_NAME=$1

	if [ ! -f "samples.txt" ]
	then
	echo "
	You must have a samples.txt in this directory. It should simply contain
	all the sample names.
	"
	exit 1
	fi

	CHECK_FILE () {
	if [ -s $1 ]
	then
	continue
	else
	echo "Error: Expected to find $1, but it is not there!"
	exit 1
	fi
	}

	RUN_MED () {
	if [ -s "metadata.txt" ]
	then
	clusterize -n 10 decompose $PROJECT_NAME.fa -o $PROJECT_NAME-MED --number-of-threads 10 --sample-mapping metadata.txt
	else
	clusterize -n 10 decompose $PROJECT_NAME.fa -o $PROJECT_NAME-MED --number-of-threads 10
	fi
	/groups/merenlab/00_RESOURCES/wait_for_cluster.py decompose

	CHECK_FILE $PROJECT_NAME-MED/NODE-REPRESENTATIVES.fasta
	TMP_FILE="$PROJECT_NAME-MED/NODE-REPRESENTATIVES-TMP.fa"
	cp $PROJECT_NAME-MED/NODE-REPRESENTATIVES.fasta $TMP_FILE
	sed -i 's/\|.*$//g' $TMP_FILE
	sed -i 's/-//g' $TMP_FILE
	clusterize -n 4 gast -in $TMP_FILE -ref /xraid2-2/g454/blastdbs/gast_distributions/refssu.fa -rtax /xraid2-2/g454/blastdbs/gast_distributions/refssu.tax -out $PROJECT_NAME-MED/NODE-REPRESENTATIVES.gast -termg
	/groups/merenlab/00_RESOURCES/wait_for_cluster.py gast
	CHECK_FILE $PROJECT_NAME-MED/NODE-REPRESENTATIVES.gast
	rm -rf $TMP_FILE

	python /groups/merenlab/00_RESOURCES/create_taxonomy_matrices.py $PROJECT_NAME-MED
	CHECK_FILE $PROJECT_NAME-MED/TAXONOMY-GENUS.txt
	}

	LOOP_OVER_SAMPLES () {
	# $1; command to be executed per sample
	# $2; file name to be found once the cluster is done with the job
	# $3; wait_for_cluster keyword

	# run the command
	for sample in `cat samples.txt`
	do
	eval $1
	done

	# wait for the cluster to be done
	if [ -z $3 ]
	then
	continue
	else
	/groups/merenlab/00_RESOURCES/wait_for_cluster.py $3
	fi

	# if there is an expected output file, look for it
	if [ -z $2 ]
	then
	continue
	else
	for sample in `cat samples.txt`
	do
	fname=$(eval "echo $2")
	if [ -s $fname ]
	then
	continue
	else
	echo "Error: Expected to find $fname, but it is not there!"
	exit 1
	fi
	done
	fi
	}

	############### THINS START HERE ######################

	LOOP_OVER_SAMPLES 'clusterize iu-merge-pairs $sample.ini --max-num-mismatches 2 --min-overlap-size 45' '"$sample"_MERGED' 'iu-merge-pairs'
	LOOP_OVER_SAMPLES 'clusterize blastn -query "$sample"_MERGED -db /groups/merenlab/00_RESOURCES/phiX/phix.fa -outfmt 6 -out "$sample"_phix.b6 -perc_identity 90 -evalue 0.000001' '"$sample"_phix.b6' 'blastn'
	LOOP_OVER_SAMPLES 'clusterize /groups/merenlab/00_RESOURCES/phiX/remove_phix.py "$sample"_MERGED "$sample"_phix.b6' '"$sample"_MERGED_FINAL' 'remove_phix.py'
	LOOP_OVER_SAMPLES 'mv "$sample"_MERGED_FINAL $sample.fa'

	for sample in `cat samples.txt`; do sed -i 's/\|.*$//g' $sample.fa; done

	#LOOP_OVER_SAMPLES 'clusterize -n 10 gast -in $sample.fa -ref /xraid2-2/g454/blastdbs/gast_distributions/refssu.fa -rtax /xraid2-2/g454/blastdbs/gast_distributions/refssu.tax -out $sample.gast -termg' '$sample.gast' 'gast'
	#LOOP_OVER_SAMPLES 'clusterize /groups/merenlab/00_RESOURCES/annotate_fasta_with_gast.py -i $sample.fa -g $sample.gast --overwrite-input' '$sample.fa' 'annotate_fasta_with_gast.py'

	rm -rf $PROJECT_NAME.fa
	for sample in `cat samples.txt`; do cat $sample.fa >> $PROJECT_NAME.fa; done
	o-get-sample-info-from-fasta $PROJECT_NAME.fa > $PROJECT_NAME.info

	clusterize o-pad-with-gaps $PROJECT_NAME.fa -o $PROJECT_NAME-PADDED.fa
	/groups/merenlab/00_RESOURCES/wait_for_cluster.py o-pad-with-gaps
	mv $PROJECT_NAME-PADDED.fa $PROJECT_NAME.fa

	RUN_MED

	# prepare taxonomy files archive for easy access
	rm -rf $PROJECT_NAME-MED-TAXONOMY
	mkdir $PROJECT_NAME-MED-TAXONOMY
	cp $PROJECT_NAME-MED/TAXONOMY* $PROJECT_NAME-MED-TAXONOMY
	tar -zcf $PROJECT_NAME-MED-TAXONOMY.tar.gz $PROJECT_NAME-MED-TAXONOMY/
	rm -rf $PROJECT_NAME-MED-TAXONOMY


	# clean crap
	rm -rf _MERGED _MERGED_FINAL _FAILED _MISMATCHES_B *_phix.b6
	rm -rf mothur.* gast.log .names .unique.fa

	## TOTAL RESET
	#rm -rf _MER _MIS _phix.b6 _FAIL _STATS