# ConstantinoSchillebeeckx/vsearch_pick_open_reference_otus.sh

## vsearch_pick_open_reference_otus.sh
#!/bin/bash
#
# Open-reference OTU picking driven by vsearch.
# Reads ./seqs.fna and writes every intermediate and final file into the
# current working directory (step1_otus/ ... step4_otus/ plus top-level files).

echo "Vsearch started: $(date)"
echo ""

# generate all the proper directories (-p: not an error if they already exist)
mkdir -p step1_otus step2_otus step3_otus step4_otus

# must sort because searching done greedily
# see http://drive5.com/usearch/manual/uparseotu_algo.html
vsearch --fasta_width 0 --sortbysize seqs.fna --output seqs_sorted.fna

# dereplicate reads; --sizeout writes the abundance of each unique sequence
# into its header, --minuniquesize 1 keeps singletons
vsearch --derep_fulllength seqs_sorted.fna --output seqs_sorted_derep.fna \
    --minuniquesize 1 --fasta_width 0 --sizeout

# STEP 1
# search against Green Genes to generate closed ref OTUs centroids
# NOTE: the reference database is site-specific; export VSEARCH_REF_DB to point
#       at your own copy, otherwise the default path below is used
# NOTE: this step assumes the sequences associated with the --db will be used as the representative sequences - this is different than default QIIME behavior (see https://groups.google.com/d/msg/qiime-forum/GKVZbG-Lf_s/14HtQWkcBQAJ)
# remember to set SET MAX_REJECTS, etc
VSEARCH_REF_DB="${VSEARCH_REF_DB:-/home/data_repo/pre_processing/otu_support_files/denovo_green_genes/97/rep_set/gg_13_5_pynast_left_2264_right_4052_rep_set.fasta}"

# Reads that hit the database at >= 97% identity contribute their database
# sequence to --dbmatched; reads without a hit go to --notmatched.
vsearch --fasta_width 0 --usearch_global seqs_sorted_derep.fna --threads 0 \
    --dbmask none --qmask none --id 0.97 --top_hits_only \
    --notmatched step2_otus/closed_ref_fail.fna \
    --db "$VSEARCH_REF_DB" \
    --dbmatched step1_otus/closed_ref_centroids_db.fna \
    --notrunclabels --maxaccepts 50 --maxrejects 50 --iddef 4


# Steps 2-4 only run when step 1 left reads that failed to hit the reference
# database (i.e. the failure file exists and is non-empty).
if [[ -s step2_otus/closed_ref_fail.fna ]]; then
    # STEP 2
    # randomly subsample 10% of failed closed ref reads
    # rename reads as New.ReferenceOTU
    # this will already be sorted by abundance since the input is sorted
    vsearch --fasta_width 0 --fastx_subsample step2_otus/closed_ref_fail.fna \
        --fastaout step2_otus/closed_ref_fail_subsample.fna \
        --sample_pct 10 --relabel New.ReferenceOTU --notrunclabels --relabel_keep

    # denovo cluster failed closed ref subsample reads
    # this will serve as the reference database for new ref OTU
    vsearch --fasta_width 0 --cluster_size step2_otus/closed_ref_fail_subsample.fna \
        --clusterout_id --centroids step2_otus/new_ref_db.fna \
        --id 0.97 --qmask none --notrunclabels --iddef 4

    # STEP 3
    # search step 2 failures against new ref DB
    # hits to DB are New.ReferenceOTU
    # failures are considered for New.CleanupReferenceOTU
    vsearch --fasta_width 0 --usearch_global step2_otus/closed_ref_fail.fna --threads 0 \
        --dbmask none --qmask none --rowlen 0 --top_hits_only \
        --notmatched step3_otus/new_ref_fail.fna \
        --db step2_otus/new_ref_db.fna --id 0.97 \
        --dbmatched step3_otus/new_ref_centroids.fna \
        --notrunclabels --maxaccepts 50 --maxrejects 50 --iddef 4

    # STEP 4
    # denovo cluster of new ref failures
    # NOTE: QIIME has an option for skipping this step, see --suppress_step4 (http://qiime.org/scripts/pick_open_reference_otus.html)
    if [[ -s step3_otus/new_ref_fail.fna ]]; then
        # --centroids is spelled out in full (same option as in step 2)
        # rather than relying on option-name abbreviation.
        vsearch --fasta_width 0 --cluster_size step3_otus/new_ref_fail.fna \
            --clusterout_id --centroids step4_otus/new_ref_cleanup_centroids.fna \
            --id 0.97 --qmask none --relabel New.CleanupReferenceOTU \
            --notrunclabels --relabel_keep
    fi

fi

# cat all OTU centroid files together for final searching against input reads
#
# build_rep_set OUT FILE...
#   Rewrites OUT from scratch with the contents of every FILE that exists and
#   is non-empty, in the order given. Steps 2-4 are conditional, so their
#   centroid files may legitimately be absent; those are skipped silently
#   instead of producing `cat`/`rm` errors.
#   Returns non-zero if OUT cannot be written.
build_rep_set() {
    local out=$1
    shift
    local src
    # Truncate rather than rm + append so a first run (no previous rep set)
    # and a re-run behave the same.
    : > "$out" || return 1
    for src in "$@"; do
        if [[ -s "$src" ]]; then
            cat -- "$src" >> "$out" || return 1
        fi
    done
    return 0
}

build_rep_set rep_set.fna \
    step1_otus/closed_ref_centroids_db.fna \
    step3_otus/new_ref_centroids.fna \
    step4_otus/new_ref_cleanup_centroids.fna
# OR build_rep_set rep_set.fna step1_otus/closed_ref_centroids.fna step3_otus/new_ref_centroids.fna

# final search of all input reads to OTU centroids for use in generating OTU biom table
vsearch --fasta_width 0 --usearch_global seqs_sorted.fna --top_hits_only --threads 0 \
    --dbmask none --qmask none --db rep_set.fna --id 0.97 --uc final.uc \
    --maxaccepts 50 --maxrejects 50 --iddef 4

# Build the OTU table from the final search results (final.uc -> final.biom),
# then write a summary of that table to final.log.
biom from-uc -i final.uc -o final.biom
biom summarize-table -i final.biom -o final.log

# Closing banner with timestamp, followed by a blank line.
printf '%s\n' "Vsearch finished: $(date)"
printf '\n'


# Drop OTUs that occur in only one sample (-s 2) and, if anything survives,
# trim the representative set to match and build a tree from it.
ms2_table=final_ms2.biom
ms2_reps=rep_set_ms2.fna
align_dir=pynast_aligned_seqs

filter_otus_from_otu_table.py -i final.biom -o "$ms2_table" -s 2
if [[ -s "$ms2_table" ]]; then
    biom summarize-table -i "$ms2_table" -o final_ms2.log
    filter_fasta.py -f rep_set.fna -o "$ms2_reps" -b "$ms2_table"

    # Newick tree: align the filtered representatives, filter the
    # alignment, then build the phylogeny.
    parallel_align_seqs_pynast.py -i "$ms2_reps" -o "$align_dir" \
        -T --jobs_to_start 10 --min_length 75
    filter_alignment.py -i "$align_dir/rep_set_ms2_aligned.fasta" \
        -o "$align_dir/"
    make_phylogeny.py -i "$align_dir/rep_set_ms2_aligned_pfiltered.fasta" \
        -o rep_set.tre
fi
	#!/bin/bash
	#
	# Open-reference OTU picking driven by vsearch.
	# Reads ./seqs.fna and writes every intermediate and final file into the
	# current working directory (step1_otus/ ... step4_otus/ plus top-level files).

	echo "Vsearch started: $(date)"
	echo ""

	# generate all the proper directories (-p: not an error if they already exist)
	mkdir -p step1_otus step2_otus step3_otus step4_otus

	# must sort because searching done greedily
	# see http://drive5.com/usearch/manual/uparseotu_algo.html
	vsearch --fasta_width 0 --sortbysize seqs.fna --output seqs_sorted.fna

	# dereplicate reads; --sizeout writes the abundance of each unique sequence
	# into its header, --minuniquesize 1 keeps singletons
	vsearch --derep_fulllength seqs_sorted.fna --output seqs_sorted_derep.fna \
	    --minuniquesize 1 --fasta_width 0 --sizeout

	# STEP 1
	# search against Green Genes to generate closed ref OTUs centroids
	# NOTE: the reference database is site-specific; export VSEARCH_REF_DB to point
	#       at your own copy, otherwise the default path below is used
	# NOTE: this step assumes the sequences associated with the --db will be used as the representative sequences - this is different than default QIIME behavior (see https://groups.google.com/d/msg/qiime-forum/GKVZbG-Lf_s/14HtQWkcBQAJ)
	# remember to set SET MAX_REJECTS, etc
	VSEARCH_REF_DB="${VSEARCH_REF_DB:-/home/data_repo/pre_processing/otu_support_files/denovo_green_genes/97/rep_set/gg_13_5_pynast_left_2264_right_4052_rep_set.fasta}"

	# Reads that hit the database at >= 97% identity contribute their database
	# sequence to --dbmatched; reads without a hit go to --notmatched.
	vsearch --fasta_width 0 --usearch_global seqs_sorted_derep.fna --threads 0 \
	    --dbmask none --qmask none --id 0.97 --top_hits_only \
	    --notmatched step2_otus/closed_ref_fail.fna \
	    --db "$VSEARCH_REF_DB" \
	    --dbmatched step1_otus/closed_ref_centroids_db.fna \
	    --notrunclabels --maxaccepts 50 --maxrejects 50 --iddef 4


	# Steps 2-4 only run when step 1 left reads that failed to hit the reference
	# database (i.e. the failure file exists and is non-empty).
	if [[ -s step2_otus/closed_ref_fail.fna ]]; then
	    # STEP 2
	    # randomly subsample 10% of failed closed ref reads
	    # rename reads as New.ReferenceOTU
	    # this will already be sorted by abundance since the input is sorted
	    vsearch --fasta_width 0 --fastx_subsample step2_otus/closed_ref_fail.fna \
	        --fastaout step2_otus/closed_ref_fail_subsample.fna \
	        --sample_pct 10 --relabel New.ReferenceOTU --notrunclabels --relabel_keep

	    # denovo cluster failed closed ref subsample reads
	    # this will serve as the reference database for new ref OTU
	    vsearch --fasta_width 0 --cluster_size step2_otus/closed_ref_fail_subsample.fna \
	        --clusterout_id --centroids step2_otus/new_ref_db.fna \
	        --id 0.97 --qmask none --notrunclabels --iddef 4

	    # STEP 3
	    # search step 2 failures against new ref DB
	    # hits to DB are New.ReferenceOTU
	    # failures are considered for New.CleanupReferenceOTU
	    vsearch --fasta_width 0 --usearch_global step2_otus/closed_ref_fail.fna --threads 0 \
	        --dbmask none --qmask none --rowlen 0 --top_hits_only \
	        --notmatched step3_otus/new_ref_fail.fna \
	        --db step2_otus/new_ref_db.fna --id 0.97 \
	        --dbmatched step3_otus/new_ref_centroids.fna \
	        --notrunclabels --maxaccepts 50 --maxrejects 50 --iddef 4

	    # STEP 4
	    # denovo cluster of new ref failures
	    # NOTE: QIIME has an option for skipping this step, see --suppress_step4 (http://qiime.org/scripts/pick_open_reference_otus.html)
	    if [[ -s step3_otus/new_ref_fail.fna ]]; then
	        # --centroids is spelled out in full (same option as in step 2)
	        # rather than relying on option-name abbreviation.
	        vsearch --fasta_width 0 --cluster_size step3_otus/new_ref_fail.fna \
	            --clusterout_id --centroids step4_otus/new_ref_cleanup_centroids.fna \
	            --id 0.97 --qmask none --relabel New.CleanupReferenceOTU \
	            --notrunclabels --relabel_keep
	    fi

	fi

	# cat all OTU centroid files together for final searching against input reads
	#
	# build_rep_set OUT FILE...
	#   Rewrites OUT from scratch with the contents of every FILE that exists and
	#   is non-empty, in the order given. Steps 2-4 are conditional, so their
	#   centroid files may legitimately be absent; those are skipped silently
	#   instead of producing `cat`/`rm` errors.
	#   Returns non-zero if OUT cannot be written.
	build_rep_set() {
	    local out=$1
	    shift
	    local src
	    # Truncate rather than rm + append so a first run (no previous rep set)
	    # and a re-run behave the same.
	    : > "$out" || return 1
	    for src in "$@"; do
	        if [[ -s "$src" ]]; then
	            cat -- "$src" >> "$out" || return 1
	        fi
	    done
	    return 0
	}

	build_rep_set rep_set.fna \
	    step1_otus/closed_ref_centroids_db.fna \
	    step3_otus/new_ref_centroids.fna \
	    step4_otus/new_ref_cleanup_centroids.fna
	# OR build_rep_set rep_set.fna step1_otus/closed_ref_centroids.fna step3_otus/new_ref_centroids.fna

	# final search of all input reads to OTU centroids for use in generating OTU biom table
	vsearch --fasta_width 0 --usearch_global seqs_sorted.fna --top_hits_only --threads 0 \
	    --dbmask none --qmask none --db rep_set.fna --id 0.97 --uc final.uc \
	    --maxaccepts 50 --maxrejects 50 --iddef 4

	# Build the OTU table from the final search results (final.uc -> final.biom),
	# then write a summary of that table to final.log.
	biom from-uc -i final.uc -o final.biom
	biom summarize-table -i final.biom -o final.log

	# Closing banner with timestamp, followed by a blank line.
	printf '%s\n' "Vsearch finished: $(date)"
	printf '\n'


	# Drop OTUs that occur in only one sample (-s 2) and, if anything survives,
	# trim the representative set to match and build a tree from it.
	ms2_table=final_ms2.biom
	ms2_reps=rep_set_ms2.fna
	align_dir=pynast_aligned_seqs

	filter_otus_from_otu_table.py -i final.biom -o "$ms2_table" -s 2
	if [[ -s "$ms2_table" ]]; then
	    biom summarize-table -i "$ms2_table" -o final_ms2.log
	    filter_fasta.py -f rep_set.fna -o "$ms2_reps" -b "$ms2_table"

	    # Newick tree: align the filtered representatives, filter the
	    # alignment, then build the phylogeny.
	    parallel_align_seqs_pynast.py -i "$ms2_reps" -o "$align_dir" \
	        -T --jobs_to_start 10 --min_length 75
	    filter_alignment.py -i "$align_dir/rep_set_ms2_aligned.fasta" \
	        -o "$align_dir/"
	    make_phylogeny.py -i "$align_dir/rep_set_ms2_aligned_pfiltered.fasta" \
	        -o rep_set.tre
	fi