Skip to content

Instantly share code, notes, and snippets.

@sestaton
Last active September 1, 2016 17:08
Show Gist options
  • Save sestaton/751e4b352d6b8471d4708a4e09190589 to your computer and use it in GitHub Desktop.
Save sestaton/751e4b352d6b8471d4708a4e09190589 to your computer and use it in GitHub Desktop.
Run tephra on Arabidopsis thaliana
#!/bin/bash
# Run the tephra pipeline on Arabidopsis thaliana.
#
# The current working directory is prepended to PATH. If the muscle and vmatch
# executables live elsewhere, change the entry below to that directory.
PATH="$(pwd):${PATH}"
export PATH
set -euo pipefail
# Name of this script for the usage message. Parameter expansion is used
# instead of an unquoted `basename $0`, which breaks on paths with whitespace.
script=${0##*/}
# Print the command-line synopsis and a description of each positional
# argument to stdout. Reads the global $script for the program name.
usage() {
  printf '%s\n' \
    "USAGE: ${script} <repeat_db> <threads> <repeat_hmm>" \
    'repeat_db : A (nucleotide) FASTA file database of repeats, such as RepBase.' \
    'threads : The number of parallel process to use for computations (Default: 1).' \
    'repeat_hmm : An HMM file (HMMERv3 format) of repeat domains for classification (optional).'
}
# Report a command-line parsing failure.
# Outputs: a one-line error message on stderr (diagnostics do not belong on
#          stdout, where they would mix with regular output).
print_error() {
  printf '%s\n' 'ERROR: Command line not parsed correctly. Check input.' >&2
}
#######################################
# Download the five TAIR10 chromosome FASTA files into the current directory,
# concatenate them (chromosome 1 through 5) into one file and remove the
# per-chromosome files.
# Outputs: the name of the combined FASTA file on stdout; diagnostics on stderr.
# Returns: 0 on success, 1 if a download, the concatenation or cleanup fails.
#######################################
get_genome() {
  local -r url_base='ftp://ftp.arabidopsis.org/home/tair/Sequences/whole_chromosomes'
  local -r combined='TAIR10_chr1-5.fas'
  local chr part
  local -a parts=()

  for chr in {1..5}; do
    part="TAIR10_chr${chr}.fas"
    # This function is called inside $(...), where bash clears 'set -e', so a
    # failed download must be propagated explicitly or it would go unnoticed.
    # -f/-S make curl fail on server errors and report them despite -s.
    if ! curl -sSfL -o "$part" "${url_base}/${part}"; then
      printf 'ERROR: failed to download %s\n' "$part" >&2
      return 1
    fi
    parts+=("$part")
  done

  cat -- "${parts[@]}" > "$combined" || return 1
  rm -- "${parts[@]}" || return 1
  printf '%s\n' "$combined"
}
#######################################
# Download the tephra LTR configuration file for Arabidopsis into the current
# directory.
# Outputs: the name of the downloaded file on stdout; diagnostics on stderr.
# Returns: 0 on success, 1 if the download fails.
#######################################
get_findltrs_config() {
  local -r config='tephra_ltr_config_arab.yml'
  local -r url='https://gist.githubusercontent.com/sestaton/29506e2b5048440de74a146848b5b869/raw/4762ac547f52d5f5da2ff486f20da45bb0716493/tephra_ltr_config_arab.yml'

  # -f keeps curl from saving an HTTP error page as the config file. The
  # explicit status check is needed because this function is called inside
  # $(...), where bash clears 'set -e'.
  if ! curl -sSfL -o "$config" "$url"; then
    printf 'ERROR: failed to download %s\n' "$config" >&2
    return 1
  fi
  printf '%s\n' "$config"
}
# At least one positional argument is required; with none, show the error
# banner followed by the usage text and stop with status 1.
if (( $# == 0 )); then
  print_error
  usage
  exit 1
fi
## main program
# Positional arguments: <repeat_db> [threads] [hmm_db] [trna_db]
repdb=$1
# The thread count is documented as optional (default 1); a bare $2 aborts
# under 'set -u' when only the repeat database is given.
threads=${2:-1}
# The HMM and tRNA databases were referenced below but never assigned, so the
# run died with "unbound variable" unless the caller had exported them. They
# are now taken from the 3rd/4th argument, falling back to the environment
# variables of the same name, and checked before anything is downloaded.
# NOTE(review): assumes <repeat_hmm> from the usage text is the HMM database
# that tephra expects for -d — confirm.
hmmdb=${3:-${hmmdb:-}}
trnas=${4:-${trnas:-}}
hmms=${hmms:-$hmmdb}
if [[ -z "$hmmdb" || -z "$trnas" ]]; then
  printf '%s\n' \
    'ERROR: an HMM database (3rd argument or $hmmdb) and a tRNA database (4th argument or $trnas) are required.' >&2
  exit 1
fi

genome=$(get_genome)
config=$(get_findltrs_config)
# Genome file name without its extension; prefix for every output file.
base=${genome%.*}
# Directory written by 'tephra classifyltrs' (its -o argument below).
ltr_dir=${base}_classified_ltrs

## LTRs
time tephra findltrs \
  -g "$genome" \
  -d "$hmmdb" \
  -t "$trnas" \
  -o "${base}_tephra_ltrs.gff3" \
  -c "$config" \
  --clean
time tephra classifyltrs \
  -g "$genome" \
  -d "$repdb" \
  -t "$threads" \
  -f "${base}_tephra_ltrs.gff3" \
  -o "$ltr_dir"
time tephra maskref \
  -g "$genome" \
  -d "${ltr_dir}/${base}_combined_LTR_families.fasta" \
  -o "${base}_masked.fas"

## solo-LTRs
time tephra sololtr \
  -i "${ltr_dir}/${base}_tephra_ltrs_copia" \
  -g "${base}_masked.fas" \
  -o "${base}_masked_copia_sololtrs.gff3" \
  -r "${base}_masked_copia_sololtr_rep.tsv" \
  -s "${base}_masked_copia_sololtr_seqs.fas" \
  -t "$threads"
# The gypsy run used to write its GFF3 to the copia file name, overwriting the
# copia result; it now has its own output file.
time tephra sololtr \
  -i "${ltr_dir}/${base}_tephra_ltrs_gypsy" \
  -g "${base}_masked.fas" \
  -o "${base}_masked_gypsy_sololtrs.gff3" \
  -r "${base}_masked_gypsy_sololtr_rep.tsv" \
  -s "${base}_masked_gypsy_sololtr_seqs.fas" \
  -t "$threads"

## ltrage
time tephra ltrage \
  -g "$genome" \
  -t "$threads" \
  -o "${base}_ltrages_all.tsv" \
  -f "${ltr_dir}/${base}_tephra_ltrs_families.gff3" \
  --all \
  --clean
time tephra ltrage \
  -g "$genome" \
  -t "$threads" \
  -i "${ltr_dir}/${base}_tephra_ltrs_copia" \
  -o "${base}_copia_ltrages_exemp.tsv" \
  --clean
time tephra ltrage \
  -g "$genome" \
  -t "$threads" \
  -i "${ltr_dir}/${base}_tephra_ltrs_unclassified" \
  -o "${base}_unclassified_ltrages.tsv" \
  -f "${ltr_dir}/${base}_tephra_ltrs_families.gff3" \
  --clean

## illrecomb
time tephra illrecomb \
  -i "${ltr_dir}/${base}_tephra_ltrs_copia" \
  -o "${base}_masked_copia_illrecomb.fas" \
  -r "${base}_masked_copia_illrecomb_rep.tsv" \
  -s "${base}_masked_copia_illrecomb_stats.tsv" \
  -t "$threads"
time tephra illrecomb \
  -i "${ltr_dir}/${base}_tephra_ltrs_gypsy" \
  -o "${base}_masked_gypsy_illrecomb.fas" \
  -r "${base}_masked_gypsy_illrecomb_rep.tsv" \
  -s "${base}_masked_gypsy_illrecomb_stats.tsv" \
  -t "$threads"

## TRIMs
time tephra findtrims \
  -g "${base}_masked.fas" \
  -d "$hmms" \
  -t "$trnas"
# NOTE(review): the -d file is presumably written by findtrims above; its name
# is not set anywhere in this script — confirm against tephra's output naming.
time tephra maskref \
  -g "${base}_masked.fas" \
  -d "${base}_masked_trim_ltrdigest85_combined_filtered.fasta" \
  -o "${base}_masked2.fas"

## Helitrons
time tephra findhelitrons \
  -g "${base}_masked2.fas" \
  -o "${base}_masked2_helitrons.gff3"
time tephra maskref \
  -g "${base}_masked2.fas" \
  -d "${base}_masked2_tephra_hscan_helitrons.hel.fa" \
  -o "${base}_masked3.fas"

## TIR elements
# NOTE(review): findtirs/classifytirs run on the first masked genome
# (${base}_masked.fas) while the masking step expects
# ${base}_masked3_tirs.fasta and masks ${base}_masked3.fas. The intended input
# may be ${base}_masked3.fas; left unchanged because tephra's output naming is
# not visible here.
time tephra findtirs \
  -g "${base}_masked.fas" \
  -d "$hmms" \
  -o "${base}_masked_tirs.gff3"
time tephra classifytirs \
  -g "${base}_masked.fas" \
  -f "${base}_masked_tirs_filtered.gff3"
time tephra maskref \
  -d "${base}_masked3_tirs.fasta" \
  -g "${base}_masked3.fas" \
  -o "${base}_masked4.fas"

## non-LTRs
time tephra findnonltrs \
  -g "${base}_masked4.fas"
# Mask the genome that was just searched. The previous input name
# (${base}_masked_masked_masked.fas) is not produced by any step above.
time tephra maskref \
  -g "${base}_masked4.fas" \
  -d nonLTRs_out/nonLTRs_out_tephra_nonltr.fasta
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment