Created
April 20, 2024 10:33
-
-
Save pansapiens/679eead24751cd1c9b0f0d411d43a4b6 to your computer and use it in GitHub Desktop.
Find likely adapter sequences in a set of paired end FASTQs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Given a directory of fastq files, use bbmerge to find adapters sequences for every sample, | |
# align and find the consensus. | |
# Final consensus is in r1_adapter.consensus.fa and r2_adapter.consensus.fa - trim by hand | |
# if the automatic trimming up the the first 'n' isn't sensible | |
# Requires: bbmerge.sh (bbmap), muscle, emboss (cons) | |
# MODIFY to match your R1/R2 suffix () | |
r1_suffix="_L2_1.fq.gz" | |
r1_tail="_1.fq.gz" | |
r2_tail="_2.fq.gz" | |
module load bbmap || true | |
for r1 in ${1}/*${r1_tail}; do | |
r2="${r1/${r1_tail}/${r2_tail}}" | |
bbmerge.sh in=${r1} in2=${r2} outa=adapters_bbmerge.$(basename ${r1} ${r1_suffix}).fa | |
done | |
echo -n "" >r1_adapters.fa | |
echo -n "" >r2_adapters.fa | |
for f in adapters_bbmerge.*.fa; do | |
head -n 2 $f >>r1_adapters.fa | |
tail -n 2 $f >>r2_adapters.fa | |
done | |
module load muscle || true | |
muscle -align r1_adapters.fa -output r1_adapters.muscle.fa | |
muscle -align r2_adapters.fa -output r2_adapters.muscle.fa | |
# from the EMBOSSS suite | |
module load emboss || true | |
# Find the consensus sequence | |
cons -sequence r1_adapters.muscle.fa -outseq r1_adapter.consensus.fa -name R1_adapter | |
cat r2_adapter.consensus.fa | |
# Turn multi-line FASTA into single line, output everything up to the first 'n' | |
echo "Trimmed:" | |
awk '/^>/ {if(seq) print seq; print; seq=""; next} {seq = seq $0} END {print seq}' r1_adapter.consensus.fa | sed 's/n\{1,\}.*//' | |
# Find the consensus sequence | |
cons -sequence r2_adapters.muscle.fa -outseq r2_adapter.consensus.fa -name R2_adapter | |
cat r2_adapter.consensus.fa | |
# Turn multi-line FASTA into single line, output everything up to the first 'n' | |
echo "Trimmed:" | |
awk '/^>/ {if(seq) print seq; print; seq=""; next} {seq = seq $0} END {print seq}' r2_adapter.consensus.fa | sed 's/n\{1,\}.*//' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment