Skip to content

Instantly share code, notes, and snippets.

@rmrao
Created June 12, 2023 21:26
Show Gist options
  • Save rmrao/940fb8df30dcabda3f97d7633a9dd009 to your computer and use it in GitHub Desktop.
Save rmrao/940fb8df30dcabda3f97d7633a9dd009 to your computer and use it in GitHub Desktop.
Count the number of sequences or the total number of amino acids in a fasta file
#!/bin/bash
count_nseqs() {
file="$1"
if [ -f "$file" ]; then
if [[ "$file" == *.gz ]]; then
count=$(zcat "$file" | grep -c "^>")
else
count=$(grep -c "^>" "$file")
fi
echo "$count"
else
echo "File does not exist: $file" >&2
exit 1
fi
}
count_ntoks() {
file="$1"
if [ -f "$file" ]; then
if [[ "$file" == *.gz ]]; then
sum=0
zcat "$file" | awk '/^>/ { if (seq != "") { sum += length(seq) } seq = "" } /^>/ {next} { seq = seq $0 } END { sum += length(seq); print sum }'
else
sum=0
awk '/^>/ { if (seq != "") { sum += length(seq) } seq = "" } /^>/ {next} { seq = seq $0 } END { sum += length(seq); print sum }' "$file"
fi
else
echo "File does not exist: $file" >&2
exit 1
fi
}
# Main script
if [ $# -eq 0 ]; then
echo "Usage: ./fasta_length.sh [-ntok] <fasta_file>"
exit 1
fi
file_flag=""
fasta_file=""
# Parse command-line arguments
while [ "$1" != "" ]; do
case $1 in
-ntok ) file_flag="ntok"
;;
* ) fasta_file="$1"
esac
shift
done
if [ "$file_flag" == "ntok" ]; then
count_ntoks "$fasta_file"
else
count_nseqs "$fasta_file"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment