Skip to content

Instantly share code, notes, and snippets.

@rknx
Last active July 5, 2023 20:11
Show Gist options
  • Save rknx/6ecfa30969dcb46a09a04c7bab4715d9 to your computer and use it in GitHub Desktop.
Save rknx/6ecfa30969dcb46a09a04c7bab4715d9 to your computer and use it in GitHub Desktop.
Fasta assembly metrics by assem_mets
##/usr/bin/env bash
# ------------------------------------------------------------------------------
# Script: Assembly metrics
# Author: Anuj Sharma
# Github: rknx/
# Email: rknx@outlook.com
# Description: This script is used to calculate various assembly metrics from
# a single multiFASTA file, often a genome assembly.
# Last modified: <2022/08/04>
# Copyright (C) 2022 Anuj Sharma (rknx@outlook.com)
# Permission to copy and modify is granted under the GPL license
# ------------------------------------------------------------------------------
# --- Default values -----------------------------------------------------------
VERSION=0.4.0
XLOG=
XTMP=
# --- Usage --------------------------------------------------------------------
read -r -d '' USAGE << EOF
USAGE
$(basename "$0") [options] [--] ... [infile]
DESCRIPTION
This script computes several assembly metrics for a multiFASTA file.
IMPLEMENTATION
version $(basename ${0%.sh}) v$VERSION
author Anuj Sharma
copyright Copyright © 2022 Anuj Sharma (rknx@outlook.com)
license GNU General Public License
OPTIONS
-h | --help
Display help text.
-v | --version
Display software version.
-o | --outfile []
Specify output file.
-s | --short
Do not display individual contig information.
-V | --verbose
Write debug messages to logfile.
EOF
# --- Options processing -------------------------------------------------------
# Return help if there are no arguments
if [ $# == 0 ] ; then echo "$USAGE"; exit 1; fi
# Pre-process arguments
ARGS=()
ENDOPT=
while [[ $# -gt 0 ]]; do
arg="$1"; shift
case "${ENDOPT}${arg}" in
--) ARGS+=("$arg"); ENDOPT=1 ;;
--*=*) ARGS+=("${arg%%=*}" "${arg#*=}") ;;
--*) ARGS+=("$arg") ;;
-*) for i in $(seq 2 ${#arg}); do ARGS+=("-${arg:i-1:1}"); done ;;
*) ARGS+=("$arg") ;;
esac
done
# Apply pre-processed options
set -- "${ARGS[@]}"
# Parse options
ENDOPT=
SHORT=
INFILES=()
while [[ $# -gt 0 ]]; do
case "${ENDOPT}${1}" in
# -x|--xxxxx) fun ;; # option
# -x|--xxxxx) shift; var="$1" ;; # one time argument with value
# -x|--xxxxx) shift; multivar+=("$1") ;; # repeated argument with values
-h|--help) echo "$USAGE" >&2; exit 0 ;;
-v|--version) echo "$(basename ${0%.sh}) v$VERSION" >&2; exit 0 ;;
-V|--verbose) export XLOG=1 ;;
-o|--outfile) shift; export OUTFILE="$1" ;;
-s|--short) export SHORT=1;;
--) ENDOPT=1 ;;
-*) echo "ERROR: Unrecognized argument: $1" >&2;
echo "$USAGE" >&2; exit 1 ;;
*) INFILE+=("$1") ;;
esac
shift
done
# Helper function for logging and outfile
source "$(dirname $(which "$0"))/logging.sh"
# Check if input is provided
[[ -z "$INFILE" ]] && \
__ts "No input FASTA file for computing metrics." && \
exit 1
# Check if multiple input is provided
[[ "${#INFILE[@]}" -gt "1" ]] && \
__ts "Multiple input files provided. Only first one is used." && \
INFILE=${INFILE[0]}
# Input information
__tsv "Processing file: $INFILE."
echo "Input file: $INFILE"
# Get lengths of contigs
__tsv "Calculating contig lengths."
arr=( $(
awk '
BEGIN{RS=">";ORS="\n";FS="\n"}
NR>1{$1 = ""; print gsub(/[aAtTuUwWgGcCsSrRyYkKmMbBdDhHvVnN]/, "")}
' "$INFILE" | \
sort -nr
) )
__tsv "Finished contig length."
# Get length of assembly
size=$((${arr[@]/%/+}0))
# Basic assembly information
__tsv "Calculating general assembly information."
echo -e "\n---- Assembly overview -------------------------"
echo "Total assembly length: $size"
echo "Number of contigs: ${#arr[@]}"
echo "Largest contig size: ${arr[0]}"
echo "Smallest contig size: ${arr[-1]}"
__tsv "Calculating GC percent."
grep -v ">" "$INFILE" | tr -d '\n' | \
awk '{
gcs = gsub(/[gGcCsS]/, "")
atuw = gsub(/[aAtTuUwW]/, "")
printf "GC content: %.2f%%\n", gcs * 100 / ( gcs + atuw )
}'
__tsv "Finished general assembly information."
# Assembly quality (N50 etc.)
__tsv "Calculating assembly quality information."
echo -e "\n---- Assembly quality --------------------------"
for i in 50 75 90
do
__tsv "Processing N$i."
awk -v t="$size" -vRS=" " -vf="$i" '{d++}(c+=$1)>=t*f/100 \
{printf "N%s: %s\tL%s: %s\n", f, $1, f, d; exit}' <<< ${arr[@]}
done
__tsv "Finished assembly quality information."
# End if short output
[[ "$SHORT" -eq 1 ]] && \
__tsv "Short output complete." && \
exit 1
# Contig size distribution
__tsv "Calculating contig size distribution."
echo -e "\n---- Contig size distribution ------------------"
for i in {3..6}
do
awk -vRS=" " -vf=$(( 10**$i )) '
$1 < f { printf ">%-10s\t%s contigs\t%s bp\n", f, d+0, c+0; exit }
{ d++; c+=$1 }
' <<< ${arr[@]}
done
__tsv "Finished contig size distribition."
# Individual contig information
__tsv "Calculating contig specific values."
echo -e "\n---- Contig information ------------------------"
awk -vRS=">" -vORS="\n" -vFS="\n" '
BEGIN {print "Name", "Length", "%GC"}
NR>1 {
split($1, j, " "); $1 = ""
atuw=gsub(/[aAtTuUwW]/, "")
gcs=gsub(/[gGcCsS]/, "")
rykmbdhvn=gsub(/[rRyYkKmMbBdDhHvVnN]/, "")
printf "%s %d %.2f\n", j[1], atuw+gcs+rykmbdhvn, gcs/(gcs+atuw)*100
}
' "$INFILE" | column -t
__tsv "Finished contig specific values."
__tsv "Assembly metric all done for $INFILE."
@rknx
Copy link
Author

rknx commented Jul 5, 2023

Error on N50 calculation in v0.3.0 is fixed in v0.4.0

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment