standage/cegma-missing-kogs.sh

## cegma-missing-kogs.sh
#!/usr/bin/env bash

# Copyright (c) 2013, Daniel S. Standage <daniel.standage@gmail.com>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# Built-in defaults for the script's settings.
TMPDIR='/tmp'                    # where the intermediate ID lists are written
SEQS=0                           # 1 => also write sequences of unmapped KOGs
PREFIX='genome'                  # stem of the output file names
CEGMASRC='/usr/local/src/cegma'  # CEGMA tree whose data/ directory is read

# Usage statement
#
# Writes the help text to standard output. The defaults shown are the values
# of the globals CEGMASRC, PREFIX and TMPDIR at the time of the call, because
# the here-document is unquoted and therefore expanded.
print_usage() {
  cat <<EOF
Usage: $0 [options] cegma-output.gff
  Options:
    -c    directory in which CEGMA source code is kept; default is
          '${CEGMASRC}'
    -h    print this help message and exit
    -p    prefix for output files; default is '${PREFIX}'
    -s    print out sequences (in addition to IDs) for KOGs not mapped in the
          given genome
    -t    directory in which to temporarily store intermediate output files;
          default is '${TMPDIR}'
EOF
}

# Parse options and arguments from command line
#
# Options overwrite the defaults held in CEGMASRC, PREFIX, SEQS and TMPDIR.
# Exactly one positional argument must remain afterwards; it is stored in
# CEGMAGFF. Diagnostics go to standard error and the script exits with
# status 1 on any usage error.
while getopts "c:hp:st:" OPTION; do
  case "$OPTION" in
    c) CEGMASRC=$OPTARG ;;
    h)
      print_usage
      exit 0
      ;;
    p) PREFIX=$OPTARG ;;
    s) SEQS=1 ;;
    # Must assign TMPDIR: that is the variable the rest of the script reads
    # when building the paths of its intermediate files.
    t) TMPDIR=$OPTARG ;;
    *)
      # getopts has already reported the unknown option / missing argument.
      print_usage >&2
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))

if [[ $# -ne 1 ]]; then
  printf 'error: please provide CEGMA output file (GFF format)\n\n' >&2
  print_usage >&2
  exit 1
fi
CEGMAGFF=$1

# Find IDs of mapped ultraconserved CEGs/KOGs
#
# Reads CEGMASRC, CEGMAGFF, PREFIX, SEQS and TMPDIR. Every expansion is quoted
# so that paths containing whitespace or glob characters are passed intact.
ALL_KOG_IDS="$TMPDIR/cegma-all-ultraconserved-kogs-248.ids"
MAPPED_KOG_IDS="$TMPDIR/cegma-mapped-kogs.ids"
MAPPED_UC_KOG_IDS="$TMPDIR/cegma-mapped-ultraconserved-kogs.ids"
UNMAPPED_BASE="$PREFIX.unmapped-ultraconserved-kogs"

# Reference list: first space-delimited field of the completeness table.
cut -f 1 -d ' ' -- "$CEGMASRC/data/completeness_cutoff.tbl" \
  | sort > "$ALL_KOG_IDS"
# Observed list: GFF column 9 truncated at the first '.', de-duplicated.
cut -f 9 -- "$CEGMAGFF" | cut -f 1 -d '.' | sort | uniq > "$MAPPED_KOG_IDS"
# comm -12 prints only lines common to both sorted lists, i.e. the mapped IDs
# that also appear in the reference list.
comm -12 "$MAPPED_KOG_IDS" "$ALL_KOG_IDS" > "$MAPPED_UC_KOG_IDS"
printf 'Ultra-conserved KOGs mapped (out of 248): '
wc -l < "$MAPPED_UC_KOG_IDS"

# Find IDs (and, if requested, sequences) of unmapped ultraconserved CEGs/KOGs
# comm -13 prints the lines found only in the second (reference) list.
comm -13 "$MAPPED_UC_KOG_IDS" "$ALL_KOG_IDS" > "$UNMAPPED_BASE.ids"
# Two-column table: KOG ID, then a lookup URL built from that ID.
perl -ne 'chomp; printf("%s\thttp://www.ncbi.nlm.nih.gov/COG/grace/shokog.cgi?%s\n", $_, $_)' \
  < "$UNMAPPED_BASE.ids" > "$UNMAPPED_BASE.txt"
echo 'Unmapped KOGs:'
cat -- "$UNMAPPED_BASE.ids"

if [[ "$SEQS" == 1 ]]; then
  # Comma-joined ID list handed to Perl as a single argument.
  SEQIDS=$(tr '\n' ',' < "$UNMAPPED_BASE.ids")
  # Requires BioPerl (Bio::SeqIO). Copies from kogs.fa every FASTA record whose
  # ID contains "___KOG<digits>" with that KOG ID in the unmapped list.
  perl -e 'use strict; use Bio::SeqIO; my $idstr = shift(@ARGV); $idstr =~ s/,+$//; my $ids = {}; foreach my $id(split(/,/, $idstr)){ $ids->{$id} = 1; } my $l = Bio::SeqIO->new(-fh=>\*STDIN, -format=>"Fasta"); my $w = Bio::SeqIO->new(-fh=>\*STDOUT, -format=>"Fasta"); while(my $s = $l->next_seq){ my($kog_id) = $s->id =~ m/___(KOG\d+)/; $w->write_seq($s) if($ids->{$kog_id}); }' "$SEQIDS" \
    < "$CEGMASRC/data/kogs.fa" \
    > "$UNMAPPED_BASE.fa"
  printf 'Sequences written to %s: ' "$UNMAPPED_BASE.fa"
  grep -c '^>' -- "$UNMAPPED_BASE.fa"
fi
	#!/usr/bin/env bash

	# Copyright (c) 2013, Daniel S. Standage <daniel.standage@gmail.com>
	#
	# Permission to use, copy, modify, and/or distribute this software for any
	# purpose with or without fee is hereby granted, provided that the above
	# copyright notice and this permission notice appear in all copies.
	#
	# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
	# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
	# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
	# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
	# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
	# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
	# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

	# Built-in defaults for the script's settings.
	TMPDIR='/tmp'                    # where the intermediate ID lists are written
	SEQS=0                           # 1 => also write sequences of unmapped KOGs
	PREFIX='genome'                  # stem of the output file names
	CEGMASRC='/usr/local/src/cegma'  # CEGMA tree whose data/ directory is read

	# Usage statement
	#
	# Writes the help text to standard output. The defaults shown are the values
	# of the globals CEGMASRC, PREFIX and TMPDIR at the time of the call.
	# The here-document uses <<- so that the leading tabs of its body and of the
	# closing EOF are stripped; with a plain << the tab-indented EOF line would
	# never terminate the document. Keep the indentation here as TABS.
	print_usage() {
		cat <<-EOF
		Usage: $0 [options] cegma-output.gff
		  Options:
		    -c    directory in which CEGMA source code is kept; default is
		          '${CEGMASRC}'
		    -h    print this help message and exit
		    -p    prefix for output files; default is '${PREFIX}'
		    -s    print out sequences (in addition to IDs) for KOGs not mapped in the
		          given genome
		    -t    directory in which to temporarily store intermediate output files;
		          default is '${TMPDIR}'
		EOF
	}

	# Parse options and arguments from command line
	#
	# Options overwrite the defaults held in CEGMASRC, PREFIX, SEQS and TMPDIR.
	# Exactly one positional argument must remain afterwards; it is stored in
	# CEGMAGFF. Diagnostics go to standard error and the script exits with
	# status 1 on any usage error.
	while getopts "c:hp:st:" OPTION; do
		case "$OPTION" in
			c) CEGMASRC=$OPTARG ;;
			h)
				print_usage
				exit 0
				;;
			p) PREFIX=$OPTARG ;;
			s) SEQS=1 ;;
			# Must assign TMPDIR: that is the variable the rest of the script
			# reads when building the paths of its intermediate files.
			t) TMPDIR=$OPTARG ;;
			*)
				# getopts has already reported the unknown option / missing argument.
				print_usage >&2
				exit 1
				;;
		esac
	done
	shift $((OPTIND - 1))

	if [[ $# -ne 1 ]]; then
		printf 'error: please provide CEGMA output file (GFF format)\n\n' >&2
		print_usage >&2
		exit 1
	fi
	CEGMAGFF=$1

	# Find IDs of mapped ultraconserved CEGs/KOGs
	#
	# Reads CEGMASRC, CEGMAGFF, PREFIX, SEQS and TMPDIR. Every expansion is quoted
	# so that paths containing whitespace or glob characters are passed intact.
	# The pipes below must be real, unescaped '|' characters; an escaped '\|' is
	# handed to cut as a literal argument and no pipeline is formed.
	ALL_KOG_IDS="$TMPDIR/cegma-all-ultraconserved-kogs-248.ids"
	MAPPED_KOG_IDS="$TMPDIR/cegma-mapped-kogs.ids"
	MAPPED_UC_KOG_IDS="$TMPDIR/cegma-mapped-ultraconserved-kogs.ids"
	UNMAPPED_BASE="$PREFIX.unmapped-ultraconserved-kogs"

	# Reference list: first space-delimited field of the completeness table.
	cut -f 1 -d ' ' -- "$CEGMASRC/data/completeness_cutoff.tbl" \
		| sort > "$ALL_KOG_IDS"
	# Observed list: GFF column 9 truncated at the first '.', de-duplicated.
	cut -f 9 -- "$CEGMAGFF" | cut -f 1 -d '.' | sort | uniq > "$MAPPED_KOG_IDS"
	# comm -12 prints only lines common to both sorted lists, i.e. the mapped IDs
	# that also appear in the reference list.
	comm -12 "$MAPPED_KOG_IDS" "$ALL_KOG_IDS" > "$MAPPED_UC_KOG_IDS"
	printf 'Ultra-conserved KOGs mapped (out of 248): '
	wc -l < "$MAPPED_UC_KOG_IDS"

	# Find IDs (and, if requested, sequences) of unmapped ultraconserved CEGs/KOGs
	# comm -13 prints the lines found only in the second (reference) list.
	comm -13 "$MAPPED_UC_KOG_IDS" "$ALL_KOG_IDS" > "$UNMAPPED_BASE.ids"
	# Two-column table: KOG ID, then a lookup URL built from that ID.
	perl -ne 'chomp; printf("%s\thttp://www.ncbi.nlm.nih.gov/COG/grace/shokog.cgi?%s\n", $_, $_)' \
		< "$UNMAPPED_BASE.ids" > "$UNMAPPED_BASE.txt"
	echo 'Unmapped KOGs:'
	cat -- "$UNMAPPED_BASE.ids"

	if [[ "$SEQS" == 1 ]]; then
		# Comma-joined ID list handed to Perl as a single argument.
		SEQIDS=$(tr '\n' ',' < "$UNMAPPED_BASE.ids")
		# Requires BioPerl (Bio::SeqIO). Copies from kogs.fa every FASTA record
		# whose ID contains "___KOG<digits>" with that KOG ID in the unmapped list.
		# The filehandles are passed as glob references (\*STDIN, \*STDOUT).
		perl -e 'use strict; use Bio::SeqIO; my $idstr = shift(@ARGV); $idstr =~ s/,+$//; my $ids = {}; foreach my $id(split(/,/, $idstr)){ $ids->{$id} = 1; } my $l = Bio::SeqIO->new(-fh=>\*STDIN, -format=>"Fasta"); my $w = Bio::SeqIO->new(-fh=>\*STDOUT, -format=>"Fasta"); while(my $s = $l->next_seq){ my($kog_id) = $s->id =~ m/___(KOG\d+)/; $w->write_seq($s) if($ids->{$kog_id}); }' "$SEQIDS" \
			< "$CEGMASRC/data/kogs.fa" \
			> "$UNMAPPED_BASE.fa"
		printf 'Sequences written to %s: ' "$UNMAPPED_BASE.fa"
		grep -c '^>' -- "$UNMAPPED_BASE.fa"
	fi