Skip to content

Instantly share code, notes, and snippets.

@standage
Last active December 16, 2015 11:29
Show Gist options
  • Save standage/5427359 to your computer and use it in GitHub Desktop.
Save standage/5427359 to your computer and use it in GitHub Desktop.
After running CEGMA on your genome assembly, this script will identify the KOGs (if any) that are not mapped in your genome.
#!/usr/bin/env bash
# Copyright (c) 2013, Daniel S. Standage <daniel.standage@gmail.com>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
# Default parameters
# CEGMASRC, PREFIX and SEQS are overwritten by the -c, -p and -s options
# respectively; TMPDIR is the value the usage text reports as the default
# for -t.
# Root of the CEGMA source tree; data/completeness_cutoff.tbl and
# data/kogs.fa are read from beneath this directory.
CEGMASRC=/usr/local/src/cegma
# Prefix of the output files (<PREFIX>.unmapped-ultraconserved-kogs.*).
PREFIX=genome
# 1 = also write FASTA sequences of the unmapped KOGs, 0 = IDs only.
SEQS=0
# Directory in which the intermediate *.ids files are written.
TMPDIR=/tmp
# Usage statement
#######################################
# Print the command-line help text.
# Globals:   CEGMASRC, PREFIX, TMPDIR (read; shown as the option defaults)
# Arguments: none
# Outputs:   usage text on stdout
#######################################
print_usage()
{
  # Unquoted heredoc delimiter on purpose: $0 and the defaults are expanded.
  cat <<EOF
Usage: $0 [options] cegma-output.gff
Options:
-c directory in which CEGMA source code is kept; default is
'${CEGMASRC}'
-h print this help message and exit
-p prefix for output files; default is '${PREFIX}'
-s print out sequences (in addition to IDs) for KOGs not mapped in the
given genome
-t directory in which to temporarily store intermediate output files;
default is '${TMPDIR}'
EOF
}
# Parse options and arguments from command line
# -c, -p and -t take an argument; -h and -s are flags. After the options,
# exactly one positional argument (the CEGMA GFF output) is required and is
# stored in CEGMAGFF.
while getopts "c:hp:st:" OPTION; do
  case "$OPTION" in
    c)
      CEGMASRC=$OPTARG
      ;;
    h)
      print_usage
      exit 0
      ;;
    p)
      PREFIX=$OPTARG
      ;;
    s)
      SEQS=1
      ;;
    t)
      # Must assign TMPDIR (not TMP): TMPDIR is the variable the rest of the
      # script reads when building intermediate file paths.
      TMPDIR=$OPTARG
      ;;
    *)
      # getopts has already printed a diagnostic for the unknown option or
      # missing argument; stop instead of continuing with a bad command line.
      print_usage >&2
      exit 1
      ;;
  esac
done
shift $((OPTIND-1))
if [[ $# != 1 ]]; then
  printf 'error: please provide a CEGMA output file (GFF format)\n\n' >&2
  print_usage >&2
  exit 1
fi
CEGMAGFF=$1
# Find IDs of mapped ultraconserved CEGs/KOGs
# Inputs (set earlier in the script): CEGMASRC, CEGMAGFF, TMPDIR, PREFIX, SEQS.
# All expansions are quoted so directories/prefixes containing whitespace or
# glob characters work.

# Fail early with a clear message rather than reporting "0 mapped" because an
# input file could not be read.
for required_input in "$CEGMASRC/data/completeness_cutoff.tbl" "$CEGMAGFF"; do
  if [[ ! -r "$required_input" ]]; then
    printf 'error: cannot read %s\n' "$required_input" >&2
    exit 1
  fi
done

all_ids="$TMPDIR/cegma-all-ultraconserved-kogs-248.ids"
mapped_ids="$TMPDIR/cegma-mapped-kogs.ids"
mapped_complement_ids="$TMPDIR/cegma-mapped-ultraconserved-kogs-complement.ids"
mapped_ultra_ids="$TMPDIR/cegma-mapped-ultraconserved-kogs.ids"
unmapped_base="$PREFIX.unmapped-ultraconserved-kogs"

# First space-delimited field of the cutoff table = ultraconserved KOG IDs.
cut -f 1 -d ' ' "$CEGMASRC/data/completeness_cutoff.tbl" | sort > "$all_ids"
# GFF column 9, truncated at the first '.', = KOG IDs present in the genome.
cut -f 9 "$CEGMAGFF" | cut -f 1 -d '.' | sort -u > "$mapped_ids"
# mapped minus all = mapped IDs that are not ultraconserved ...
comm -23 "$mapped_ids" "$all_ids" > "$mapped_complement_ids"
# ... and mapped minus those = mapped IDs that are ultraconserved.
comm -23 "$mapped_ids" "$mapped_complement_ids" > "$mapped_ultra_ids"
printf 'Ultra-conserved KOGs mapped (out of 248): '
wc -l < "$mapped_ultra_ids"

# Find IDs (and, if requested, sequences) of unmapped ultraconserved CEGs/KOGs
comm -13 "$mapped_ultra_ids" "$all_ids" > "$unmapped_base.ids"
# Tab-separated table: KOG ID, NCBI KOG lookup URL.
perl -ne 'chomp; printf("%s\thttp://www.ncbi.nlm.nih.gov/COG/grace/shokog.cgi?%s\n", $_, $_)' \
  < "$unmapped_base.ids" > "$unmapped_base.txt"
echo 'Unmapped KOGs:'
cat "$unmapped_base.ids"

if [[ "$SEQS" == 1 ]]; then
  if [[ ! -r "$CEGMASRC/data/kogs.fa" ]]; then
    printf 'error: cannot read %s\n' "$CEGMASRC/data/kogs.fa" >&2
    exit 1
  fi
  # Comma-joined ID list handed to perl as a single argument.
  seqids=$(tr '\n' ',' < "$unmapped_base.ids")
  # Copy every FASTA record whose ID contains ___KOG<digits> for one of the
  # unmapped KOGs. Requires BioPerl (Bio::SeqIO).
  perl -e '
    use strict;
    use Bio::SeqIO;
    my $idstr = shift(@ARGV);
    $idstr =~ s/,+$//;
    my $ids = {};
    foreach my $id (split(/,/, $idstr)) { $ids->{$id} = 1; }
    my $l = Bio::SeqIO->new(-fh => \*STDIN,  -format => "Fasta");
    my $w = Bio::SeqIO->new(-fh => \*STDOUT, -format => "Fasta");
    while (my $s = $l->next_seq) {
      my ($kog_id) = $s->id =~ m/___(KOG\d+)/;
      $w->write_seq($s) if ($ids->{$kog_id});
    }
  ' "$seqids" < "$CEGMASRC/data/kogs.fa" > "$unmapped_base.fa"
  printf 'Sequences written to %s: ' "$unmapped_base.fa"
  grep -c '^>' "$unmapped_base.fa"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment