Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5064814 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5064814 to your computer and use it in GitHub Desktop.
Perl script that uses TreeFam to find cases where two adjacent genes in a species (eg. Caenorhabditis elegans) should probably be merged, as one of them has its best match to the first part of a TreeFam family alignment, and the second has its best match to the second part of the same TreeFam family's alignment.
#!/usr/local/bin/perl
#
# Perl script find_gene_pred_errors1.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 25-Feb-09.
#
# For the TreeFam project.
#
# This perl script finds cases where two adjacent genes in a species
# (eg. C. elegans) should probably be merged, as one of them has its
# best match to the first part of a TreeFam family alignment and the
# second has its best match to the second part of the same TreeFam
# family's alignment.
#
# The command-line format is:
# % perl <find_gene_pred_errors1.pl> species release
# where species is the species of interest,
# release is the release of TreeFam to use.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 2)
{
print "Usage of find_gene_pred_errors1.pl\n\n";
print "perl find_gene_pred_errors1.pl <species> <release>\n";
print "where <species> is the species of interest (eg. C. elegans),\n";
print " <release> is the release of TreeFam to use.\n";
print "For example, >perl find_gene_pred_errors1.pl CAEEL 7\n";
exit;
}
use DBI;
# FIND THE NAME OF THE SPECIES OF INTEREST:
$the_species = $ARGV[0];
# FIND THE RELEASE OF TREEFAM TO USE:
$release = $ARGV[1];
#------------------------------------------------------------------#
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
$dbh = DBI->connect("$database", 'anonymous', '') || return;
# FIND ALL THE IDs FOR GENES FROM SPECIES $the_species:
($GID,$IDS) = &get_genes_from_species($dbh,$the_species);
# READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS AND FAMILIES:
($BEST) = &read_hmmer_matches_to_families($dbh,$GID);
# FIND ADJACENT GENES IN SPECIES $the_species, AND CHECK IF THEY HAVE
# THEIR BEST MATCH TO THE SAME TREEFAM FAMILY:
&find_adjacent_genes($dbh,$GID,$IDS,$BEST);
# NOW DISCONNECT FROM THE DATABASE:
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# FIND ADJACENT GENES IN THE SPECIES $the_species AND CHECK IF THEY
# HAVE THEIR BEST MATCH TO THE SAME TREEFAM FAMILY:
sub find_adjacent_genes
{
my $dbh = $_[0];
my $GID = $_[1];
my $IDS = $_[2];
my $BEST = $_[3];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $ID;
my $start;
my $chr;
my %GENES = ();
my $GID;
my %SEEN = ();
%START = ();
my $genes;
my @genes;
my $i;
my $gene;
my $prev_gene;
my $gene_ids;
my $prev_gene_ids;
my @gene_ids;
my @prev_gene_ids;
my $prev_gene_id;
my $gene_id;
my $j;
my $k;
my $prev_gene_id_best;
my $gene_id_best;
foreach $ID (keys %{$GID})
{
$GID = $GID{$ID};
$start = "none";
$chr = "none";
$table_w = 'map';
$st = "SELECT T_START, TARGET from $table_w WHERE ID=\'$ID\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$start = $array[0];
$chr = $array[1];
}
}
if ($start eq 'none' || $chr eq 'none') { print STDERR "ERROR: ID $ID start $start chr $chr\n"; exit;}
# RECORD THE LOCATION OF GENE $GID:
if (!($SEEN{$chr."=".$GID}))
{
$SEEN{$chr."=".$GID} = 1;
if (!($GENES{$chr}))
{
$GENES{$chr} = $GID;
}
else
{
$GENES{$chr} = $GENES{$chr}.",".$GID;
}
if (!($START{$GID}))
{
$START{$GID} = $start;
}
else
{
if ($start < $START{$GID})
{
$START{$GID} = $start;
}
}
}
}
# CHECK IF ADJACENT GENES HAVE BEST HMMER MATCHES TO THE SAME TREEFAM FAMILY:
foreach $chr (keys %GENES)
{
$genes = $GENES{$chr};
@genes = split(/\,/,$genes);
@genes = sort by_start @genes;
for ($i = 1; $i <= $#genes; $i++)
{
$gene = $genes[$i];
$prev_gene = $genes[$i-1];
# CHECK IF $gene AND $prev_gene HAVE TRANSCRIPTS THAT HAVE BEST HMMER MATCHES
# TO THE SAME TREEFAM FAMILY:
if (!($IDS{$gene})) { print STDERR "ERROR: do not know transcripts in $gene\n"; exit;}
if (!($IDS{$prev_gene})) { print STDERR "ERROR: do not know transcripts in $prev_gene\n"; exit;}
$gene_ids = $IDS->{$gene};
$prev_gene_ids = $IDS->{$prev_gene};
@gene_ids = split(/\,/,$gene_ids);
@prev_gene_ids = split(/\,/,$prev_gene_ids);
for ($j = 0; $j <= $#gene_ids; $j++)
{
$gene_id = $gene_ids[$j];
if (!($BEST->{$gene_id})) { $gene_id_best = "none"; }
else { $gene_id_best = $BEST->{$gene_id};}
for ($k = 0; $k <= $#prev_gene_ids; $k++)
{
$prev_gene_id = $prev_gene_ids[$k];
if (!($BEST->{$prev_gene_id})) { $prev_gene_id_best = "none"; }
else { $prev_gene_id_best = $BEST->{$prev_gene_id};}
if ($gene_id_best eq $prev_gene_id_best && $gene_id_best ne 'none')
{
print "$gene_id ($gene) and $prev_gene_id ($prev_gene) both have best match to $gene_id_best\n";
}
}
}
}
}
}
#------------------------------------------------------------------#
# FIND ALL THE IDs FOR GENES FROM SPECIES $the_species:
sub get_genes_from_species
{
my $dbh = $_[0];
my $the_species = $_[1];
my %GID = ();
my $taxid = "none";
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $ID;
my $GID;
my %IDS = ();
# FIRST GET THE NCBI TAXONOMY ID FOR SPECIES $the_species:
$table_w = 'species';
$st = "SELECT TAX_ID from $table_w WHERE SWCODE=\'$the_species\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$taxid = $array[0];
}
}
if ($taxid eq 'none') { print STDERR "ERROR: do not know taxid for $the_species\n"; exit;}
# NOW GET THE IDs FOR ALL TRANSCRIPTS FROM SPECIES $the_species:
$table_w = 'genes';
$st = "SELECT ID, GID from $table_w WHERE TAX_ID=\'$taxid\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$ID = $array[0];
$GID = $array[1];
if ($GID{$ID}) { print STDERR "WARNING: already have GID $GID{$ID} for ID $ID (GID $GID)\n";}
$GID{$ID} = $GID;
# RECORD THE IDs FOR TRANSCRIPTS IN THIS GENE:
if (!($IDS{$GID})) { $IDS{$GID} = $ID;}
else {$IDS{$GID} = $IDS{$GID}.",".$ID;}
}
}
return(\%GID,\%IDS);
}
#------------------------------------------------------------------#
# READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS FROM SPECIES $the_species
# AND FAMILIES, AND FIND THE BEST MATCH FOR EACH TRANSCRIPT:
sub read_hmmer_matches_to_families
{
my $dbh = $_[0];
my $GID = $_[1];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $ID;
my %BEST = ();
my %BEST_EVALUE = ();
my %BEST_SCORE = ();
my $best;
my $best_evalue;
my $best_score;
foreach $ID (keys %{$GID})
{
$table_w = 'hmmer_matches';
$st = "SELECT AC, SCORE, EVALUE from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
$SCORE = $array[1];
$EVALUE = $array[2];
if (!($BEST{$ID}))
{
if ($EVALUE == 0) { $EVALUE = "zero";}
if ($SCORE == 0) { $SCORE = "zero";}
$BEST{$ID} = $AC;
$BEST_EVALUE{$ID} = $EVALUE;
$BEST_SCORE{$ID} = $SCORE;
}
else
{
$best = $BEST{$ID};
$best_evalue= $BEST_EVALUE{$ID};
$best_score = $BEST_SCORE{$ID};
if ($best_evalue eq 'zero') { $best_evalue = 0;}
if ($best_score eq 'zero') { $best_score = 0;}
if ($EVALUE < $best_evalue || ($EVALUE == $best_evalue && $SCORE > $best_score))
{
if ($EVALUE == 0) { $EVALUE = "zero";}
if ($SCORE == 0) { $SCORE = "zero";}
$BEST{$ID} = $AC;
$BEST_EVALUE{$ID} = $EVALUE;
$BEST_SCORE{$ID} = $SCORE;
}
}
}
}
}
return(\%BEST);
}
#------------------------------------------------------------------#
# SUBROUTINE TO SORT GENES BY THEIR START-POINT:
sub by_start
{
if (!($START{$a})) { print STDERR "ERROR: do not know start of gene a $a.\n"; exit;}
if (!($START{$b})) { print STDERR "ERROR: do not know start of gene b $b.\n"; exit;}
$START{$a} <=> $START{$b}
or
$a cmp $b;
}
#------------------------------------------------------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment