Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 15:56
Show Gist options
  • Save avrilcoghlan/5065559 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065559 to your computer and use it in GitHub Desktop.
Perl script that retrieves trees from the TreeFam database, and infers human within-species paralogs from the trees.
#!/usr/local/bin/perl
#
# Perl script find_human_paralogs.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 31-Jul-07.
#
# For Christine Bird.
# This perl script reads in all trees in TreeFam-4
# and finds human within-species paralogs.
#
# The command-line format is:
# % perl <find_human_paralogs.pl>
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 0)
{
print "Usage of find_human_paralogs.pl\n\n";
print "perl find_human_paralogs.pl\n";
print "For example, >perl find_human_paralogs.pl\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/";
use Avril_modules;
use Treefam::DBConnection;
use DBI;
#------------------------------------------------------------------#
# IF $VERBOSE == 1, PRINT OUT EXTRA INFORMATION:
$VERBOSE = 0;
# GET A LIST OF ALL FAMILIES IN TREEFAM, INCLUDING TF1 (TREEFAM-A), TF3
# (TREEFAM-B) AND TF5 (TREEFAM-C) FAMILIES:
print STDERR "Finding TreeFam-A families... \n";
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return;
# GET ALL THE TREEFAM-A FAMILIES:
$table_w = "familyA";
$st = "SELECT AC from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
@families = (@families,$AC);
if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n";}
}
}
print STDERR "Finding TreeFam-B and TreeFam-C families... \n";
# GET ALL THE TREEFAM-B AND TREEFAM-C FAMILIES:
$table_w = "familyB";
$st = "SELECT AC from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
@families = (@families,$AC);
if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n";}
}
}
#------------------------------------------------------------------#
$dbc = Treefam::DBConnection->new();
# FIND PARALOGS WITHIN EACH FAMILY:
foreach my $treefam_family(@families)
{
print STDERR "Looking at family $treefam_family... \n";
$famh = $dbc->get_FamilyHandle();
$family = $famh->get_by_id($treefam_family);
$AC = $family->ID(); # GET THE FAMILY ID.
if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;}
$no_families++;
print "\n$no_families: reading family $AC...\n";
# GET THE TREE FOR THE FAMILY:
$tree = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE.
# GET ALL THE HUMAN TRANSCRIPTS IN THIS FAMILY:
@human_nodes = &Avril_modules::empty_array(@human_nodes);
@human_genes = &Avril_modules::empty_array(@human_genes);
@nodes = $tree->get_leaves;
foreach $node(@nodes)
{
# FIND THE SPECIES FOR THIS TRANSCRIPT:
$species = $node->get_tag_value('S');
$species =~ tr/[a-z]/[A-Z]/;
if ($species eq 'HOMO SAPIENS' || $species eq 'HUMAN')
{
@human_nodes = (@human_nodes,$node);
# FIND THE GENE FOR THIS TRANSCRIPT:
$gh = $node->gene;
$gene = $gh->ID(); # eg. ENSMUSG00000022770
@human_genes = (@human_genes,$gene);
}
}
# FOR EACH PAIR OF HUMAN GENES IN THE TREE, SEE
# WHEN THE DUPLICATION OCCURRED:
for ($i = 0; $i <= $#human_nodes; $i++)
{
$human_node_i = $human_nodes[$i];
$human_gene_i = $human_genes[$i];
for ($j = $i+1; $j <= $#human_nodes; $j++)
{
$human_node_j = $human_nodes[$j];
$human_gene_j = $human_genes[$j];
# FIND THE LAST COMMON ANCESTOR NODE OF $human_node_j AND $human_node_i:
$lca = $tree->get_last_common_ancestor($human_node_j,$human_node_i);
$lca_name = $lca->internalID;
# FIND THE TAXONOMIC LEVEL OF THE LAST COMMON ANCESTOR NODE:
$species = $lca->get_tag_value('S');
$species =~ tr/[a-z]/[A-Z]/;
# GET THE BOOTSTRAP OF THE LAST COMMON ANCESTOR NODE:
$bootstrap = $lca->get_tag_value('B');
# CHECK IF THE CHILDREN AND PARENT NODES OF $lca ARE SPECIATION NODES:
$speciations = 1;
# CHECK IF THE CHILDREN NODES OF $lca ARE DUPLICATION OR SPECIATION NODES:
foreach $child ($lca->children)
{
if (!($child->is_leaf)) # IF THIS IS NOT A SEQUENCE
{
$child_type = $child->get_tag_value('D'); # Y FOR DUPLICATION, N FOR SPECIATION
$child_name = $child->internalID;
if ($VERBOSE == 1) { print "lca_name $lca_name has child node $child_name of type $child_type family $AC\n";}
if ($child_type eq 'Y') { $speciations = 0;}
}
else
{
$child_name = $child->internalID;
if ($VERBOSE == 1) { print "lca_name $lca_name has child sequence $child_name family $AC\n";}
}
}
# CHECK IF THE PARENT NODE OF $lca IS A DUPLICATION OR SPECIATION NODE:
# NODE THAT $lca MAY NOT HAVE A PARENT NODE IF IT IS THE ROOT NODE IN THE TREE, eg. SEE TF106501 ROOT NODE.
if (defined($lca->parent))
{
$parent = $lca->parent;
$parent_name = $parent->internalID;
$parent_type = $parent->get_tag_value('D'); # Y FOR DUPLICATION, N FOR SPECIATION
if ($parent_type eq 'Y') { $speciations = 0;}
if ($VERBOSE == 1) { print "lca_name $lca_name has parent $parent_name of type $parent_type family $AC\n";}
}
# DECIDE IF WE WILL TAKE THIS CASE OR NOT. WE TAKE IT IF:
# (i) THE BOOTSTRAP FOR NODE $lca IS HIGH (>=70) OR
# (ii) THE PARENT AND CHILDREN NODES OF $lca ARE SPECIATION NODES
if ($bootstrap >= 70 || $speciations == 1)
{
print "TAKE: family $AC human genes $human_gene_i $human_gene_j ancestral_taxonomic_level=$species bootstrap=$bootstrap speciations=$speciations\n";
}
else
{
print "DISCARD: family $AC human genes $human_gene_i $human_gene_j ancestral_taxonomic_level=$species bootstrap=$bootstrap speciations=$speciations\n";
}
}
}
}
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment