Created
March 1, 2013 15:56
-
-
Save avrilcoghlan/5065559 to your computer and use it in GitHub Desktop.
Perl script that retrieves trees from the TreeFam database, and infers human within-species paralogs from the trees.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script find_human_paralogs.pl | |
# Written by Avril Coghlan (alc@sanger.ac.uk) | |
# 31-Jul-07. | |
# | |
# For Christine Bird. | |
# This perl script reads in all trees in TreeFam-4 | |
# and finds human within-species paralogs. | |
# | |
# The command-line format is: | |
# % perl <find_human_paralogs.pl> | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 0) | |
{ | |
print "Usage of find_human_paralogs.pl\n\n"; | |
print "perl find_human_paralogs.pl\n"; | |
print "For example, >perl find_human_paralogs.pl\n"; | |
exit; | |
} | |
# READ IN MY PERL MODULES: | |
BEGIN { | |
unshift (@INC, '/nfs/team71/phd/alc/perl/modules'); | |
} | |
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/"; | |
use Avril_modules; | |
use Treefam::DBConnection; | |
use DBI; | |
#------------------------------------------------------------------# | |
# IF $VERBOSE == 1, PRINT OUT EXTRA INFORMATION: | |
$VERBOSE = 0; | |
# GET A LIST OF ALL FAMILIES IN TREEFAM, INCLUDING TF1 (TREEFAM-A), TF3 | |
# (TREEFAM-B) AND TF5 (TREEFAM-C) FAMILIES: | |
print STDERR "Finding TreeFam-A families... \n"; | |
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return; | |
# GET ALL THE TREEFAM-A FAMILIES: | |
$table_w = "familyA"; | |
$st = "SELECT AC from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
@families = (@families,$AC); | |
if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n";} | |
} | |
} | |
print STDERR "Finding TreeFam-B and TreeFam-C families... \n"; | |
# GET ALL THE TREEFAM-B AND TREEFAM-C FAMILIES: | |
$table_w = "familyB"; | |
$st = "SELECT AC from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
@families = (@families,$AC); | |
if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n";} | |
} | |
} | |
#------------------------------------------------------------------# | |
$dbc = Treefam::DBConnection->new(); | |
# FIND PARALOGS WITHIN EACH FAMILY: | |
foreach my $treefam_family(@families) | |
{ | |
print STDERR "Looking at family $treefam_family... \n"; | |
$famh = $dbc->get_FamilyHandle(); | |
$family = $famh->get_by_id($treefam_family); | |
$AC = $family->ID(); # GET THE FAMILY ID. | |
if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;} | |
$no_families++; | |
print "\n$no_families: reading family $AC...\n"; | |
# GET THE TREE FOR THE FAMILY: | |
$tree = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE. | |
# GET ALL THE HUMAN TRANSCRIPTS IN THIS FAMILY: | |
@human_nodes = &Avril_modules::empty_array(@human_nodes); | |
@human_genes = &Avril_modules::empty_array(@human_genes); | |
@nodes = $tree->get_leaves; | |
foreach $node(@nodes) | |
{ | |
# FIND THE SPECIES FOR THIS TRANSCRIPT: | |
$species = $node->get_tag_value('S'); | |
$species =~ tr/[a-z]/[A-Z]/; | |
if ($species eq 'HOMO SAPIENS' || $species eq 'HUMAN') | |
{ | |
@human_nodes = (@human_nodes,$node); | |
# FIND THE GENE FOR THIS TRANSCRIPT: | |
$gh = $node->gene; | |
$gene = $gh->ID(); # eg. ENSMUSG00000022770 | |
@human_genes = (@human_genes,$gene); | |
} | |
} | |
# FOR EACH PAIR OF HUMAN GENES IN THE TREE, SEE | |
# WHEN THE DUPLICATION OCCURRED: | |
for ($i = 0; $i <= $#human_nodes; $i++) | |
{ | |
$human_node_i = $human_nodes[$i]; | |
$human_gene_i = $human_genes[$i]; | |
for ($j = $i+1; $j <= $#human_nodes; $j++) | |
{ | |
$human_node_j = $human_nodes[$j]; | |
$human_gene_j = $human_genes[$j]; | |
# FIND THE LAST COMMON ANCESTOR NODE OF $human_node_j AND $human_node_i: | |
$lca = $tree->get_last_common_ancestor($human_node_j,$human_node_i); | |
$lca_name = $lca->internalID; | |
# FIND THE TAXONOMIC LEVEL OF THE LAST COMMON ANCESTOR NODE: | |
$species = $lca->get_tag_value('S'); | |
$species =~ tr/[a-z]/[A-Z]/; | |
# GET THE BOOTSTRAP OF THE LAST COMMON ANCESTOR NODE: | |
$bootstrap = $lca->get_tag_value('B'); | |
# CHECK IF THE CHILDREN AND PARENT NODES OF $lca ARE SPECIATION NODES: | |
$speciations = 1; | |
# CHECK IF THE CHILDREN NODES OF $lca ARE DUPLICATION OR SPECIATION NODES: | |
foreach $child ($lca->children) | |
{ | |
if (!($child->is_leaf)) # IF THIS IS NOT A SEQUENCE | |
{ | |
$child_type = $child->get_tag_value('D'); # Y FOR DUPLICATION, N FOR SPECIATION | |
$child_name = $child->internalID; | |
if ($VERBOSE == 1) { print "lca_name $lca_name has child node $child_name of type $child_type family $AC\n";} | |
if ($child_type eq 'Y') { $speciations = 0;} | |
} | |
else | |
{ | |
$child_name = $child->internalID; | |
if ($VERBOSE == 1) { print "lca_name $lca_name has child sequence $child_name family $AC\n";} | |
} | |
} | |
# CHECK IF THE PARENT NODE OF $lca IS A DUPLICATION OR SPECIATION NODE: | |
# NODE THAT $lca MAY NOT HAVE A PARENT NODE IF IT IS THE ROOT NODE IN THE TREE, eg. SEE TF106501 ROOT NODE. | |
if (defined($lca->parent)) | |
{ | |
$parent = $lca->parent; | |
$parent_name = $parent->internalID; | |
$parent_type = $parent->get_tag_value('D'); # Y FOR DUPLICATION, N FOR SPECIATION | |
if ($parent_type eq 'Y') { $speciations = 0;} | |
if ($VERBOSE == 1) { print "lca_name $lca_name has parent $parent_name of type $parent_type family $AC\n";} | |
} | |
# DECIDE IF WE WILL TAKE THIS CASE OR NOT. WE TAKE IT IF: | |
# (i) THE BOOTSTRAP FOR NODE $lca IS HIGH (>=70) OR | |
# (ii) THE PARENT AND CHILDREN NODES OF $lca ARE SPECIATION NODES | |
if ($bootstrap >= 70 || $speciations == 1) | |
{ | |
print "TAKE: family $AC human genes $human_gene_i $human_gene_j ancestral_taxonomic_level=$species bootstrap=$bootstrap speciations=$speciations\n"; | |
} | |
else | |
{ | |
print "DISCARD: family $AC human genes $human_gene_i $human_gene_j ancestral_taxonomic_level=$species bootstrap=$bootstrap speciations=$speciations\n"; | |
} | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment