Created
March 1, 2013 14:18
-
-
Save avrilcoghlan/5064939 to your computer and use it in GitHub Desktop.
Perl script that prints out all the protein sequences in a particular TreeFam family
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script get_treefam_family_seqs.pl | |
# Written by Avril Coghlan (alc@sanger.ac.uk) | |
# 19-Nov-08. | |
# Updated 19-Nov-08. | |
# | |
# For the TreeFam project. | |
# | |
# This perl script prints out all the protein sequences | |
# for a TreeFam-6 family. | |
# | |
# The command-line format is: | |
# % perl <get_treefam_family_seqs.pl> family | |
# where family is the family identifier. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of get_treefam_family_seqs.pl\n\n"; | |
print "perl get_treefam_family_seqs.pl <family>\n"; | |
print "where <family> is the family identifier.\n"; | |
print "For example, >perl -w get_treefam_family_seqs.pl TF101073\n"; | |
exit; | |
} | |
use DBI; | |
use lib "/home/bcri/acoghlan/perlmodulesAvril/"; | |
use Treefam::DBConnection; | |
# FIND THE NAME OF THE INPUT FAMILY: | |
$family_id = $ARGV[0]; | |
$release = 6; # THE TREEFAM RELEASE WE WANT TO USE. | |
#------------------------------------------------------------------# | |
# GET ALL THE GENES IN THIS FAMILY: | |
$dbc = Treefam::DBConnection->new(); | |
$famh = $dbc->get_FamilyHandle(); | |
$family = $famh->get_by_id($family_id); | |
if (!(defined($family))) | |
{ | |
print "Do not have $family_id in the database...\n"; | |
exit; | |
} | |
my $tree = $family->get_tree('clean'); | |
# FIRST PRINT ALL THE SEQUENCES IN THE FAMILY: | |
my @leaves = $tree->get_leaves; | |
%SEEN = (); | |
foreach my $leaf(@leaves) | |
{ | |
if (defined($leaf->sequence_id())) # FOR SOME LEAVES, WE DON'T SEE TO HAVE | |
# A SEQUENCE ID, eg. SEE RICE GENE IN http://www.treefam.org/cgi-bin/TFinfo.pl?ac=TF101025 | |
{ | |
my $id = $leaf->sequence_id(); | |
if (!($SEEN{$id})) | |
{ | |
$SEEN{$id} = 1; | |
@ids = (@ids,$id); | |
} | |
} | |
} | |
# GET THE AMINO ACID SEQUENCES FOR ALL THE TRANSCRIPTS IN THE FAMILY: | |
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308"; | |
$dbh = DBI->connect("$database", 'anonymous', '') || return; | |
$table_w = 'aa_seq'; | |
for ($i = 0; $i <= $#ids; $i++) | |
{ | |
$id = $ids[$i]; | |
$st = "SELECT SEQ from $table_w WHERE ID=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($id) or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$seq = $array[0]; | |
# REMOVE STOP CODONS ('*') FROM THE END OF THE SEQUENCE: | |
if (substr($seq,length($seq)-1,1) eq '*') | |
{ | |
chop($seq); | |
} | |
print ">$id\n"; | |
print "$seq\n"; | |
} | |
} | |
else | |
{ | |
print STDERR "ERROR: did not find seq for $id in the database\n"; | |
exit; | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment