Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 14:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5064939 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5064939 to your computer and use it in GitHub Desktop.
Perl script that prints out all the protein sequences in a particular TreeFam family
#!/usr/local/bin/perl
#
# Perl script get_treefam_family_seqs.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 19-Nov-08.
# Updated 19-Nov-08.
#
# For the TreeFam project.
#
# This perl script prints out all the protein sequences
# for a TreeFam-6 family.
#
# The command-line format is:
# % perl <get_treefam_family_seqs.pl> family
# where family is the family identifier.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of get_treefam_family_seqs.pl\n\n";
print "perl get_treefam_family_seqs.pl <family>\n";
print "where <family> is the family identifier.\n";
print "For example, >perl -w get_treefam_family_seqs.pl TF101073\n";
exit;
}
use DBI;
use lib "/home/bcri/acoghlan/perlmodulesAvril/";
use Treefam::DBConnection;
# FIND THE NAME OF THE INPUT FAMILY:
$family_id = $ARGV[0];
$release = 6; # THE TREEFAM RELEASE WE WANT TO USE.
#------------------------------------------------------------------#
# GET ALL THE GENES IN THIS FAMILY:
$dbc = Treefam::DBConnection->new();
$famh = $dbc->get_FamilyHandle();
$family = $famh->get_by_id($family_id);
if (!(defined($family)))
{
print "Do not have $family_id in the database...\n";
exit;
}
my $tree = $family->get_tree('clean');
# FIRST PRINT ALL THE SEQUENCES IN THE FAMILY:
my @leaves = $tree->get_leaves;
%SEEN = ();
foreach my $leaf(@leaves)
{
if (defined($leaf->sequence_id())) # FOR SOME LEAVES, WE DON'T SEE TO HAVE
# A SEQUENCE ID, eg. SEE RICE GENE IN http://www.treefam.org/cgi-bin/TFinfo.pl?ac=TF101025
{
my $id = $leaf->sequence_id();
if (!($SEEN{$id}))
{
$SEEN{$id} = 1;
@ids = (@ids,$id);
}
}
}
# GET THE AMINO ACID SEQUENCES FOR ALL THE TRANSCRIPTS IN THE FAMILY:
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
$dbh = DBI->connect("$database", 'anonymous', '') || return;
$table_w = 'aa_seq';
for ($i = 0; $i <= $#ids; $i++)
{
$id = $ids[$i];
$st = "SELECT SEQ from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($id) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$seq = $array[0];
# REMOVE STOP CODONS ('*') FROM THE END OF THE SEQUENCE:
if (substr($seq,length($seq)-1,1) eq '*')
{
chop($seq);
}
print ">$id\n";
print "$seq\n";
}
}
else
{
print STDERR "ERROR: did not find seq for $id in the database\n";
exit;
}
}
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment