avrilcoghlan/get_treefam_family_seqs2.pl

## get_treefam_family_seqs2.pl
#!/usr/local/bin/perl

#
# Perl script get_treefam_family_seqs.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 19-Nov-08.
# Updated 19-Nov-08.
#
# For the TreeFam project.
#
# This perl script prints out all the protein sequences
# for a TreeFam-6 family.
#
# The command-line format is:
# % perl <get_treefam_family_seqs.pl> family
# where family is the family identifier.
#
#------------------------------------------------------------------#

# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

$num_args                  = $#ARGV + 1;
if ($num_args != 1)
{
   print "Usage of get_treefam_family_seqs.pl\n\n";
   print "perl get_treefam_family_seqs.pl <family>\n";
   print "where <family> is the family identifier.\n";
   print "For example, >perl -w get_treefam_family_seqs.pl TF101073\n";
   exit;
}

use DBI;
use lib "/home/bcri/acoghlan/perlmodulesAvril/";
use Treefam::DBConnection;

# FIND THE NAME OF THE INPUT FAMILY:

$family_id                 = $ARGV[0];

$release                   = 6; # THE TREEFAM RELEASE WE WANT TO USE.

#------------------------------------------------------------------#

# GET ALL THE GENES IN THIS FAMILY:

$dbc                       =  Treefam::DBConnection->new();
$famh                      = $dbc->get_FamilyHandle();
$family                    = $famh->get_by_id($family_id);
if (!(defined($family)))
{
  print "Do not have $family_id in the database...\n";
  exit;
}

my $tree                   = $family->get_tree('clean');
# FIRST PRINT ALL THE SEQUENCES IN THE FAMILY:
my @leaves                 = $tree->get_leaves;
%SEEN                      = ();
foreach my $leaf(@leaves)
{
   if (defined($leaf->sequence_id())) # FOR SOME LEAVES, WE DON'T SEE TO HAVE
   # A SEQUENCE ID, eg. SEE RICE GENE IN http://www.treefam.org/cgi-bin/TFinfo.pl?ac=TF101025
   {
      my $id               = $leaf->sequence_id();
      if (!($SEEN{$id}))
      {
   $SEEN{$id}        = 1;
         @ids              = (@ids,$id);
      }
   }
}

# GET THE AMINO ACID SEQUENCES FOR ALL THE TRANSCRIPTS IN THE FAMILY:
$database                  = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
$dbh                       = DBI->connect("$database", 'anonymous', '') || return;
$table_w                   = 'aa_seq';

for ($i = 0; $i <= $#ids; $i++)
{
   $id                     = $ids[$i];
   $st                     = "SELECT SEQ from $table_w WHERE ID=?";
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute($id) or die "Cannot execute the query: $sth->errstr";
   if ($rv >= 1)
   {
      while ((@array) = $sth->fetchrow_array)
      {
         $seq                 = $array[0];
	 # REMOVE STOP CODONS ('*') FROM THE END OF THE SEQUENCE:
	 if (substr($seq,length($seq)-1,1) eq '*')
	 {
	    chop($seq);
	 }
	 print ">$id\n";
	 print "$seq\n";
      }
   }
   else
   {
      print STDERR "ERROR: did not find seq for $id in the database\n";
      exit;
   }
}
$rc                        = $dbh->disconnect();
$rc                        = "";


#------------------------------------------------------------------#

print STDERR "FINISHED.\n";
	#!/usr/local/bin/perl

	#
	# Perl script get_treefam_family_seqs.pl
	# Written by Avril Coghlan (alc@sanger.ac.uk)
	# 19-Nov-08.
	# Updated 19-Nov-08.
	#
	# For the TreeFam project.
	#
	# This perl script prints out all the protein sequences
	# for a TreeFam-6 family.
	#
	# The command-line format is:
	# % perl <get_treefam_family_seqs.pl> family
	# where family is the family identifier.
	#
	#------------------------------------------------------------------#

	# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

	$num_args = $#ARGV + 1;
	if ($num_args != 1)
	{
	print "Usage of get_treefam_family_seqs.pl\n\n";
	print "perl get_treefam_family_seqs.pl <family>\n";
	print "where <family> is the family identifier.\n";
	print "For example, >perl -w get_treefam_family_seqs.pl TF101073\n";
	exit;
	}

	use DBI;
	use lib "/home/bcri/acoghlan/perlmodulesAvril/";
	use Treefam::DBConnection;

	# FIND THE NAME OF THE INPUT FAMILY:

	$family_id = $ARGV[0];

	$release = 6; # THE TREEFAM RELEASE WE WANT TO USE.

	#------------------------------------------------------------------#

	# GET ALL THE GENES IN THIS FAMILY:

	$dbc = Treefam::DBConnection->new();
	$famh = $dbc->get_FamilyHandle();
	$family = $famh->get_by_id($family_id);
	if (!(defined($family)))
	{
	print "Do not have $family_id in the database...\n";
	exit;
	}

	my $tree = $family->get_tree('clean');
	# FIRST PRINT ALL THE SEQUENCES IN THE FAMILY:
	my @leaves = $tree->get_leaves;
	%SEEN = ();
	foreach my $leaf(@leaves)
	{
	if (defined($leaf->sequence_id())) # FOR SOME LEAVES, WE DON'T SEE TO HAVE
	# A SEQUENCE ID, eg. SEE RICE GENE IN http://www.treefam.org/cgi-bin/TFinfo.pl?ac=TF101025
	{
	my $id = $leaf->sequence_id();
	if (!($SEEN{$id}))
	{
	$SEEN{$id} = 1;
	@ids = (@ids,$id);
	}
	}
	}

	# GET THE AMINO ACID SEQUENCES FOR ALL THE TRANSCRIPTS IN THE FAMILY:
	$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
	$dbh = DBI->connect("$database", 'anonymous', '') \|\| return;
	$table_w = 'aa_seq';

	for ($i = 0; $i <= $#ids; $i++)
	{
	$id = $ids[$i];
	$st = "SELECT SEQ from $table_w WHERE ID=?";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute($id) or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$seq = $array[0];
	# REMOVE STOP CODONS ('*') FROM THE END OF THE SEQUENCE:
	if (substr($seq,length($seq)-1,1) eq '*')
	{
	chop($seq);
	}
	print ">$id\n";
	print "$seq\n";
	}
	}
	else
	{
	print STDERR "ERROR: did not find seq for $id in the database\n";
	exit;
	}
	}
	$rc = $dbh->disconnect();
	$rc = "";


	#------------------------------------------------------------------#

	print STDERR "FINISHED.\n";