Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 16:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065591 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065591 to your computer and use it in GitHub Desktop.
Perl script that, for a particular TreeFam family of interest (that has human paralogs), prints out the DNA alignment for the family, with the position of introns shown with respect to the DNA alignment.
#!/usr/local/bin/perl
#
# Perl script find_human_paralogs_alns.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 29-Aug-07.
#
# For Christine Bird.
# This gets the DNA alignments for TreeFam families
# that have within-species paralogs.
#
# The command-line format is:
# % perl <find_human_paralogs_alns.pl> <families> <the_family>
# where <families> is the file with the families to take,
# <the_family> is the family we are interested in.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 2)
{
print "Usage of find_human_paralogs_alns.pl\n\n";
print "perl find_human_paralogs_alns.pl <families> <the_family>\n";
print "where <families> is the file with the families to take,\n";
print " <the_family> is the family we are interested in.\n";
print "For example, >perl find_human_paralogs_alns.pl\n";
print "human_paralogs_3aug07 TF101011\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/";
use Avril_modules;
use Treefam::DBConnection;
use DBI;
# FIND THE FILE WITH THE FAMILIES TO TAKE:
$families = $ARGV[0];
# FIND THE FAMILY WE ARE INTERESTED IN:
$the_family = $ARGV[1];
#------------------------------------------------------------------#
# IF $VERBOSE == 1, PRINT OUT EXTRA INFORMATION:
$VERBOSE = 0;
$dbc = Treefam::DBConnection->new();
%SEEN = ();
# READ IN THE LIST OF FAMILIES THAT WE ARE INTERESTED IN:
open(FAMILIES,"$families") || die "ERROR: cannot open $families\n";
while(<FAMILIES>)
{
$line = $_;
chomp $line;
if ($line =~ /TAKE:/)
{
@temp = split(/\s+/,$line);
$family = $temp[2];
if ($SEEN{$family} || $family ne $the_family) { goto NEXT_FAMILY;}
$SEEN{$family} = 1;
print "==================================================================\n";
print "==================================================================\n";
print "FAMILY $family:\n";
$famh = $dbc->get_FamilyHandle();
$family1 = $famh->get_by_id($family);
$tree = $family1->get_tree('clean'); # GET THE TREEFAM CLEAN TREE.
# GET THE ALIGNMENT FOR THIS FAMILY:
$aln = $tree->get_alignment('nt');
# MAP THE INTRON POSITIONS ONTO THE ALIGNMENT FOR THIS FAMILY:
&map_introns_to_alns($aln,$family);
}
NEXT_FAMILY:
}
close(FAMILIES);
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# MAP THE INTRON POSITIONS ONTO THE ALIGNMENT FOR THIS FAMILY:
sub map_introns_to_alns
{
my $aln = $_[0];
my $family = $_[1];
my @temp;
my $i;
my $gene;
my $line;
my @temp2;
my @genes;
my $all_introns;
my %ALN = ();
# FIND THE NAMES OF THE GENES IN THE ALIGNMENT:
@temp = split(/\n+/,$aln);
for ($i = 0; $i <= $#temp; $i++)
{
$line = $temp[$i];
if (substr($line,0,1) eq '>')
{
@temp2 = split(/\s+/,$line);
$gene = $temp2[0];
$gene = substr($gene,1,length($gene)-1);
@genes = (@genes,$gene);
}
else
{
if (!($ALN{$gene})) { $ALN{$gene} = $line."\n"; }
else { $ALN{$gene} = $ALN{$gene}.$line."\n";}
}
}
# GET THE TREEFAM GENE NAMES FOR THE TRANSCRIPTS:
$TREEFAM_ID = &get_treefam_ids(\@genes);
# READ THE ALIGNMENT, TO GET THE POSITIONS OF EACH AMINO ACID:
($ALN_POS) = &read_aln($aln);
# GET THE POSITIONS OF INTRONS IN THE GENES:
$all_introns = &get_intron_positions(\@genes,$ALN_POS);
# PRINT OUT THE ALIGNMENT WITH THE INTRONS SHOWN:
&print_aln($aln,$all_introns,\@genes,\%ALN,$TREEFAM_ID);
}
#------------------------------------------------------------------#
# PRINT OUT THE ALIGNMENT WITH THE INTRONS SHOWN:
sub print_aln
{
my $aln = $_[0];
my $introns = $_[1];
my $genes = $_[2];
my $ALN = $_[3];
my $TREEFAM_ID = $_[4];
my $length;
my $i;
my $j;
my $gene;
my $gene_aln;
my $k;
my @temp;
my $line;
my $char;
my @introns = split(/\,/,$introns);
my %INTRON = ();
my %POS = ();
my $pos;
my $geneid;
# RECORD WHERE TO PUT INTRONS:
for ($i = 0; $i <= $#introns; $i++)
{
$intron = $introns[$i];
@temp = split(/_/,$intron);
$gene = $temp[0];
$pos = $temp[1];
$INTRON{$gene."_".$pos} = 1;
}
# FIND OUT THE NUMBER OF LINES IN THE ALIGNMENT:
if (!($ALN->{$genes->[0]})) { print STDERR "ERROR: no alignment for $genes->[0]\n"; exit;}
$gene_aln = $ALN->{$genes->[0]};
@temp = split(/\n+/,$gene_aln);
$length = $#temp + 1;
# PRINT OUT THE ALIGNMENT LINE BY LINE:
for ($i = 0; $i < $length; $i++)
{
# PRINT OUT EACH GENE:
for ($j = 0; $j < @$genes; $j++)
{
$gene = $genes->[$j];
if (substr($gene,0,4) ne 'ENST') { goto NEXT_GENE1;}
if (!($TREEFAM_ID->{$gene})) { print STDERR "ERROR: do not know gene name for $gene\n"; exit;}
$geneid = $TREEFAM_ID->{$gene};
if (!($ALN->{$gene})) { print STDERR "ERROR: no aln for $gene\n"; exit;}
$gene_aln = $ALN->{$gene};
@temp = split(/\n+/,$gene_aln);
if ($i <= $#temp)
{
$line = $temp[$i];
if ($line ne '')
{
for ($k = 1; $k <= length($line); $k++)
{
$char = substr($line,$k-1,1);
if ($char ne '')
{
print "$char";
if (!($POS{$gene})) { $POS{$gene} = 1;}
else { $POS{$gene}++; }
$pos = $POS{$gene};
if ($INTRON{$gene."_".$pos})
{
print "*";
}
else
{
print " ";
}
}
}
}
print " : $geneid\n";
}
NEXT_GENE1:
}
print "\n";
}
}
#------------------------------------------------------------------#
# READ IN THE INPUT ALIGNMENT FILE, TO GET THE NAMES OF THE PROTEINS:
sub read_aln
{
my $aln = $_[0];
my $line;
my @temp;
my $gene;
my @genes;
my $length;
my $i;
my $char;
my $prot_pos;
my $aln_pos;
my %ALN_POS = ();
my $no_genes = 0;
my @aln = split(/\n+/,$aln);
my $j;
for ($j = 0; $j <= $#aln; $j++)
{
$line = $aln[$j];
chomp $line;
if (substr($line,0,1) eq '>')
{
@temp = split(/\s+/,$line);
$gene = $temp[0];
$gene = substr($gene,1,length($gene)-1);
@genes = (@genes,$gene);
$no_genes++;
$prot_pos = 0;
$aln_pos = 0;
}
else
{
$length = length($line);
for ($i = 0; $i < $length; $i++)
{
$char = substr($line,$i,1);
$aln_pos++;
if ($char ne '-')
{
$prot_pos++;
if ($ALN_POS{$gene."_".$prot_pos}) { print STDERR "ERROR: already have aln pos for $gene $prot_pos.\n"; exit;}
$ALN_POS{$gene."_".$prot_pos} = $aln_pos;
}
}
}
}
return(\%ALN_POS);
}
#------------------------------------------------------------------#
# GET THE TREEFAM IDs FOR THE GENES:
sub get_treefam_ids
{
my $genes = $_[0];
my %ID = ();
my $i;
my $gene;
my $dbh;
my $table_w;
my $st;
my $sth;
my $rv;
my $id;
my @array;
my $rc;
my $gid;
# FIND THE ID OF EACH GENE IN THE TREEFAM MYSQL DATABASE:
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS:
$table_w = 'genes';
for ($i = 0; $i < @$genes; $i++)
{
$gene = $genes->[$i];
$st = "SELECT ID,GID from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$id = $array[0]; # eg., ENSMUST00000049178.2
$gid = $array[1];
$ID{$gene} = $gid;
}
}
if (!($ID{$gene}))
{
$st = "SELECT ID from $table_w WHERE TID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$id = $array[0]; # eg., ENSMUST00000049178.2
$ID{$gene} = $id;
}
}
}
}
$rc = $dbh->disconnect();
$rc = "";
return(\%ID);
}
#------------------------------------------------------------------#
# GET THE POSITIONS OF INTRONS IN THE GENES:
sub get_intron_positions
{
my $genes = $_[0];
my $ALN_POS = $_[1];
my $dbh;
my $table_w;
my $i;
my $gene;
my $id;
my $st;
my $sth;
my $rv;
my @array;
my $strand;
my $start;
my $stop;
my $introns;
my $all_introns = "";
my $rc;
my $cstart;
my $cstop;
# FIND THE POSITION OF EACH GENE IN THE TREEFAM MYSQL DATABASE:
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS:
$table_w = 'map';
for ($i = 0; $i < @$genes; $i++)
{
$gene = $genes->[$i];
if (substr($gene,0,4) ne 'ENST') { goto NEXT_GENE;} # ONLY LOOK AT HUMAN GENES.
$id = $gene;
$st = "SELECT STRAND,START,STOP,C_START,C_STOP from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($id) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$strand = $array[0]; # eg., +
$start = $array[1]; # eg., 144225,144806,145261,145497,
$stop = $array[2]; # eg., 144318,145216,145444,145804,
$cstart = $array[3];
$cstop = $array[4];
$start = &remove_noncoding($start,$cstart,$cstop,"start");
$stop = &remove_noncoding($stop,$cstart,$cstop,"stop");
# FIND THE INTRONS IN THIS GENE:
$introns = &infer_intron_positions($strand,$start,$stop,$gene,$ALN_POS);
if ($introns ne "") { $all_introns = $all_introns.",".$introns;}
}
}
NEXT_GENE:
}
$rc = $dbh->disconnect();
$rc = "";
$all_introns = substr($all_introns,1,length($all_introns)-1);
return($all_introns);
}
#------------------------------------------------------------------#
# SUBROUTINE TO REMOVE NON-CODING REGIONS FROM THE EXONS:
sub remove_noncoding
{
my $exons = $_[0];
my $cstart = $_[1];
my $cstop = $_[2];
my $type = $_[3];
my $new_exons = "";
chop($exons);
my @exons = split(/\,/,$exons);
my $i;
my $seen_start = 0;
my $seen_end = 0;
for ($i = 0; $i <= $#exons; $i++)
{
if ($exons[$i] >= $cstart && $exons[$i] <= $cstop)
{
if ($i == 0) { $seen_start = 1;}
elsif ($i == $#exons) { $seen_end = 1;}
$new_exons = $new_exons.",".$exons[$i];
if ($exons[$i] == $cstart && $seen_start == 0) { $seen_start = 1;}
if ($exons[$i] == $cstop && $seen_end == 0) { $seen_end = 1;}
}
elsif ($exons[$i] <= $cstart)
{
# ignore this exon.
}
elsif ($exons[$i] >= $cstop)
{
# ignore this exon.
}
}
$new_exons = substr($new_exons,1,length($new_exons)-1);
if ($seen_start == 0 && $type eq 'start') { $new_exons = $cstart.",".$new_exons;}
if ($seen_end == 0 && $type eq 'stop') { $new_exons = $new_exons.",".$cstop; }
if (substr($new_exons,0,1) eq ',') { $new_exons = substr($new_exons,1,length($new_exons)-1);}
if (substr($new_exons,length($new_exons)-1,1) eq ',') { chop($new_exons); }
return($new_exons);
}
#------------------------------------------------------------------#
# INFER THE INTRON POSITIONS IN A GENE:
sub infer_intron_positions
{
my $strand = $_[0];
my $start = $_[1]; # 144225,144806,145261,145497,
my $stop = $_[2]; # 144318,145216,145444,145804,
my $gene = $_[3];
my $ALN_POS = $_[4];
my @start = split(/\,/,$start);
my @stop = split(/\,/,$stop);
my $i;
my $exon_start;
my $exon_stop;
my $coding_length;
my $hangover;
my $intron_phase;
my $prev_aa_pos = 0;
my @temp;
my $aln_pos;
my $hmm_pos;
my $intron;
my $introns = "";
if ($#start != $#stop)
{
print STDERR "ERROR: gene $gene : start $#start ($start) stop $#stop ($stop).\n";
exit;
}
if ($VERBOSE == 1) { print "$gene-------------------------------------------------------\n";}
if ($VERBOSE == 1) { print "gene $gene : start $start stop $stop\n";}
if ($strand eq '+')
{
$coding_length = 0;
for ($i = 0; $i <= $#start; $i++)
{
# FIND THE COORDINATES OF EXON $i:
$exon_start = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB.
$exon_stop = $stop[$i];
# CALCULATE THE CODING LENGTH SO FAR:
$coding_length = $coding_length + ($exon_stop - $exon_start + 1);
# WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID:
$prev_aa_pos = $coding_length;
@temp = split(/\./,$prev_aa_pos);
$prev_aa_pos = $temp[0];
# LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS:
$hangover = 0;
if ($hangover == 0) { $intron_phase = 0;}
elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1).
elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2).
if ($i != $#start)
{
# FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT:
if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;}
$aln_pos = $ALN_POS->{$gene."_".$prev_aa_pos};
$intron = $gene."_".$aln_pos."_".$intron_phase;
$introns = $introns.",".$intron;
if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos intron_phase $intron_phase\n";}
}
}
}
elsif ($strand eq '-')
{
$coding_length = 0;
for ($i = $#start; $i >= 0; $i--)
{
# FIND THE COORDINATES OF EXON $i:
$exon_start = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB.
$exon_stop = $stop[$i];
# CALCULATE THE CODING LENGTH SO FAR:
$coding_length = $coding_length + ($exon_stop - $exon_start + 1);
# WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID:
$prev_aa_pos = $coding_length;
@temp = split(/\./,$prev_aa_pos);
$prev_aa_pos = $temp[0];
# LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS:
$hangover = 0;
if ($hangover == 0) { $intron_phase = 0;}
elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1).
elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2).
if ($i != 0)
{
# FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT:
if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;}
$aln_pos = $ALN_POS->{$gene."_".$prev_aa_pos};
$intron = $gene."_".$aln_pos."_".$intron_phase;
$introns = $introns.",".$intron;
if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos intron_phase $intron_phase\n";}
}
}
}
else
{
print STDERR "ERROR: strand $strand.\n"; exit;
}
if ($introns ne '') { $introns = substr($introns,1,length($introns)-1);}
return($introns);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment