Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 16:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065775 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065775 to your computer and use it in GitHub Desktop.
Perl script that reads in an alignment file corresponding to a HMM, and retrieves the positions of introns in the genes from the TreeFam database, and figures out the positions of introns with respect to the HMM columns.
#!/usr/local/bin/perl
#
# Perl script map_introns_to_HMM.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 26-Apr-07.
#
# For the TreeFam project.
#
# This perl script reads in an alignment file corresponding
# to a HMM:
# (i) gets the positions of introns in each gene, by
# reading them from the TreeFam mysql database,
# (ii) figures the positions of introns in the alignment,
# (iii) figures out the positions of introns in the HMM,
# by using the mapping file produced by hmmbuild.
#
# The command-line format is:
# % perl <map_introns_to_HMM.pl> aln map output
# where aln is the input alignment file,
# map is the mapping file produced by hmmbuild,
# output is the output file name.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 3)
{
print "Usage of map_introns_to_HMM.pl\n\n";
print "perl map_introns_to_HMM.pl <aln> <map> <output>\n";
print "where <aln> is the input alignment file,\n";
print " <map> is the mapping file produced by hmmbuild,\n";
print " <output> is the output file name.\n";
print "For example, >perl map_introns_to_HMM.pl new_TF101002_fam.aln.pep\n";
print "new_TF101002_fam.aln.pep.map new_TF101002_fam.introns\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}
use Avril_modules;
use DBI;
# FIND THE NAME OF THE ALIGNMENT FILE:
$aln = $ARGV[0];
# FIND THE NAME OF THE MAPPING FILE:
$map = $ARGV[1];
# FIND THE OUTPUT FILE NAME:
$output = $ARGV[2];
open(OUTPUT,">$output") || die "ERROR: cannot open $output.\n";
$VERBOSE = 0;
#------------------------------------------------------------------#
# READ IN THE INPUT ALIGNMENT FILE, TO GET THE NAMES OF THE PROTEINS:
($genes,$ALN_POS,$no_genes)= &read_aln($aln);
# READ IN THE MAPPING OF THE ALIGNMENT TO THE HMM:
$HMM_POS = &read_map($map);
# GET THE TREEFAM IDs FOR THESE GENES:
$TREEFAM_ID = &get_treefam_ids($genes);
# GET THE POSITIONS OF INTRONS IN EACH OF THE GENES:
$all_introns = &get_intron_positions($genes,$TREEFAM_ID,$ALN_POS,$HMM_POS);
# PRINT OUT THE INTRON POSITION FILE:
&print_intron_file($all_introns,$no_genes);
close(OUTPUT);
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# SORT INTRONS BY POSITION IN THE HMM:
sub by_pos
{
$POS{$a} <=> $POS{$b}
or
$a cmp $b;
}
#------------------------------------------------------------------#
# PRINT OUT THE INTRON POSITION FILE:
sub print_intron_file
{
my $all_introns = $_[0];
my $no_genes = $_[1];
my @introns;
my $intron;
my $i;
my @temp;
my $pos;
my $phase;
%POS = ();
my %SEEN = ();
my %COUNT = ();
my $count;
my @introns2;
my $fraction;
my $last_intron;
my $last_pos;
my $intron0;
my $intron1;
my $intron2;
my $count0;
my $count1;
my $count2;
my $frac0;
my $frac1;
my $frac2;
if ($no_genes <= 0) { print STDERR "ERROR: no_genes $no_genes.\n"; exit;}
print OUTPUT "# intron position file\n";
print OUTPUT "# columns: position phase 0 phase 1 phase 2\n";
if ($all_introns ne '')
{
$all_introns = substr($all_introns,1,length($all_introns)-1);
@introns = split(/\,/,$all_introns);
for ($i = 0; $i <= $#introns; $i++)
{
$intron = $introns[$i];
@temp = split(/_/,$intron);
$pos = $temp[0]; # POSITION OF INTRON WRT HMMER HMM.
$POS{$intron} = $pos;
if (!($SEEN{$intron}))
{
$SEEN{$intron} = 1;
$COUNT{$intron} = 1;
@introns2 = (@introns2,$intron);
}
else
{
$COUNT{$intron}++;
}
}
@introns2 = sort by_pos @introns2;
$last_intron = $introns2[$#introns2];
@temp = split(/_/,$last_intron);
$last_pos = $temp[0];
# PRINT OUT THE INTRONS AT EACH POSITION IN THE HMM:
for ($i = 1; $i <= $last_pos; $i++)
{
$intron0 = $i."_0";
$intron1 = $i."_1";
$intron2 = $i."_2";
if ($COUNT{$intron0} || $COUNT{$intron1} || $COUNT{$intron2})
{
if ($COUNT{$intron0}) { $count0 = $COUNT{$intron0};} else { $count0 = 0;}
if ($COUNT{$intron1}) { $count1 = $COUNT{$intron1};} else { $count1 = 0;}
if ($COUNT{$intron2}) { $count2 = $COUNT{$intron2};} else { $count2 = 0;}
$frac0 = $count0/$no_genes;
$frac1 = $count1/$no_genes;
$frac2 = $count2/$no_genes;
print OUTPUT "$i $frac0 $frac1 $frac2\n";
}
}
}
else
{
print OUTPUT "no introns\n";
}
}
#------------------------------------------------------------------#
# READ IN THE MAPPING OF THE ALIGNMENT TO THE HMM:
sub read_map
{
my $map = $_[0];
my $line;
my @temp;
my $aln;
my $length;
my $i;
my $char;
my $aln_pos = 0;
my $hmm_pos = 0;
my %HMM_POS = ();
open(MAP,"$map") || die "ERROR: cannot open $map.\n";
while(<MAP>)
{
$line = $_;
chomp $line;
if ($line =~ /#=GC RF/)
{
@temp = split(/\s+/,$line);
$aln = $temp[2];
$length = length($aln);
for ($i = 0; $i < $length; $i++)
{
$char = substr($aln,$i,1);
$aln_pos++;
if ($char eq 'x')
{
$hmm_pos++;
}
# POSITION $aln_pos IN THE ALIGNMENT MAPS TO POSITION $hmm_pos IN THE HMM:
if ($HMM_POS{$aln_pos}) { print STDERR "ERROR: already have hmm_pos for $aln_pos.\n"; exit;}
$HMM_POS{$aln_pos} = $hmm_pos;
}
}
}
close(MAP);
return(\%HMM_POS);
}
#------------------------------------------------------------------#
# GET THE TREEFAM IDs FOR THE GENES:
sub get_treefam_ids
{
my $genes = $_[0];
my %ID = ();
my $i;
my $gene;
my $dbh;
my $table_w;
my $st;
my $sth;
my $rv;
my $id;
my @array;
my $rc;
# FIND THE ID OF EACH GENE IN THE TREEFAM MYSQL DATABASE:
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS:
$table_w = 'genes';
for ($i = 0; $i < @$genes; $i++)
{
$gene = $genes->[$i];
$st = "SELECT ID from $table_w WHERE GID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$id = $array[0]; # eg., ENSMUST00000049178.2
$ID{$gene} = $id;
}
}
if (!($ID{$gene}))
{
$st = "SELECT ID from $table_w WHERE TID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$id = $array[0]; # eg., ENSMUST00000049178.2
$ID{$gene} = $id;
}
}
}
}
$rc = $dbh->disconnect();
$rc = "";
return(\%ID);
}
#------------------------------------------------------------------#
# GET THE POSITIONS OF INTRONS IN EACH OF THE GENES:
sub get_intron_positions
{
my $genes = $_[0];
my $TREEFAM_ID = $_[1];
my $ALN_POS = $_[2];
my $HMM_POS = $_[3];
my $i;
my $gene;
my $dbh;
my $table_w;
my $st;
my $sth;
my $rv;
my $id;
my @array;
my $rc;
my $strand;
my $start;
my $stop;
my $introns;
my $all_introns = "";
# FIND THE POSITION OF EACH GENE IN THE TREEFAM MYSQL DATABASE:
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return;
# SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS:
$table_w = 'map';
for ($i = 0; $i < @$genes; $i++)
{
$gene = $genes->[$i];
if (!($TREEFAM_ID->{$gene})) { print STDERR "ERROR: do not know id for gene $gene.\n"; exit;}
$id = $TREEFAM_ID->{$gene};
$st = "SELECT STRAND,START,STOP from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($id) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$strand = $array[0]; # eg., +
$start = $array[1]; # eg., 144225,144806,145261,145497,
$stop = $array[2]; # eg., 144318,145216,145444,145804,
# FIND THE INTRONS IN THIS GENE:
$introns = &infer_intron_positions($strand,$start,$stop,$gene,$ALN_POS,$HMM_POS);
if ($introns ne "") { $all_introns = $all_introns.",".$introns;}
}
}
}
$rc = $dbh->disconnect();
$rc = "";
return($all_introns);
}
#------------------------------------------------------------------#
# INFER THE INTRON POSITIONS IN A GENE:
sub infer_intron_positions
{
my $strand = $_[0];
my $start = $_[1]; # 144225,144806,145261,145497,
my $stop = $_[2]; # 144318,145216,145444,145804,
my $gene = $_[3];
my $ALN_POS = $_[4];
my $HMM_POS = $_[5];
my @start = split(/\,/,$start);
my @stop = split(/\,/,$stop);
my $i;
my $exon_start;
my $exon_stop;
my $coding_length;
my $hangover;
my $intron_phase;
my $prev_aa_pos = 0;
my @temp;
my $aln_pos;
my $hmm_pos;
my $intron;
my $introns = "";
if ($#start != $#stop) { print STDERR "ERROR: start $#start stop $#stop.\n"; exit;}
if ($VERBOSE == 1) { print "$gene-------------------------------------------------------\n";}
if ($strand eq '+')
{
$coding_length = 0;
for ($i = 0; $i <= $#start; $i++)
{
# FIND THE COORDINATES OF EXON $i:
$exon_start = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB.
$exon_stop = $stop[$i];
# CALCULATE THE CODING LENGTH SO FAR:
$coding_length = $coding_length + ($exon_stop - $exon_start + 1);
# WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID:
$prev_aa_pos = $coding_length/3;
@temp = split(/\./,$prev_aa_pos);
$prev_aa_pos = $temp[0];
# LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS:
$hangover = $coding_length % 3;
if ($hangover == 0) { $intron_phase = 0;}
elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1).
elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2).
if ($i != $#start)
{
# FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT:
if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;}
$aln_pos = $ALN_POS->{$gene."_".$prev_aa_pos};
if ($HMM_POS->{$aln_pos}) # NOTE THAT THERE WILL NOT BE A HMM POS IF THE INTRON IS BEFORE THE HMM STARTS.
{
$hmm_pos = $HMM_POS->{$aln_pos};
$intron = $hmm_pos."_".$intron_phase;
$introns = $introns.",".$intron;
if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos hmm_pos $hmm_pos intron_phase $intron_phase\n"; }
}
}
}
}
elsif ($strand eq '-')
{
$coding_length = 0;
for ($i = $#start; $i >= 0; $i--)
{
# FIND THE COORDINATES OF EXON $i:
$exon_start = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB.
$exon_stop = $stop[$i];
# CALCULATE THE CODING LENGTH SO FAR:
$coding_length = $coding_length + ($exon_stop - $exon_start + 1);
# WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID:
$prev_aa_pos = $coding_length/3;
@temp = split(/\./,$prev_aa_pos);
$prev_aa_pos = $temp[0];
# LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS:
$hangover = $coding_length % 3;
if ($hangover == 0) { $intron_phase = 0;}
elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1).
elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2).
if ($i != 0)
{
# FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT:
if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;}
$aln_pos = $ALN_POS->{$gene."_".$prev_aa_pos};
if ($HMM_POS->{$aln_pos}) # NOTE THAT THERE WILL NOT BE A HMM POS IF THE INTRON IS BEFORE THE HMM STARTS.
{
$hmm_pos = $HMM_POS->{$aln_pos};
$intron = $hmm_pos."_".$intron_phase;
$introns = $introns.",".$intron;
if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos hmm_pos $hmm_pos intron_phase $intron_phase\n";}
}
}
}
}
else
{
print STDERR "ERROR: strand $strand.\n"; exit;
}
if ($introns ne '') { $introns = substr($introns,1,length($introns)-1);}
return($introns);
}
#------------------------------------------------------------------#
# READ IN THE INPUT ALIGNMENT FILE, TO GET THE NAMES OF THE PROTEINS:
sub read_aln
{
my $aln = $_[0];
my $line;
my @temp;
my $gene;
my @genes;
my $length;
my $i;
my $char;
my $prot_pos;
my $aln_pos;
my %ALN_POS = ();
my $no_genes = 0;
open(ALN,"$aln") || die "ERROR: cannot open $aln.\n";
while(<ALN>)
{
$line = $_;
chomp $line;
if (substr($line,0,1) eq '>')
{
@temp = split(/\s+/,$line);
$gene = $temp[0];
$gene = substr($gene,1,length($gene)-1);
@genes = (@genes,$gene);
$no_genes++;
$prot_pos = 0;
$aln_pos = 0;
print "xxx gene $gene\n";
}
else
{
$length = length($line);
for ($i = 0; $i < $length; $i++)
{
$char = substr($line,$i,1);
$aln_pos++;
if ($char ne '-')
{
$prot_pos++;
if ($ALN_POS{$gene."_".$prot_pos}) { print STDERR "ERROR: already have aln pos for $gene $prot_pos.\n"; exit;}
$ALN_POS{$gene."_".$prot_pos} = $aln_pos;
}
print "xxx gene $gene aln_pos $aln_pos prot_pos $prot_pos\n";
}
}
}
close(ALN);
return(\@genes,\%ALN_POS,$no_genes);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment