Skip to content

Instantly share code, notes, and snippets.

@epaule
Created April 19, 2011 14:09
Show Gist options
  • Save epaule/927952 to your computer and use it in GitHub Desktop.
Save epaule/927952 to your computer and use it in GitHub Desktop.
Ensembl module for Bio::EnsEMBL::Analysis::Runnable::RepeatMasker
# Ensembl module for Bio::EnsEMBL::Analysis::Runnable::RepeatMasker
#
# Copyright (c) 2004 Ensembl
#
=head1 NAME
Bio::EnsEMBL::Analysis::Runnable::RepeatMasker
=head1 SYNOPSIS
my $repeat_masker = Bio::EnsEMBL::Analysis::Runnable::RepeatMasker->
new(
-query => $slice,
-program => 'repeatmasker',
-options => '-low'
);
$repeat_masker->run;
my @repeats = @{$repeat_masker->output};
=head1 DESCRIPTION
RepeatMasker expects to run the program RepeatMasker
and produce RepeatFeatures which can be stored in the repeat_feature
and repeat_consensus tables in the core database
=head1 CONTACT
Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk
=cut
package Bio::EnsEMBL::Analysis::Runnable::RepeatMasker;
use strict;
use warnings;
use Bio::EnsEMBL::Analysis::Runnable;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Analysis::Tools::Utilities qw(write_seqfile);
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Analysis::Runnable);
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
######################
#SETTING THE DEFAULTS#
######################
$self->program('RepeatMasker') if(!$self->program);
######################
return $self;
}
=head2 run_analysis
Arg [1] : Bio::EnsEMBL::Analysis::Runnable::RepeatMasker
Arg [2] : string, program name
Function : constructs a commandline and runs the program passed
in, the generic method in Runnable isnt used as RepeatMasker doesnt
fit this module
Returntype: none
Exceptions: throws if run failed because sysetm doesnt
return 0 or the output file doesnt exist
Example :
=cut
ub run_analysis{
my ($self, $program) = @_;
if(!$program){
$program = $self->program;
}
throw($program." is not executable RepeatMasker::run_analysis ")
unless($program && -x $program);
my $cmd = $self->program." ";
$cmd .= $self->options." " if($self->options);
$cmd .= $self->queryfile;
print "Running analysis ".$cmd."\n";
system($cmd) == 0 or throw("FAILED to run ".$cmd.
" RepeatMasker:run_analysis");
foreach my $file(glob $self->queryfile."*"){
$self->files_to_delete($file);
}
if(! -e $self->resultsfile){
throw("FAILED to run repeatmasker on ".$self->queryfile." ".
$self->resultsfile." has not been produced ".
"RepeatMasker:run_analysis");
}
}
=head2 write_seq_file
Arg [1] : Bio::EnsEMBL::Analysis::Runnable
Arg [2] : Bio::Seq
Arg [3] : filename
Function : This uses Bio::SeqIO to dump a sequence to a fasta file
Returntype: string, filename
Exceptions: throw if failed to write sequence
Example :
=cut
sub write_seq_file{
my ($self, $seq, $filename) = @_;
if(!$seq){
$seq = $self->query;
}
if(!$filename){
$filename = $self->queryfile;
}
$seq->display_id('1'); # as the id is never parsed out anyway
$filename = write_seqfile($seq, $filename);
return $filename;
}
=head2 parse_results
Arg [1] : Bio::EnsEMBL::Analysis::Runnable::RepeatMasker
Arg [2] : string, filename
Function : open and parse the results file into repeat
features
Returntype: none
Exceptions: throws on failure to open or close output file
Example :
=cut
sub parse_results{
my ($self, $results) = @_;
if(!$results){
$results = $self->resultsfile;
}
my $feature_factory = $self->feature_factory;
open(OUT, "<".$results) or throw("FAILED to open ".$results.
"RepeatMasker:parse_results");
REPEAT:while(<OUT>){
if (/no repetitive sequences detected/) {
print "RepeatMasker didn't find any repetitive sequences";
last REPEAT; #no repeats found no point carrying on
}
if(/only contains ambiguous bases/){
print "Sequence contains too many N's \n";
last REPEAT;
}
my @columns;
if(/\d+/){ #ignoring introductory lines
chomp;
@columns = split;
pop @columns if $columns[-1] eq '*';
#if (@columns != 15 || @columns != 14) {
# throw("Can't parse repeatmasker output unexpected number ".
# "of columns in the output ".@columns." in ".$_." ".
# "RepeatMasker:parse_results");
#}
my ($score, $name, $start, $end, $strand,
$repeat_name, $repeat_class, $repeatmunge, $repeatmunge2);
if(@columns == 15){
($score, $name, $start, $end, $strand,
$repeat_name, $repeat_class) = @columns[0, 4, 5, 6, 8, 9, 10];
}elsif(@columns == 14){
($score, $name, $start, $end, $strand,
$repeatmunge,$repeatmunge2) = @columns[0, 4, 5, 6, 8, 9, 10];
if ($repeatmunge =~ /(\S+)(LINE\S+)/) {
$repeatmunge =~ /(\S+)(LINE\S+)/;
$repeat_name = $1;
$repeat_class = $2;
} elsif ($repeatmunge2 eq 'Unknown') {
print "Unknown repeat name\n";
$repeat_name = 'Unknown';
} else {
throw("Can't parse repeatmasker output for line = $_\n");
}
if(!$repeat_class){
$repeat_class = 'UNK';
}
}else{
throw("Can't parse repeatmasker output unexpected number ".
"of columns in the output ".@columns." in ".$_." ".
"RepeatMasker:parse_results");
}
my $start_column;
if($strand eq '+'){
$start_column = 11;
$strand = 1;
}
if($strand eq 'C'){
$start_column = 13;
$strand = -1;
}#the location of the start and end inside the repeat
#is different depending on the strand
my $repeat_start = $columns[$start_column];
my $repeat_end = $columns[12];
if($repeat_end < $repeat_start){
my $temp_end = $repeat_start;
$repeat_start = $repeat_end;
$repeat_end = $temp_end;
}
my $rc = $feature_factory->create_repeat_consensus($repeat_name,
$repeat_class);
my $rf = $feature_factory->create_repeat_feature
($start, $end, $strand, $score,
$repeat_start, $repeat_end,
$rc, $self->query);
$self->output([$rf]);
}
}
close(OUT) or throw("FAILED to close ".$results.
"RepeatMasker:parse_results");
}
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment