Last active
August 29, 2015 14:12
-
-
Save rocky/6083dedc752c197875ca to your computer and use it in GitHub Desktop.
VEP speedup testing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/_Inline |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
perl overlap-bench.pl | |
Rate overlap overlap_C | |
overlap 136/s -- -49% | |
overlap_C 267/s 96% -- | |
Rate intron O intron O faster intron O C | |
intron O 27.7/s -- -42% -76% | |
intron O faster 47.5/s 72% -- -58% | |
intron O C 114/s 311% 139% -- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sub overlap { | |
my ($f1_start, $f1_end, $f2_start, $f2_end) = @_; | |
return ( ($f1_end >= $f2_start) and ($f1_start <= $f2_end) ); | |
} | |
sub intron { | |
my ($vf_start, $vf_end, $intron_start, $intron_end, $insertion) = @_; | |
if ( overlap($vf_start, $vf_end, $intron_start-3, $intron_start-1) or | |
overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7) or | |
overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2 ) or | |
overlap($vf_start, $vf_end, $intron_end+1, $intron_end+3 ) or | |
($insertion && ( | |
$vf_start == $intron_start || | |
$vf_end == $intron_end || | |
$vf_start == $intron_start+2 || | |
$vf_end == $intron_end-2 | |
) )) { | |
return 1; | |
}; | |
}; | |
sub intron_faster { | |
my ($vf_start, $vf_end, $intron_start, $intron_end, $insertion) = @_; | |
# Can reduce cost by computing x = vf_end - intron_end and | |
# comparing to x. | |
if ($vf_end < $intron_start-3) { | |
return overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2 ); | |
} | |
# Can reduce cost by computing y = vf_start - intron_end and | |
# comparing to y. | |
if ($vf_start > $intron_end+3) { | |
return overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7); | |
} | |
# assert vf_end >= $intron_start-3 and vf_start >= $intron_end+3 | |
# Can reduce cost by computing z = vf_start - intron_end and | |
# comparing to z. | |
if ($vf_start > $intron_start+7) { | |
return overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2); | |
} | |
# assert vf_end >= $intron_start-3 and vf_start in [$intron_end+3..$intron_start+7] | |
if ($vf_end < $intron_end-7) { | |
return overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7); | |
} | |
return ($vf_start <= $intron_start-1) or | |
($vf_end >= $intron_start+2) or | |
($vf_start <= $intron_end-2) or | |
($vf_end >= $intron_end+1) or | |
($insertion && ( | |
$vf_start == $intron_start || | |
$vf_end == $intron_end || | |
$vf_start == $intron_start+2 || | |
$vf_end == $intron_end-2 | |
); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use Benchmark qw(:all); | |
use Inline C; | |
# my $Iterations = 100_000; | |
my $iterations = 500; | |
# From Bio::EnsEMBL::Variation::Utils::VariationEffect::overlap | |
sub overlap { | |
my ( $f1_start, $f1_end, $f2_start, $f2_end ) = @_; | |
return ( ($f1_end >= $f2_start) and ($f1_start <= $f2_end) ); | |
} | |
# From inside Bio::EnsEMBL::Variation::BaseTranscriptVariation::_intron_effects | |
sub intron_overlap { | |
my ($vf_start, $vf_end, $intron_start, $intron_end, $insertion) = @_; | |
if ( overlap($vf_start, $vf_end, $intron_start-3, $intron_start-1) or | |
overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7) or | |
overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2 ) or | |
overlap($vf_start, $vf_end, $intron_end+1, $intron_end+3 ) or | |
($insertion && ( | |
$vf_start == $intron_start || | |
$vf_end == $intron_end || | |
$vf_start == $intron_start+2 || | |
$vf_end == $intron_end-2 | |
) )) { | |
return 1; | |
}; | |
}; | |
# Speed up intron_overlap. We do this by unrolling the two parts of the interval test | |
# in overlap and use knowlege of parts of the compare in subsequent comparisons. | |
# | |
# Below is a table to show how comparison results are be reused | |
# vs ve is ie ve < is-3 ve < is+2 ve < ie-7 ve < ie+1 vs == is vs == is+2 | |
# <=1 1 (>=4)* >=is true* true false,true if ie>7 true false false | |
## | |
# vs ve is ie vs > is-1 vs > is+7 vs > ie-2 vs > ie+3 vs == is vs == is+2 | |
# (>=5)* >=vs <=1 1 true ? true true* false false | |
# | |
# Legend: vs: $vf_start, ve: $vf_end, is: $intron_start, ie: $intron_end | |
# A true* that data at the right is selected specifically to make this true. | |
sub intron_overlap_faster { | |
my ($vf_start, $vf_end, $intron_start, $intron_end, $insertion) = @_; | |
if ($vf_end < $intron_start-3) { | |
return overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2 ); | |
} | |
if ($vf_start > $intron_end+3) { | |
return overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7); | |
} | |
# assert vf_end >= $intron_start-3 and vf_start >= $intron_end+3 | |
if ($vf_start > $intron_start+7) { | |
return overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2); | |
} | |
# assert vf_end >= $intron_start-3 and vf_start in [$intron_end+3..$intron_start+7] | |
if ($vf_end < $intron_end-7) { | |
return overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7); | |
} | |
return ($vf_start <= $intron_start-1) or | |
($vf_end >= $intron_start+2) or | |
($vf_start <= $intron_end-2) or | |
($vf_end >= $intron_end+1) or | |
($insertion && ( | |
$vf_start == $intron_start || | |
$vf_end == $intron_end || | |
$vf_start == $intron_start+2 || | |
$vf_end == $intron_end-2 | |
)); | |
} | |
foreach (0..10000) { | |
push @a, [int(rand(200)), int(200+rand(1000)), int(rand(1000)), int(1000+rand(1000))]; | |
push @a, [int(rand(1000)), int(1000+rand(1000)), int(rand(200)), int(200+rand(1000))]; | |
push @insert, int(rand(2)); | |
} | |
cmpthese($iterations, { | |
'overlap' => sub { | |
foreach my $tup (@a) { | |
overlap(@{$tup}); | |
}}, | |
'overlap_C' => sub { | |
foreach my $tup (@a) { | |
overlap_C(@{$tup}); | |
}}, | |
}); | |
cmpthese($iterations, { | |
'intron O' => sub { | |
my $i=0; | |
foreach my $tup (@a) { | |
intron_overlap(@{$tup}, $insert[$i++]); | |
}}, | |
'intron O faster' => sub { | |
my $i=0; | |
foreach my $tup (@a) { | |
intron_overlap_faster(@{$tup}, $insert[$i++]); | |
}}, | |
'intron O C' => sub { | |
my $i=0; | |
foreach my $tup (@a) { | |
intron_overlap_C(@{$tup}, $insert[$i++]); | |
}}, | |
}); | |
__END__ | |
__C__ | |
/* From Bio::EnsEMBL::Variation::Utils::VariationEffect::overlap */ | |
int overlap_C(int f1_start, int f1_end, int f2_start, int f2_end) { | |
return ( (f1_end >= f2_start) && (f1_start <= f2_end) ); | |
} | |
inline int overlap_C2(int f1_start, int f1_end, int f2_start, int f2_end) { | |
return ( (f1_end >= f2_start) && (f1_start <= f2_end) ); | |
} | |
/* | |
C replacement for intron_overlap. See comments in that subroutine. | |
It might be possible to eak out a little more performace by | |
saving differences and then comparing against a constant. | |
For example: | |
int ieMvs = intron_end - vf_start; | |
if (ieMvs < 3) ... | |
... | |
if (2 > ieMvs) .. // in overlap | |
We'll leave that out for now. | |
*/ | |
bool intron_overlap_C(int vf_start, int vf_end, int intron_start, int intron_end, | |
bool insertion) { | |
if (vf_end < intron_start-3) { | |
return overlap_C2(vf_start, vf_end, intron_end-7, intron_end-2); | |
} | |
if (vf_start > intron_end+3) { | |
return overlap_C2(vf_start, vf_end, intron_start+2, intron_start+7); | |
} | |
// assert vf_end >= $intron_start-3 and vf_start >= $intron_end+3 | |
// Can reduce cost by computing z = vf_start - intron_end and | |
// comparing to z. | |
if (vf_start > intron_start+7) { | |
return overlap_C2(vf_start, vf_end, intron_end-7, intron_end-2); | |
} | |
// assert vf_end >= $intron_start-3 and vf_start in [$intron_end+3..$intron_start+7] | |
if (vf_end < intron_end-7) { | |
return overlap_C2(vf_start, vf_end, intron_start+2, intron_start+7); | |
} | |
return (vf_start <= intron_start-1) || | |
(vf_end >= intron_start+2) || | |
(vf_start <= intron_end-2) || | |
(vf_end >= intron_end+1) || | |
(insertion && ( | |
vf_start == intron_start || | |
vf_end == intron_end || | |
vf_start == intron_start+2 || | |
vf_end == intron_end-2 | |
)); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment