Skip to content

Instantly share code, notes, and snippets.

@rocky
Last active August 29, 2015 14:12
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rocky/6083dedc752c197875ca to your computer and use it in GitHub Desktop.
Save rocky/6083dedc752c197875ca to your computer and use it in GitHub Desktop.
VEP speedup testing
perl overlap-bench.pl
Rate overlap overlap_C
overlap 136/s -- -49%
overlap_C 267/s 96% --
Rate intron O intron O faster intron O C
intron O 27.7/s -- -42% -76%
intron O faster 47.5/s 72% -- -58%
intron O C 114/s 311% 139% --
sub overlap {
my ($f1_start, $f1_end, $f2_start, $f2_end) = @_;
return ( ($f1_end >= $f2_start) and ($f1_start <= $f2_end) );
}
sub intron {
my ($vf_start, $vf_end, $intron_start, $intron_end, $insertion) = @_;
if ( overlap($vf_start, $vf_end, $intron_start-3, $intron_start-1) or
overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7) or
overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2 ) or
overlap($vf_start, $vf_end, $intron_end+1, $intron_end+3 ) or
($insertion && (
$vf_start == $intron_start ||
$vf_end == $intron_end ||
$vf_start == $intron_start+2 ||
$vf_end == $intron_end-2
) )) {
return 1;
};
};
sub intron_faster {
my ($vf_start, $vf_end, $intron_start, $intron_end, $insertion) = @_;
# Can reduce cost by computing x = vf_end - intron_end and
# comparing to x.
if ($vf_end < $intron_start-3) {
return overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2 );
}
# Can reduce cost by computing y = vf_start - intron_end and
# comparing to y.
if ($vf_start > $intron_end+3) {
return overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7);
}
# assert vf_end >= $intron_start-3 and vf_start >= $intron_end+3
# Can reduce cost by computing z = vf_start - intron_end and
# comparing to z.
if ($vf_start > $intron_start+7) {
return overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2);
}
# assert vf_end >= $intron_start-3 and vf_start in [$intron_end+3..$intron_start+7]
if ($vf_end < $intron_end-7) {
return overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7);
}
return ($vf_start <= $intron_start-1) or
($vf_end >= $intron_start+2) or
($vf_start <= $intron_end-2) or
($vf_end >= $intron_end+1) or
($insertion && (
$vf_start == $intron_start ||
$vf_end == $intron_end ||
$vf_start == $intron_start+2 ||
$vf_end == $intron_end-2
);
}
#!/usr/bin/perl
use Benchmark qw(:all);
use Inline C;
# my $Iterations = 100_000;
my $iterations = 500;
# From Bio::EnsEMBL::Variation::Utils::VariationEffect::overlap
sub overlap {
my ( $f1_start, $f1_end, $f2_start, $f2_end ) = @_;
return ( ($f1_end >= $f2_start) and ($f1_start <= $f2_end) );
}
# From inside Bio::EnsEMBL::Variation::BaseTranscriptVariation::_intron_effects
sub intron_overlap {
my ($vf_start, $vf_end, $intron_start, $intron_end, $insertion) = @_;
if ( overlap($vf_start, $vf_end, $intron_start-3, $intron_start-1) or
overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7) or
overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2 ) or
overlap($vf_start, $vf_end, $intron_end+1, $intron_end+3 ) or
($insertion && (
$vf_start == $intron_start ||
$vf_end == $intron_end ||
$vf_start == $intron_start+2 ||
$vf_end == $intron_end-2
) )) {
return 1;
};
};
# Speed up intron_overlap. We do this by unrolling the two parts of the interval test
# in overlap and use knowlege of parts of the compare in subsequent comparisons.
#
# Below is a table to show how comparison results are be reused
# vs ve is ie ve < is-3 ve < is+2 ve < ie-7 ve < ie+1 vs == is vs == is+2
# <=1 1 (>=4)* >=is true* true false,true if ie>7 true false false
##
# vs ve is ie vs > is-1 vs > is+7 vs > ie-2 vs > ie+3 vs == is vs == is+2
# (>=5)* >=vs <=1 1 true ? true true* false false
#
# Legend: vs: $vf_start, ve: $vf_end, is: $intron_start, ie: $intron_end
# A true* that data at the right is selected specifically to make this true.
sub intron_overlap_faster {
my ($vf_start, $vf_end, $intron_start, $intron_end, $insertion) = @_;
if ($vf_end < $intron_start-3) {
return overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2 );
}
if ($vf_start > $intron_end+3) {
return overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7);
}
# assert vf_end >= $intron_start-3 and vf_start >= $intron_end+3
if ($vf_start > $intron_start+7) {
return overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2);
}
# assert vf_end >= $intron_start-3 and vf_start in [$intron_end+3..$intron_start+7]
if ($vf_end < $intron_end-7) {
return overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7);
}
return ($vf_start <= $intron_start-1) or
($vf_end >= $intron_start+2) or
($vf_start <= $intron_end-2) or
($vf_end >= $intron_end+1) or
($insertion && (
$vf_start == $intron_start ||
$vf_end == $intron_end ||
$vf_start == $intron_start+2 ||
$vf_end == $intron_end-2
));
}
foreach (0..10000) {
push @a, [int(rand(200)), int(200+rand(1000)), int(rand(1000)), int(1000+rand(1000))];
push @a, [int(rand(1000)), int(1000+rand(1000)), int(rand(200)), int(200+rand(1000))];
push @insert, int(rand(2));
}
cmpthese($iterations, {
'overlap' => sub {
foreach my $tup (@a) {
overlap(@{$tup});
}},
'overlap_C' => sub {
foreach my $tup (@a) {
overlap_C(@{$tup});
}},
});
cmpthese($iterations, {
'intron O' => sub {
my $i=0;
foreach my $tup (@a) {
intron_overlap(@{$tup}, $insert[$i++]);
}},
'intron O faster' => sub {
my $i=0;
foreach my $tup (@a) {
intron_overlap_faster(@{$tup}, $insert[$i++]);
}},
'intron O C' => sub {
my $i=0;
foreach my $tup (@a) {
intron_overlap_C(@{$tup}, $insert[$i++]);
}},
});
__END__
__C__
/* From Bio::EnsEMBL::Variation::Utils::VariationEffect::overlap */
int overlap_C(int f1_start, int f1_end, int f2_start, int f2_end) {
return ( (f1_end >= f2_start) && (f1_start <= f2_end) );
}
inline int overlap_C2(int f1_start, int f1_end, int f2_start, int f2_end) {
return ( (f1_end >= f2_start) && (f1_start <= f2_end) );
}
/*
C replacement for intron_overlap. See comments in that subroutine.
It might be possible to eak out a little more performace by
saving differences and then comparing against a constant.
For example:
int ieMvs = intron_end - vf_start;
if (ieMvs < 3) ...
...
if (2 > ieMvs) .. // in overlap
We'll leave that out for now.
*/
bool intron_overlap_C(int vf_start, int vf_end, int intron_start, int intron_end,
bool insertion) {
if (vf_end < intron_start-3) {
return overlap_C2(vf_start, vf_end, intron_end-7, intron_end-2);
}
if (vf_start > intron_end+3) {
return overlap_C2(vf_start, vf_end, intron_start+2, intron_start+7);
}
// assert vf_end >= $intron_start-3 and vf_start >= $intron_end+3
// Can reduce cost by computing z = vf_start - intron_end and
// comparing to z.
if (vf_start > intron_start+7) {
return overlap_C2(vf_start, vf_end, intron_end-7, intron_end-2);
}
// assert vf_end >= $intron_start-3 and vf_start in [$intron_end+3..$intron_start+7]
if (vf_end < intron_end-7) {
return overlap_C2(vf_start, vf_end, intron_start+2, intron_start+7);
}
return (vf_start <= intron_start-1) ||
(vf_end >= intron_start+2) ||
(vf_start <= intron_end-2) ||
(vf_end >= intron_end+1) ||
(insertion && (
vf_start == intron_start ||
vf_end == intron_end ||
vf_start == intron_start+2 ||
vf_end == intron_end-2
));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment