Skip to content

Instantly share code, notes, and snippets.

@kimmel
Created November 18, 2012 08:27
Embed
What would you like to do?
A benchmark of regex methods
#!/usr/bin/perl
use v5.16;
use warnings;
use autodie qw( :all );
use utf8::all;
use File::Slurp qw( read_file );
use List::MoreUtils qw( uniq );
use Regexp::Assemble;
use Benchmark qw( cmpthese :hireswallclock );
# Words found per input file; written by the 'method1' benchmark sub.
my %file_names = ();

# Load the text to search.  tr/// replicates the final replacement character,
# so every ASCII punctuation character listed below becomes a single space and
# a plain whitespace split then yields bare words.
my $fname   = 'dracula.txt';
my $content = read_file($fname);
$content =~ tr/!"#$%&'()*+,\-.\/:;<=>?@\[\\]^_`{|}~/ /;

# Dictionary consulted by 'method1'; it is indexed as
# $pattern_list->{first_letter}{word}.
# The explicit './' is required because "." is no longer in @INC on
# Perl 5.26+, where a bare relative path makes `do` return undef.  Failing
# loudly here avoids benchmarking against a silently empty dictionary.
my $dict_file    = './from_wp.pl.dict';
my $pattern_list = do $dict_file;
if ( !defined $pattern_list ) {
    die "Cannot load $dict_file: " . ( $@ || $! ) . "\n";
}

# Unique lower-cased words of two or more characters.  The length test also
# discards the empty leading field split produces when the text starts with
# whitespace.
my @parts = grep { length($_) > 1 } split /\s+/, lc $content;
my @words = uniq @parts;

# Match counters, one per regex-based benchmark sub.
my %seen;
my %seen2;
my %seen3;

# One literal pattern per line.  Blank lines are dropped: an empty alternative
# would match at every word boundary and distort the timings.
my @patterns = read_file('sample_patterns');
chomp @patterns;
@patterns = grep { length } @patterns;
die "No patterns found in sample_patterns\n" if !@patterns;

# Precompiled alternation of all patterns, each taken literally (quotemeta).
my $alternation = join '|', map {quotemeta} @patterns;
my $regex       = qr/\b($alternation)\b/ixms;

# The same pattern list merged by Regexp::Assemble.
# NOTE(review): the patterns are added unescaped here, unlike the quotemeta'd
# alternation above, so entries containing regex metacharacters or spaces
# (with the x flag) may match differently -- confirm this is intended.
my $regex3 = Regexp::Assemble->new->add(@patterns);
$regex3->anchor_word(1);
$regex3->flags('ixms');
# Compare the lookup strategies.  A negative count asks Benchmark to run each
# sub for at least that many CPU seconds (7 here) rather than a fixed number
# of iterations.
cmpthese(
    -7,
    {
        # Per-word lookup: scan the dictionary bucket for the word's first
        # letter and record the word when it is present.
        # NOTE(review): the grep over keys is a linear scan where an `exists`
        # test would do; it is kept unchanged because this sub is the subject
        # being measured.
        'method1' => sub {
            foreach my $word (@words) {
                my $key = lc substr $word, 0, 1;
                $file_names{$fname}->{$word} = 1
                    if (
                    grep { $word eq $_ }
                    keys %{ $pattern_list->{$key} }
                    );
            }
        },

        # Single precompiled alternation applied to the whole text.
        'one_regex' => sub {
            $seen{$1}++ while $content =~ /$regex/g;
        },

        # Same search as 'one_regex', but the alternation is rebuilt and
        # recompiled on every call so the cost of constructing the regex is
        # included.  The freshly joined string ($regex2) must be the one
        # interpolated into qr//; using the precompiled $regex here would
        # throw the join away and measure the wrong thing.
        'regex_overhead' => sub {
            my $regex2 = join '|', map {quotemeta} @patterns;
            $regex2 = qr/\b($regex2)\b/ixms;
            $seen2{$1}++ while $content =~ /$regex2/g;
        },

        # Search using the Regexp::Assemble object interpolated into the match.
        'regex_assem' => sub {
            $seen3{$1}++ while $content =~ m/($regex3)/g;
        },
    }
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment