Skip to content

Instantly share code, notes, and snippets.

@vi vi/genetic_regex.pl
Created Nov 24, 2014

Embed
What would you like to do?
Generate regular expressions using genetic algorithm.
#!/usr/bin/perl -w
use strict;
$|=1;
if ($#ARGV != 3) {
print STDERR "Usage: genetic_regex.pl file_with_passing_lines file_with_failing_lines {file_with_hints|''} {file_with_bans|''} \n";
print STDERR "Created by Vitaly '_Vi' Shukela, in 2014. License is LGPLv2+.\n";
exit 1;
}
our @pass = ();
our @fail = ();
our @hints = ();
our %bans = ();
open F, "<", $ARGV[0];
while(<F>) { chomp; push @pass, $_; }
close F;
open F, "<", $ARGV[1];
while(<F>) { chomp; push @fail, $_; }
close F;
if ($ARGV[2]) {
open F, "<", $ARGV[2];
while(<F>) { chomp; push @hints, $_; }
close F;
}
if ($ARGV[3]) {
open F, "<", $ARGV[3];
while(<F>) { chomp; $bans{$_}=1; }
close F;
}
my $sizefactor = $ENV{'SFACTOR'} || 0.06;
my $iterlimit = $ENV{'ITERLIMIT'} || 10000;
my $mapmax = $ENV{'MAPMAX'} || 1000000;
my $candmax = $ENV{'CANDMAX'} || 100000;
my $preservern = $ENV{'PRESERVERN'} || 10000;
my $newspecs = $ENV{'NEWSPECS'} || 20000;
$SIG{ ALRM } = sub { };
our %marks = ();
our %ancestorship = ();
our @candidates = ();
sub getscore($) {
my ($a) = @_;
eval {
no warnings;
local $SIG{ ALRM } = sub { die "TIMEOUT" };
alarm 1;
my $l = length $a;
my $re = qr/$a/;
my $score = 0;
foreach my $x (@pass) {
if ($x =~ $re) {
} else {
$score+=1;
}
}
foreach my $x (@fail) {
if ($x =~ $re) {
$score+=1;
} else {
}
}
$score = $score + $sizefactor*(length $a);
$score += rand ($sizefactor * 0.5);
return $score;
} or return 10000;
}
our @chars = qw@^ . [ | \\ $ ? ) ( } ! { b d w D W s S 1 2 3 4 5 6 7 8 9 * , = < + ~ - ]@;
# Initial filling of candidates: all pass values, all fail values, pre-defined chars
my @defaultcandidates = (@pass, @fail, @chars, @hints);
@candidates = @defaultcandidates;
our $record = 999;
our $boredom = 0;
$SIG{'USR1'} = sub {
eval {
my $c = $candidates[0];
printf STDOUT "Current best variant l=%d m=%.6g\n", length($c), $marks{$c};
printf STDOUT "%s\n", $c;
my %seen = ();
while (defined($c) && !$seen{$c}) {
$c = $ancestorship{$c};
last unless defined($c);
printf STDOUT " %s\n", $c if !defined($seen{$c});
$seen{$c}=1;
}
}
};
our $bancurlead_rq = 0;
sub bancurrentleader() {
my $b = $candidates[0];
my $bmark = $marks{$b};
printf STDOUT "Banning current leader: l=%d m=%.6g\n", length($b), $bmark;
printf STDOUT "%s\n", $b;
my $counter = 0;
foreach my $i (0..$#candidates) {
my $c = $candidates[$i];
my %seen = ();
while (defined($c) && !$seen{$c}) {
if ($marks{$c} <= $bmark + 2) {
$bans{$c} = 1;
#printf STDOUT " b: %s\n", $c;
++$counter;
}
$c = $ancestorship{$c};
last unless defined($c);
$seen{$c}=1;
}
}
printf STDOUT "Banned %d entries\n", $counter;
print "Restarting\n";
%ancestorship = ();
@candidates = @defaultcandidates;
#$record = 999;
%marks = ();
$bancurlead_rq = 0;
};
$SIG{'USR2'} = sub { $bancurlead_rq = 1; };
foreach my $iter (0..$iterlimit) {
# Trim marks if there are too much of them
if (scalar keys %marks > $mapmax) {
print STDERR " Trimming maps\n";
%marks = ();
}
# Calculate marks
foreach my $c (@candidates) {
if (exists($marks{$c})) {
# Mark is already calculated
} else {
my $s = getscore($c);
$marks{$c} = $s;
if ($s < $record - $sizefactor*0.2) {
printf STDOUT "R! %g l=%d %s\n", $s, length($c), $c;
$record = $s;
}
}
}
if ($bancurlead_rq) {
bancurrentleader();
next;
}
# Sort
@candidates = sort { $marks{$a} <=> $marks{$b} } @candidates;
#foreach my $c (@candidates) {
# print STDOUT "Q $marks{$c} $c\n";
#}
#foreach my $i (0..5) {
# my $c = $candidates[$i];
# print STDERR "$marks{$c} $c ";
#}
#print STDERR "\n";
printf STDOUT " %d %d\n", $iter, $#candidates+1;
sub uniq { my %seen; grep !$seen{$_}++, @_ }
our $prevmetric;
# Trim
if ($#candidates > $candmax ) {
printf STDOUT " ancsize=%d\n", scalar (keys %ancestorship);
foreach my $i ($preservern..$#candidates) {
my $c = $candidates[$i];
delete $ancestorship{$c};
}
splice @candidates, $preservern;
my $metric = 0;
my $dmetric = "";
foreach my $i (0..1000) {
$metric+=$marks{$candidates[$i]};
}
if (defined $prevmetric) {
my $d = $prevmetric - $metric;
$dmetric = "(".$d.")";
if ($d >= 0 && $d<3) {
++$boredom;
if ($boredom == 3) {
print "Boring...\n";
$bancurlead_rq = 1;
$boredom = 0;
}
} else {
$boredom = 0;
}
}
$prevmetric = $metric;
printf STDOUT " TRIM complete, n=%d, metric=%.7g %s ancsize=%d\n", $#candidates+1, $metric, $dmetric, scalar (keys %ancestorship);
}
# Add more specimens
foreach my $i (0..$newspecs) {
my $op = int (rand 22);
my $ri = (rand 3)*(rand 1);
my $r;
if ($ri <= 1) {
$r = int ($ri*($#candidates + 1));
} elsif ($ri <= 2) {
$r = 0;
} else {
$r = $#candidates;
}
my $newc = $candidates[$r];
next if $newc eq "";
if ($op == 0) {
# mutate 1 character
my $p = int (rand (length $newc));
(substr $newc, $p, 1) = $chars[int(rand($#chars+1))];
} elsif ($op == 1) {
# add one character
my $p = int (rand ((length $newc) + 1));
(substr $newc, $p, 0) = $chars[int(rand($#chars+1))];
} elsif ($op >=2 && $op <= 11) {
my $p = int (rand (length $newc));
my $l = int (rand ((length $newc)-$p-2))+1;
my $b = (substr $newc, 0, $p);
my $m = (substr $newc, $p, $l);
my $a = (substr $newc, $p+$l);
if ($op == 2) {
# group substring 1
$newc = "$b($m)$a";
} elsif ($op == 3) {
# group substring 2
$newc = "$b\[$m\]$a";
} elsif ($op == 4) {
# group substring 3
$newc = "$b\{$m\}$a";
} elsif ($op == 5) {
# group substring 4
$newc = "$b(?=$m)$a";
} elsif ($op == 6) {
# reverse substring
$newc = $b.(reverse $m).$a;
} elsif ($op >= 7 && $op<=10) {
# remove substring
$newc = $b.$a;
} elsif ($op == 11) {
# duplicate substring
$newc = "$b$m$m$a";
}
} elsif ($op == 12) {
# concatenate two candidates
my $r2 = int (rand ($#candidates + 1));
$newc = $newc . $candidates[$r2];
} elsif ($op == 13) {
# concatenate two candidates with |
my $r2 = int (rand ($#candidates + 1));
$newc = $newc . "|" . $candidates[$r2];
} elsif ($op == 14) {
# concatenate two candidates with (?=)
my $r2 = int (rand ($#candidates + 1));
$newc = "(?= " . $newc . ")" . $candidates[$r2];
} elsif ($op == 15) {
# portmaneu
my $r2 = int (rand ($#candidates + 1));
my $c2 = $candidates[$r2];
my $p = int (rand ((length $newc)-1));
my $p2 = int (rand ((length $c2)-1));
$newc = (substr $newc,0,$p) . (substr $c2,$p2);
} elsif ($op == 16) {
# extract substring
my $p = int (rand ((length $newc)-1));
my $l = int (rand ((length $newc)-$p-1));
$newc = substr $newc, $p, $l;
} elsif ($op >= 17 && $op <= 21) {
# transplant substring
my $r2 = int (rand ($#candidates + 1));
my $c2 = $candidates[$r2];
my $p2 = int (rand (length $c2));
my $l2 = int (rand ((length $c2)-$p2-2))+1;
my $m = (substr $c2, $p2, $l2);
my $p = int (rand ((length $newc)-1));
my $l = int (rand ((length $newc)-$p-1));
(substr $newc, $p, $l) = $m;
}
next if exists $marks{$newc};
if (exists $bans{$newc}) {
my $c = $candidates[$r];
$marks{$c}=1000;
$candidates[$r]='';
#printf STDERR " Ban %s ``%s``\n", $newc, $c;
next;
}
push @candidates, $newc;
$ancestorship{$newc} = $candidates[$r];
}
}
#foreach my $c (@candidates) {
foreach my $i (0..$#candidates) {
next unless $i % 1000 == 0 or $i < 10;
my $c = $candidates[$i];
next unless exists $marks{$c};
print STDOUT "[$i] $marks{$c} $c\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.