Created
November 24, 2014 11:42
-
-
Save vi/ed0f1f6bf8b6ed9f5ff1 to your computer and use it in GitHub Desktop.
Generate regular expressions using genetic algorithm.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
use strict; | |
$|=1; | |
if ($#ARGV != 3) { | |
print STDERR "Usage: genetic_regex.pl file_with_passing_lines file_with_failing_lines {file_with_hints|''} {file_with_bans|''} \n"; | |
print STDERR "Created by Vitaly '_Vi' Shukela, in 2014. License is LGPLv2+.\n"; | |
exit 1; | |
} | |
our @pass = (); | |
our @fail = (); | |
our @hints = (); | |
our %bans = (); | |
open F, "<", $ARGV[0]; | |
while(<F>) { chomp; push @pass, $_; } | |
close F; | |
open F, "<", $ARGV[1]; | |
while(<F>) { chomp; push @fail, $_; } | |
close F; | |
if ($ARGV[2]) { | |
open F, "<", $ARGV[2]; | |
while(<F>) { chomp; push @hints, $_; } | |
close F; | |
} | |
if ($ARGV[3]) { | |
open F, "<", $ARGV[3]; | |
while(<F>) { chomp; $bans{$_}=1; } | |
close F; | |
} | |
my $sizefactor = $ENV{'SFACTOR'} || 0.06; | |
my $iterlimit = $ENV{'ITERLIMIT'} || 10000; | |
my $mapmax = $ENV{'MAPMAX'} || 1000000; | |
my $candmax = $ENV{'CANDMAX'} || 100000; | |
my $preservern = $ENV{'PRESERVERN'} || 10000; | |
my $newspecs = $ENV{'NEWSPECS'} || 20000; | |
$SIG{ ALRM } = sub { }; | |
our %marks = (); | |
our %ancestorship = (); | |
our @candidates = (); | |
sub getscore($) { | |
my ($a) = @_; | |
eval { | |
no warnings; | |
local $SIG{ ALRM } = sub { die "TIMEOUT" }; | |
alarm 1; | |
my $l = length $a; | |
my $re = qr/$a/; | |
my $score = 0; | |
foreach my $x (@pass) { | |
if ($x =~ $re) { | |
} else { | |
$score+=1; | |
} | |
} | |
foreach my $x (@fail) { | |
if ($x =~ $re) { | |
$score+=1; | |
} else { | |
} | |
} | |
$score = $score + $sizefactor*(length $a); | |
$score += rand ($sizefactor * 0.5); | |
return $score; | |
} or return 10000; | |
} | |
our @chars = qw@^ . [ | \\ $ ? ) ( } ! { b d w D W s S 1 2 3 4 5 6 7 8 9 * , = < + ~ - ]@; | |
# Initial filling of candidates: all pass values, all fail values, pre-defined chars | |
my @defaultcandidates = (@pass, @fail, @chars, @hints); | |
@candidates = @defaultcandidates; | |
our $record = 999; | |
our $boredom = 0; | |
$SIG{'USR1'} = sub { | |
eval { | |
my $c = $candidates[0]; | |
printf STDOUT "Current best variant l=%d m=%.6g\n", length($c), $marks{$c}; | |
printf STDOUT "%s\n", $c; | |
my %seen = (); | |
while (defined($c) && !$seen{$c}) { | |
$c = $ancestorship{$c}; | |
last unless defined($c); | |
printf STDOUT " %s\n", $c if !defined($seen{$c}); | |
$seen{$c}=1; | |
} | |
} | |
}; | |
our $bancurlead_rq = 0; | |
sub bancurrentleader() { | |
my $b = $candidates[0]; | |
my $bmark = $marks{$b}; | |
printf STDOUT "Banning current leader: l=%d m=%.6g\n", length($b), $bmark; | |
printf STDOUT "%s\n", $b; | |
my $counter = 0; | |
foreach my $i (0..$#candidates) { | |
my $c = $candidates[$i]; | |
my %seen = (); | |
while (defined($c) && !$seen{$c}) { | |
if ($marks{$c} <= $bmark + 2) { | |
$bans{$c} = 1; | |
#printf STDOUT " b: %s\n", $c; | |
++$counter; | |
} | |
$c = $ancestorship{$c}; | |
last unless defined($c); | |
$seen{$c}=1; | |
} | |
} | |
printf STDOUT "Banned %d entries\n", $counter; | |
print "Restarting\n"; | |
%ancestorship = (); | |
@candidates = @defaultcandidates; | |
#$record = 999; | |
%marks = (); | |
$bancurlead_rq = 0; | |
}; | |
$SIG{'USR2'} = sub { $bancurlead_rq = 1; }; | |
foreach my $iter (0..$iterlimit) { | |
# Trim marks if there are too much of them | |
if (scalar keys %marks > $mapmax) { | |
print STDERR " Trimming maps\n"; | |
%marks = (); | |
} | |
# Calculate marks | |
foreach my $c (@candidates) { | |
if (exists($marks{$c})) { | |
# Mark is already calculated | |
} else { | |
my $s = getscore($c); | |
$marks{$c} = $s; | |
if ($s < $record - $sizefactor*0.2) { | |
printf STDOUT "R! %g l=%d %s\n", $s, length($c), $c; | |
$record = $s; | |
} | |
} | |
} | |
if ($bancurlead_rq) { | |
bancurrentleader(); | |
next; | |
} | |
# Sort | |
@candidates = sort { $marks{$a} <=> $marks{$b} } @candidates; | |
#foreach my $c (@candidates) { | |
# print STDOUT "Q $marks{$c} $c\n"; | |
#} | |
#foreach my $i (0..5) { | |
# my $c = $candidates[$i]; | |
# print STDERR "$marks{$c} $c "; | |
#} | |
#print STDERR "\n"; | |
printf STDOUT " %d %d\n", $iter, $#candidates+1; | |
sub uniq { my %seen; grep !$seen{$_}++, @_ } | |
our $prevmetric; | |
# Trim | |
if ($#candidates > $candmax ) { | |
printf STDOUT " ancsize=%d\n", scalar (keys %ancestorship); | |
foreach my $i ($preservern..$#candidates) { | |
my $c = $candidates[$i]; | |
delete $ancestorship{$c}; | |
} | |
splice @candidates, $preservern; | |
my $metric = 0; | |
my $dmetric = ""; | |
foreach my $i (0..1000) { | |
$metric+=$marks{$candidates[$i]}; | |
} | |
if (defined $prevmetric) { | |
my $d = $prevmetric - $metric; | |
$dmetric = "(".$d.")"; | |
if ($d >= 0 && $d<3) { | |
++$boredom; | |
if ($boredom == 3) { | |
print "Boring...\n"; | |
$bancurlead_rq = 1; | |
$boredom = 0; | |
} | |
} else { | |
$boredom = 0; | |
} | |
} | |
$prevmetric = $metric; | |
printf STDOUT " TRIM complete, n=%d, metric=%.7g %s ancsize=%d\n", $#candidates+1, $metric, $dmetric, scalar (keys %ancestorship); | |
} | |
# Add more specimens | |
foreach my $i (0..$newspecs) { | |
my $op = int (rand 22); | |
my $ri = (rand 3)*(rand 1); | |
my $r; | |
if ($ri <= 1) { | |
$r = int ($ri*($#candidates + 1)); | |
} elsif ($ri <= 2) { | |
$r = 0; | |
} else { | |
$r = $#candidates; | |
} | |
my $newc = $candidates[$r]; | |
next if $newc eq ""; | |
if ($op == 0) { | |
# mutate 1 character | |
my $p = int (rand (length $newc)); | |
(substr $newc, $p, 1) = $chars[int(rand($#chars+1))]; | |
} elsif ($op == 1) { | |
# add one character | |
my $p = int (rand ((length $newc) + 1)); | |
(substr $newc, $p, 0) = $chars[int(rand($#chars+1))]; | |
} elsif ($op >=2 && $op <= 11) { | |
my $p = int (rand (length $newc)); | |
my $l = int (rand ((length $newc)-$p-2))+1; | |
my $b = (substr $newc, 0, $p); | |
my $m = (substr $newc, $p, $l); | |
my $a = (substr $newc, $p+$l); | |
if ($op == 2) { | |
# group substring 1 | |
$newc = "$b($m)$a"; | |
} elsif ($op == 3) { | |
# group substring 2 | |
$newc = "$b\[$m\]$a"; | |
} elsif ($op == 4) { | |
# group substring 3 | |
$newc = "$b\{$m\}$a"; | |
} elsif ($op == 5) { | |
# group substring 4 | |
$newc = "$b(?=$m)$a"; | |
} elsif ($op == 6) { | |
# reverse substring | |
$newc = $b.(reverse $m).$a; | |
} elsif ($op >= 7 && $op<=10) { | |
# remove substring | |
$newc = $b.$a; | |
} elsif ($op == 11) { | |
# duplicate substring | |
$newc = "$b$m$m$a"; | |
} | |
} elsif ($op == 12) { | |
# concatenate two candidates | |
my $r2 = int (rand ($#candidates + 1)); | |
$newc = $newc . $candidates[$r2]; | |
} elsif ($op == 13) { | |
# concatenate two candidates with | | |
my $r2 = int (rand ($#candidates + 1)); | |
$newc = $newc . "|" . $candidates[$r2]; | |
} elsif ($op == 14) { | |
# concatenate two candidates with (?=) | |
my $r2 = int (rand ($#candidates + 1)); | |
$newc = "(?= " . $newc . ")" . $candidates[$r2]; | |
} elsif ($op == 15) { | |
# portmaneu | |
my $r2 = int (rand ($#candidates + 1)); | |
my $c2 = $candidates[$r2]; | |
my $p = int (rand ((length $newc)-1)); | |
my $p2 = int (rand ((length $c2)-1)); | |
$newc = (substr $newc,0,$p) . (substr $c2,$p2); | |
} elsif ($op == 16) { | |
# extract substring | |
my $p = int (rand ((length $newc)-1)); | |
my $l = int (rand ((length $newc)-$p-1)); | |
$newc = substr $newc, $p, $l; | |
} elsif ($op >= 17 && $op <= 21) { | |
# transplant substring | |
my $r2 = int (rand ($#candidates + 1)); | |
my $c2 = $candidates[$r2]; | |
my $p2 = int (rand (length $c2)); | |
my $l2 = int (rand ((length $c2)-$p2-2))+1; | |
my $m = (substr $c2, $p2, $l2); | |
my $p = int (rand ((length $newc)-1)); | |
my $l = int (rand ((length $newc)-$p-1)); | |
(substr $newc, $p, $l) = $m; | |
} | |
next if exists $marks{$newc}; | |
if (exists $bans{$newc}) { | |
my $c = $candidates[$r]; | |
$marks{$c}=1000; | |
$candidates[$r]=''; | |
#printf STDERR " Ban %s ``%s``\n", $newc, $c; | |
next; | |
} | |
push @candidates, $newc; | |
$ancestorship{$newc} = $candidates[$r]; | |
} | |
} | |
#foreach my $c (@candidates) { | |
foreach my $i (0..$#candidates) { | |
next unless $i % 1000 == 0 or $i < 10; | |
my $c = $candidates[$i]; | |
next unless exists $marks{$c}; | |
print STDOUT "[$i] $marks{$c} $c\n"; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment