Skip to content

Instantly share code, notes, and snippets.

@Metaxal
Last active August 29, 2015 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Metaxal/777be153b50a35a2618c to your computer and use it in GitHub Desktop.
Save Metaxal/777be153b50a35a2618c to your computer and use it in GitHub Desktop.
regex-golf slowdown?
#!/usr/bin/python
# From: http://nbviewer.ipython.org/url/norvig.com/ipython/xkcd1313.ipynb
import re
def verify(regex, winners, losers):
"Return true iff the regex matches all winners but no losers."
missed_winners = {W for W in winners if not re.search(regex, W)}
matched_losers = {L for L in losers if re.search(regex, L)}
if missed_winners:
print "Error: should match but did not:", ', '.join(missed_winners)
if matched_losers:
print "Error: should not match but did:", ', '.join(matched_losers)
return not (missed_winners or matched_losers)
def findregex(winners, losers):
"Find a regex that matches all winners but no losers (sets of strings)."
# Make a pool of regex components, then pick from them to cover winners.
# On each iteration, add the 'best' component to 'solution',
# remove winners covered by best, and keep in 'pool' only components
# that still match some winner.
pool = regex_components(winners, losers)
solution = []
def score(r): return 4 * len(matches(r, winners)) - len(r)
while winners:
best = max(pool, key=score)
solution.append(best)
winners = winners - matches(best, winners)
pool = {r for r in pool if matches(r, winners)}
return OR(solution)
def matches(regex, strings):
"Return a set of all the strings that are matched by regex."
return {s for s in strings if re.search(regex, s)}
OR = '|'.join # Join a sequence of strings with '|' between them
def regex_components(winners, losers):
"Return components that match at least one winner, but no loser."
wholes = {'^'+winner+'$' for winner in winners}
parts = {d for w in wholes for s in subparts(w) for d in dotify(s)}
return wholes | {p for p in parts if not matches(p, losers)}
def subparts(word):
"Return a set of subparts of word, consecutive characters up to length 4."
return set(word[i:i+n] for i in range(len(word)) for n in (1, 2, 3, 4))
def dotify(part):
"Return all ways to replace a subset of chars in part with '.'."
if part == '':
return {''}
else:
return {c+rest for rest in dotify(part[1:])
for c in replacements(part[0])}
def replacements(char):
"Return replacement characters for char (char + '.' unless char is special)."
if (char == '^' or char == '$'):
return char
else:
return char + '.'
def words(text): return set(text.lower().split())
def lines(text): return {line.upper().strip() for line in text.split('/')}
################ Data
winners = words('''washington adams jefferson jefferson madison madison monroe
monroe adams jackson jackson van-buren harrison polk taylor pierce buchanan
lincoln lincoln grant grant hayes garfield cleveland harrison cleveland mckinley
mckinley roosevelt taft wilson wilson harding coolidge hoover roosevelt
roosevelt roosevelt roosevelt truman eisenhower eisenhower kennedy johnson nixon
nixon carter reagan reagan bush clinton clinton bush bush obama obama''')
losers = words('''clinton jefferson adams pinckney pinckney clinton king adams
jackson adams clay van-buren van-buren clay cass scott fremont breckinridge
mcclellan seymour greeley tilden hancock blaine cleveland harrison bryan bryan
parker bryan roosevelt hughes cox davis smith hoover landon wilkie dewey dewey
stevenson stevenson nixon goldwater humphrey mcgovern ford carter mondale
dukakis bush dole gore kerry mccain romney''')
losers = losers - winners
starwars = lines('''The Phantom Menace / Attack of the Clones / Revenge of the Sith /
A New Hope / The Empire Strikes Back / Return of the Jedi''')
startrek = lines('''The Wrath of Khan / The Search for Spock / The Voyage Home /
The Final Frontier / The Undiscovered Country / Generations / First Contact /
Insurrection / Nemesis''')
def findboth(A, B):
"Find a regex to match A but not B, and vice-versa. Print summary."
for (W, L, legend) in [(A, B, 'A-B'), (B, A, 'B-A')]:
solution = findregex(W, L)
assert verify(solution, W, L)
ratio = len('^(' + OR(W) + ')$') / float(len(solution))
print '%3d chars, %4.1f ratio, %2d winners %s: %r' % (
len(solution), ratio , len(W), legend, solution)
################ Tests
def test1():
assert subparts('^it$') == {'^', 'i', 't', '$', '^i', 'it', 't$', '^it', 'it$', '^it$'}
assert subparts('this') == {'t', 'h', 'i', 's', 'th', 'hi', 'is', 'thi', 'his', 'this'}
assert dotify('it') == {'it', 'i.', '.t', '..'}
assert dotify('^it$') == {'^it$', '^i.$', '^.t$', '^..$'}
assert dotify('this') == {'this', 'thi.', 'th.s', 'th..', 't.is', 't.i.', 't..s', 't...',
'.his', '.hi.', '.h.s', '.h..', '..is', '..i.', '...s', '....'}
assert replacements('x') == 'x.'
assert replacements('^') == '^'
assert replacements('$') == '$'
assert regex_components({'win'}, {'losers', 'bin', 'won'}) == {
'^win$', '^win', '^wi.', 'wi.', 'wi', '^wi', 'win$', 'win', 'wi.$'}
assert regex_components({'win'}, {'losers', 'bin', 'won', 'wine'}) == {
'^win$', 'win$', 'wi.$'}
assert matches('a|b|c', {'a', 'b', 'c', 'd', 'e'}) == {'a', 'b', 'c'}
assert matches('a|b|c', {'any', 'bee', 'succeed', 'dee', 'eee!'}) == {
'any', 'bee', 'succeed'}
assert verify('a+b+', {'ab', 'aaabb'}, {'a', 'bee', 'a b'})
assert findregex({"ahahah", "ciao"}, {"ahaha", "bye"}) == 'a.$'
assert OR(['a', 'b', 'c']) == 'a|b|c'
assert OR(['a']) == 'a'
assert words('This is a TEST this is') == {'this', 'is', 'a', 'test'}
assert lines('Testing / 1 2 3 / Testing over') == {'TESTING', '1 2 3', 'TESTING OVER'}
return 'test1 passes'
if __name__ == '__main__':
print test1()
findboth(winners, losers)
# python regex-golf.py 1,95s user 0,01s system 99% cpu 1,962 total
#lang racket
;;;; Racket translation of: http://nbviewer.ipython.org/url/norvig.com/ipython/xkcd1313.ipynb
(module+ test
(require rackunit))
;; Faster variant of filter that does not keep the initial element order
(define (filter f list)
(let loop ([l list] [result null])
(if (null? l)
result #;(reverse result) ; do not reverse!
(loop (cdr l)
(if (f (car l))
(cons (car l) result)
result)))))
(define winners-syms
(remove-duplicates
'(washington adams jefferson jefferson madison madison monroe
monroe adams jackson jackson van-buren harrison polk taylor pierce buchanan
lincoln lincoln grant grant hayes garfield cleveland harrison cleveland mckinley
mckinley roosevelt taft wilson wilson harding coolidge hoover roosevelt
roosevelt roosevelt roosevelt truman eisenhower eisenhower kennedy johnson nixon
nixon carter reagan reagan bush clinton clinton bush bush obama obama)))
(define losers-syms
(remove-duplicates
'(jefferson adams pinckney pinckney clinton king adams
jackson adams clay van-buren van-buren clay cass scott fremont breckinridge
mcclellan seymour greeley tilden hancock blaine cleveland harrison bryan bryan
parker bryan roosevelt hughes cox davis smith hoover landon wilkie dewey dewey
stevenson stevenson nixon goldwater humphrey mcgovern ford carter mondale
dukakis bush dole gore kerry mccain romney)))
(define winners (map symbol->string winners-syms))
;; Some winners are also losers (sometimes with different first names), so we remove them
(define losers
(map symbol->string (remove* winners-syms losers-syms)))
;; Return true iff the regex matches all winners but no losers.
(define (verify regex winners losers)
(define missed-winners (filter (λ(w)(not (regexp-match regex w)))
winners))
(define matched-losers (filter (λ(l)(regexp-match regex l))
losers))
(unless (empty? missed-winners)
(printf "Error: should match but did not: ~a\n" missed-winners))
(unless (empty? matched-losers)
(printf "Error: should not match but did: ~a\n" matched-losers))
(and (empty? missed-winners) (empty? matched-losers)))
;; Finds a regex that matches all winners but no losers (lists of strings).
;; Make a pool of regex components, then pick from them to cover winners.
;; On each iteration, add the 'best' component to 'solution',
;; remove winners covered by best, and keep in 'pool' only components
;; that still match some winner.
(define (find-regex winners losers)
(define (score r winners)
(- (* 4 (length (matches r winners)))
(string-length r)))
(let loop ([pool (regex-components winners losers)]
[winners winners]
[solution '()])
(if (empty? winners)
(OR solution)
(let* ([best (argmax (λ(r)(score r winners)) pool)]
[winners (remove* (matches best winners) winners)])
(loop (filter (λ(r)(not (empty? (matches r winners))))
pool)
winners
(cons best solution))))))
;; Returns a set of all the strings that are matched by rx
(define (matches rx lstr)
(filter (λ(w)(regexp-match? rx w)) lstr))
;; Joins a sequence of string with "|" betwen them
(define (OR lstr)
(string-join lstr "|"))
;; Return components that match at least one winner, but no loser.
(define (regex-components winners losers)
(define wholes (map (λ(s)(string-append "^" s "$")) winners))
(define parts
(for*/list ([w (in-list wholes)] [s (in-list (subparts w))] [d (in-list (dotify s))])
d))
(remove-duplicates (append wholes (filter (λ(p)(empty? (matches p losers)))
parts))))
;; Returns a set of subparts of word, consecutive characters up to length 4.
(define (subparts word)
(define len (string-length word))
(for*/list ([i (in-range len)] [n (in-range (min 4 (- len i)))])
(substring word i (+ i n 1))))
;; Returns all ways to replace a subset of chars in part with '.'.
(define (dotify part)
(if (string=? part "")
(list "")
(for*/list ([c (in-list (replacements (string-ref part 0)))]
[rest (in-list (dotify (substring part 1)))])
(string-append (string c) rest))))
;; Returns replacement characters for chr (chr + '.' unless chr is special)
(define (replacements chr)
(if (or (char=? chr #\^) (char=? chr #\$))
(list chr)
(list chr #\.)))
(module+ test
(check set=? (subparts "^it$") '("^" "i" "t" "$" "^i" "it" "t$" "^it" "it$" "^it$"))
(check set=? (subparts "this") '("t" "h" "i" "s" "th" "hi" "is" "thi" "his" "this"))
(check set=? (dotify "it") '("it" "i." ".t" ".."))
(check set=? (dotify "^it$") '("^it$" "^i.$" "^.t$" "^..$"))
(check set=? (dotify "this") '("this" "thi." "th.s" "th.." "t.is" "t.i." "t..s" "t..." ".his" ".hi." ".h.s" ".h.." "..is" "..i." "...s" "...."))
(check set=? (replacements #\x) '(#\x #\.))
(check set=? (replacements #\^) '(#\^))
(check set=? (replacements #\$) '(#\$))
(check set=? (regex-components '("win") '("losers" "bin" "won"))
'("^win$" "^win" "^wi." "wi." "wi" "^wi" "win$" "win" "wi.$"))
(check set=? (regex-components '("win") '("losers" "bin" "won" "wine"))
'("^win$" "win$" "wi.$"))
(check set=? (matches "a|b|c" '("a" "b" "c" "d" "e"))
'("a" "b" "c"))
(check set=? (matches "a|b|c" '("any" "bee" "succeed" "dee" "eee!"))
'("any" "bee" "succeed"))
(check-true (verify "a+b+" '("ab" "aaabb") '("a" "bee" "a b")))
(check-equal? (find-regex '("ahahah" "ciao") '("ahaha" "bye")) "a.$" )
(check-equal? (OR '("a" "b" "c")) "a|b|c")
(check-equal? (OR '("a")) "a")
)
;; Finds a regex to match A but not B, and vice-versa. Prints summary.
(define (find-both A B)
(for ([W (in-list (list A B))] [L (in-list (list B A))] [legend (in-list '("A-B" "B-A"))])
(define solution (find-regex W L))
(when (verify solution W L)
(define ratio (exact->inexact (/ (string-length (string-append "^(" (OR W) ")$"))
(string-length solution))))
(printf "~a chars, ~a ratio, ~a winners ~a: ~a\n"
(string-length solution)
(~r ratio)
(length W)
legend
solution))))
(module+ main
(time (find-both winners losers)))
; cpu time: 51633 real time: 51578 gc time: 587
#!/usr/bin/python
import re
import random
lstr_chars = ['a', 'b', 'c', 'd']
lrx_chars = lstr_chars + ['.', '$', '^']
lstr = [ ''.join([random.choice(lstr_chars) for i in range(10)]) for j in range(1000)]
lrx = [ ''.join([random.choice(lrx_chars) for i in range(4)]) for j in range(1000)]
if __name__ == '__main__':
#print lstr
#print lrx
for s in lstr:
for rx in lrx:
re.search(rx, s)
# 25,78s user 0,01s system 100% cpu 25,772 total
#lang racket
(define lstr-chars '(#\a #\b #\c #\d))
(define lrx-chars (list* #\. #\$ #\^ lstr-chars))
(define (choose l)
(list-ref l (random (length l))))
(define lstr (build-list 1000 (λ(i)(build-string 10 (λ(j)(choose lstr-chars))))))
(define lrx (build-list 1000 (λ(i)(build-string 4 (λ(j)(choose lrx-chars))))))
(void
(for*/list ([str lstr]
[rx lrx]
#:when (regexp-match rx str))
str))
; 23,23s user 0,07s system 100% cpu 23,274 total
@Metaxal
Copy link
Author

Metaxal commented Jun 26, 2014

The huge slowdown is due to regexes not being automatically compiled and cached in Racket.

The simple and efficient solution here is to change:

(define (matches rx lstr)
  (filter (λ(w)(regexp-match? rx w)) lstr))

to

(define (matches rx-str lstr)
  (define rx (regexp rx-str))
  (filter (λ(w)(regexp-match? rx w)) lstr))

The time is then that of Python.

Thanks to Matthew Flatt for the quick answer!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment