Skip to content

Instantly share code, notes, and snippets.

@otherjoel
Last active March 21, 2019 02:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save otherjoel/4366960058983073ce01fa27f1a4d09d to your computer and use it in GitHub Desktop.
Save otherjoel/4366960058983073ce01fa27f1a4d09d to your computer and use it in GitHub Desktop.
Benchmarking regular expressions against purpose-built functions in Racket
#lang racket/base
;; Comparing the speed of two methods for getting the first N words out of the string elements
;; of a tagged X-expression.
;;
;; Licensed under the Blue Oak Model License 1.0.0 (blueoakcouncil.org/license/1.0.0)
(require racket/list
txexpr)
;; Concatenate all the string elements out of a tagged x-expression (ignoring attributes)
(define (tx-strs xpr)
(cond
[(txexpr? xpr) (apply string-append (map tx-strs (get-elements xpr)))]
[(string? xpr) xpr]
[else ""]))
;; The “simple, short” version which uses regular expressions to do all the work.
(define (first-words-regex txs n)
;; See: https://regex101.com/r/IpOqXy/23
(define word-regex
#px"(?<=\\s|^|\\b)(?:[-'‘’!.%$#&\\/]\\b|\\b[-'‘’!.%$#&\\/]|[A-Za-z0-9]|\\([A-Za-z0-9]+\\))+(?=\\s|$|\\b)")
(define word-list (regexp-match* word-regex (tx-strs (first txs))))
(define words (apply string-append (add-between (take word-list (min (length word-list) n)) " ")))
(cond [(>= (length word-list) n) words]
[(null? (rest txs)) words]
[else (string-append words " " (first-words-regex (rest txs) (- n (length word-list))))]))
;; The “long, complicated” version which manually examines one character at a time
(define (first-words txprs words-needed)
(define punc-allowed-in-word '(#\- #\' #\% #\$ #\‘ #\’ #\# #\& #\/ #\. #\!))
(define (word-boundary? c) (or (char-whitespace? c) (equal? c #\null) (eof-object? c)))
(define (word-char? c) (or (char-alphabetic? c) (char-numeric? c)))
(define in (open-input-string (tx-strs (first txprs))))
(define out (open-output-string))
(define words-found
(let loop ([words-found 0] [last-c #\null] [last-c-in-word? #f])
(define c (read-char in))
(cond [(equal? words-found words-needed) words-found]
[(eof-object? c)
(cond [(positive? words-found) (if last-c-in-word? (+ 1 words-found) words-found)]
[else 0])]
[else
(define-values (write-this-char? new-word-count c-in-word?)
(cond
;; Spaces increment the word count if the previous character was part of,
;; or adjacent to, a word
[(and (char-whitespace? c) last-c-in-word?)
(values (if (equal? words-needed (+ 1 words-found)) #f #t) (+ 1 words-found) #f)]
;; Some punctuation survives if the previous or next char is part of a word
[(member c punc-allowed-in-word)
(cond [(or (word-char? last-c) (word-char? (peek-char in)))
(values #t words-found #t)]
[else (values #f words-found #f)])]
[(word-char? c)
(values #t words-found #t)]
;; If c is a non-whitespace non-allowed character that immediately follows a word,
;; do not write it out but count it as being part of the word.
[(and (not (word-char? c)) (not (char-whitespace? c)) last-c-in-word?)
(values #f words-found #t)]
[else (values #f words-found #f)]))
(cond [write-this-char? (write-char c out)])
(loop new-word-count c c-in-word?)])))
(define words (get-output-string out))
(cond [(equal? words-found words-needed) words]
[(equal? '() (rest txprs)) words]
[else (string-append words " " (first-words (rest txprs) (- words-needed words-found)))]))
;; This is where we compare their speeds. Each test runs both functions 10,000 times.
(module+ test
(require sugar/debug)
(define (compare-with txs)
(time-repeat* 10000
(first-words txs 5)
(first-words-regex txs 5)))
(define short-txexprs
'((p "She counted (" (em "one, two") "— silently, eyes unblinking")))
(define another-short-txexprs
'((p "“Stop!” she called.") (p "(She was never one to be silent.)")))
(define longer-txexprs
'((p (span [[class "newthought"]] "In a") " 2005 episode of This American Life, titled “A Little Bit of Knowledge”, " (a ((href "http://www.thisamericanlife.org/radio-archives/episode/293/a-little-bit-of-knowledge?act=3#play")) "we hear the story of an electrician") ", who, despite being fairly smart, nonetheless deludes himself into thinking he has disproved Einstein’s theory of relativity. The fact that he’s reasonably intelligent makes it all the harder for him to see his own error, even when confronted by actual experts.") (blockquote (b "Bob Berenz:") " All right, in this point I have to be completely honest. I did write a paper early on, and I submitted it to a physics site. And it was summarily rejected out of hand. But I did learn an important lesson, that physicists and what’s being done by them is very complicated, very mathematically intensive. What I’ve got is none of that, so it completely, almost in reverse, goes over their heads.”") (p "I listened to all this with great interest, because I fear that I myself might be just like Bob. In fact, I waver between thinking I am " (em "in danger") " of " (em "becoming like Bob") ", and thinking I " (em "have been like Bob for years") " and am " (em "just now realizing it") ". I have big, long-nursed theories of my own about subjects I have no formal training in, and I even write & tweet on those subjects, with few to no disclaimers." (sup (a ((href "#2_fndef") (id "2_fn1")) "(2)")))))
(compare-with short-txexprs)
(compare-with another-short-txexprs) ; [slightly different results]
(compare-with longer-txexprs))
❯ racket -v
Welcome to Racket v7.2.
❯ raco test 0-test-func.rkt
raco test: (submod "test-func.rkt" test)
cpu time: 145 real time: 145 gc time: 9
cpu time: 163 real time: 164 gc time: 1
"She counted one two silently"
"She counted one two silently"
cpu time: 121 real time: 121 gc time: 0
cpu time: 194 real time: 194 gc time: 0
"Stop! she called. She was"
"Stop she called. She was"
cpu time: 351 real time: 352 gc time: 1
cpu time: 1321 real time: 1324 gc time: 2
"In a 2005 episode of"
"In a 2005 episode of"
❯ raco test 0-test-func.rkt
raco test: (submod "test-func.rkt" test)
cpu time: 143 real time: 144 gc time: 9
cpu time: 161 real time: 161 gc time: 1
"She counted one two silently"
"She counted one two silently"
cpu time: 120 real time: 120 gc time: 0
cpu time: 194 real time: 195 gc time: 1
"Stop! she called. She was"
"Stop she called. She was"
cpu time: 351 real time: 351 gc time: 0
cpu time: 1322 real time: 1326 gc time: 4
"In a 2005 episode of"
"In a 2005 episode of"
❯ raco test 0-test-func.rkt
raco test: (submod "test-func.rkt" test)
cpu time: 146 real time: 147 gc time: 9
cpu time: 182 real time: 183 gc time: 2
"She counted one two silently"
"She counted one two silently"
cpu time: 121 real time: 121 gc time: 0
cpu time: 193 real time: 193 gc time: 0
"Stop! she called. She was"
"Stop she called. She was"
cpu time: 341 real time: 342 gc time: 1
cpu time: 1316 real time: 1319 gc time: 1
"In a 2005 episode of"
"In a 2005 episode of"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment