Last active
February 10, 2023 02:23
-
-
Save ericjster/3acadaa422dd889df18846d4d9e7ae86 to your computer and use it in GitHub Desktop.
Golang regexp timing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
// Looking at timing of regex. | |
// Regex example from Damian Gryski's article: | |
// https://medium.com/@dgryski/speeding-up-regexp-matching-with-ragel-4727f1c16027 | |
// | |
// Let's measure a hand-coded regex and compare with the regex library and ragel | |
// state machine. In this case, the hand-coded version is less than 10% of the | |
// cost. Obviously 1 line of code is more understandable and maintainable than | |
// 50 lines with hardcoded constants, but it might be useful to know what could | |
// be achieved. | |
// go test -test.bench=. | |
// goos: darwin | |
// goarch: amd64 | |
// BenchmarkRegex-8 10000000 204 ns/op | |
// BenchmarkHand-8 50000000 22.7 ns/op | |
// BenchmarkHand2-8 100000000 17.8 ns/op | |
// PASS | |
// Loop over optimization flags. -B = disable bounds check. | |
// | |
// for opt in "" -gcflags=-B; do echo "opt=$opt"; go test $opt -test.bench=.; done | |
// opt= | |
// goos: darwin | |
// goarch: amd64 | |
// BenchmarkRegex-8 10000000 208 ns/op | |
// BenchmarkHand-8 100000000 22.1 ns/op | |
// BenchmarkHand2-8 100000000 18.1 ns/op | |
// BenchmarkHand3-8 30000000 52.9 ns/op | |
// PASS | |
// ok _/Users/eric/tmp/191116_regex_timing 8.025s | |
// opt=-gcflags=-B | |
// goos: darwin | |
// goarch: amd64 | |
// BenchmarkRegex-8 10000000 210 ns/op | |
// BenchmarkHand-8 100000000 21.9 ns/op | |
// BenchmarkHand2-8 100000000 19.8 ns/op | |
// BenchmarkHand3-8 30000000 52.7 ns/op | |
// PASS | |
// ok _/Users/eric/tmp/191116_regex_timing 8.202s | |
import ( | |
"regexp" | |
"strings" | |
"testing" | |
) | |
// sample input line from the blog post | |
var data = []byte(`Jan 18 06:41:30 corecompute sshd[42327]: Failed keyboard-interactive/pam for root from 112.100.68.182 port 48803 ssh2`) | |
var reSSHD = regexp.MustCompile(`sshd\[\d{5}\]:\s*Failed`) | |
func BenchmarkRegex(b *testing.B) { | |
var hits int | |
for i := 0; i < b.N; i++ { | |
if reSSHD.Match(data) { | |
hits++ | |
} | |
} | |
if hits <= 0 { | |
b.Fatalf("No hits") | |
} | |
} | |
func BenchmarkHand(b *testing.B) { | |
var hits int | |
for i := 0; i < b.N; i++ { | |
if matchSSHDHand(data) { | |
hits++ | |
} | |
} | |
if hits <= 0 { | |
b.Fatalf("No hits") | |
} | |
} | |
func BenchmarkHand2(b *testing.B) { | |
var hits int | |
for i := 0; i < b.N; i++ { | |
if matchSSHDHand2(data) { | |
hits++ | |
} | |
} | |
if hits <= 0 { | |
b.Fatalf("No hits") | |
} | |
} | |
func BenchmarkHand3(b *testing.B) { | |
var hits int | |
for i := 0; i < b.N; i++ { | |
if matchSSHDHand3(data) { | |
hits++ | |
} | |
} | |
if hits <= 0 { | |
b.Fatalf("No hits") | |
} | |
} | |
func isdigit(b byte) bool { | |
return '0' <= b && b <= '9' | |
} | |
func isspace(b byte) bool { | |
return b == ' ' || b == '\t' | |
} | |
func matchSSHDHand(data []byte) bool { | |
// Hand code the regex match. | |
// regex: "sshd\[\d{5}\]:\s*Failed" | |
n := len(data) | |
j := -1 | |
_ = data[n-1] // Maybe help with overflow check | |
for i := 0; i < n-12; i++ { | |
if data[i+0] == 's' && | |
data[i+1] == 's' && | |
data[i+2] == 'h' && | |
data[i+3] == 'd' && | |
data[i+4] == '[' && | |
isdigit(data[i+5]) && | |
isdigit(data[i+6]) && | |
isdigit(data[i+7]) && | |
isdigit(data[i+8]) && | |
isdigit(data[i+9]) && | |
data[i+10] == ']' && | |
data[i+11] == ':' { | |
j = i | |
break | |
} | |
} | |
if j < 0 { | |
return false | |
} | |
for ; isspace(data[j]) && j < n-6; j++ { | |
} | |
for i := j; i < n-6; i++ { | |
if data[i+0] == 'F' && | |
data[i+1] == 'a' && | |
data[i+2] == 'i' && | |
data[i+3] == 'l' && | |
data[i+4] == 'e' && | |
data[i+5] == 'd' { | |
return true | |
} | |
} | |
return false | |
} | |
func matchSSHDHand2(data []byte) bool { | |
// Hand code the regex match. | |
// regex: "sshd\[\d{5}\]:\s*Failed" | |
n := len(data) | |
_ = data[n-1] // Maybe help with overflow check | |
for i := 0; i < n-12; i++ { | |
if data[i+0] == 's' && | |
data[i+1] == 's' && | |
data[i+2] == 'h' && | |
data[i+3] == 'd' && | |
data[i+4] == '[' && | |
isdigit(data[i+5]) && | |
isdigit(data[i+6]) && | |
isdigit(data[i+7]) && | |
isdigit(data[i+8]) && | |
isdigit(data[i+9]) && | |
data[i+10] == ']' && | |
data[i+11] == ':' { | |
var j int | |
for j = i + 12; isspace(data[j]) && j < n-6; j++ { | |
} | |
for ; j < n-6; j++ { | |
if data[j+0] == 'F' && | |
data[j+1] == 'a' && | |
data[j+2] == 'i' && | |
data[j+3] == 'l' && | |
data[j+4] == 'e' && | |
data[j+5] == 'd' { | |
return true | |
} | |
} | |
} | |
} | |
return false | |
} | |
func matchSSHDHand3(data []byte) bool { | |
// Hand code the regex match. | |
// regex: "sshd\[\d{5}\]:\s*Failed" | |
sdata := string(data) | |
n := len(sdata) | |
_ = sdata[n-1] // Maybe help with overflow check | |
for { | |
// Note that strings.Index uses SSE and AVX instructions. | |
i := strings.Index(sdata, "sshd[") | |
if i < 0 || i+5+7 >= n { | |
return false | |
} | |
i += 5 | |
if isdigit(sdata[i+0]) && | |
isdigit(sdata[i+1]) && | |
isdigit(sdata[i+2]) && | |
isdigit(sdata[i+3]) && | |
isdigit(sdata[i+4]) && | |
sdata[i+5] == ']' && | |
sdata[i+6] == ':' { | |
i += 7 | |
for ; isspace(sdata[i]) && i+5+1 < n; i++ { | |
} | |
if sdata[i+0] == 'F' && | |
sdata[i+1] == 'a' && | |
sdata[i+2] == 'i' && | |
sdata[i+3] == 'l' && | |
sdata[i+4] == 'e' && | |
sdata[i+5] == 'd' { | |
return true | |
} | |
} | |
sdata = sdata[5:] | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment