Skip to content

Instantly share code, notes, and snippets.

@ryanuber
Created December 25, 2015 06:03
Show Gist options
  • Save ryanuber/3e50b307e5afd86d877e to your computer and use it in GitHub Desktop.
Save ryanuber/3e50b307e5afd86d877e to your computer and use it in GitHub Desktop.
Fast text scanner in Go
diff --git a/bench_test.go b/bench_test.go
new file mode 100644
index 0000000..1431cfb
--- /dev/null
+++ b/bench_test.go
@@ -0,0 +1,174 @@
+package license
+
+import (
+ "bytes"
+ "strings"
+ "testing"
+)
+
+// The benchmarks in this file test the performance of our text scanner. This
+// text scanning must be repeated (at least in the current go-license code),
+// and as more licenses are added, this process becomes more expensive. The
+// performance we get out of these benchmarks is typically around 2x faster
+// than other comparable functions from the stdlib. This is a special case
+// scanner, and may not yield such results in all applications. For good
+// measure, we also benchmark the same data set with some stdlib functions.
+
+func BenchmarkScan(b *testing.B) {
+ for n := 0; n < b.N; n++ {
+ if !scan(text, match) {
+ b.Fatalf("should match")
+ }
+ }
+}
+
+func BenchmarkScanNoMatch(b *testing.B) {
+ for n := 0; n < b.N; n++ {
+ if scan(text, noMatch) {
+ b.Fatalf("should not match")
+ }
+ }
+}
+
+func BenchmarkStringsContains(b *testing.B) {
+ for n := 0; n < b.N; n++ {
+ if !strings.Contains(text, match) {
+ b.Fatalf("should match")
+ }
+ }
+}
+
+func BenchmarkStringsContainsNoMatch(b *testing.B) {
+ for n := 0; n < b.N; n++ {
+ if strings.Contains(text, noMatch) {
+ b.Fatalf("should not match")
+ }
+ }
+}
+
+func BenchmarkStringsIndex(b *testing.B) {
+ for n := 0; n < b.N; n++ {
+ if strings.Index(text, match) == -1 {
+ b.Fatalf("should match")
+ }
+ }
+}
+
+func BenchmarkStringsIndexNoMatch(b *testing.B) {
+ for n := 0; n < b.N; n++ {
+ if strings.Index(text, noMatch) != -1 {
+ b.Fatalf("should not match")
+ }
+ }
+}
+
+func BenchmarkBytesContains(b *testing.B) {
+ subject := []byte(text)
+ pattern := []byte(match)
+ for n := 0; n < b.N; n++ {
+ if !bytes.Contains(subject, pattern) {
+ b.Fatalf("should match")
+ }
+ }
+}
+
+func BenchmarkBytesContainsNoMatch(b *testing.B) {
+ subject := []byte(text)
+ pattern := []byte(noMatch)
+ for n := 0; n < b.N; n++ {
+ if bytes.Contains(subject, pattern) {
+ b.Fatalf("should match")
+ }
+ }
+}
+
+func BenchmarkBytesIndex(b *testing.B) {
+ subject := []byte(text)
+ pattern := []byte(match)
+ for n := 0; n < b.N; n++ {
+ if bytes.Index(subject, pattern) == -1 {
+ b.Fatalf("should match")
+ }
+ }
+}
+
+func BenchmarkBytesIndexNoMatch(b *testing.B) {
+ subject := []byte(text)
+ pattern := []byte(noMatch)
+ for n := 0; n < b.N; n++ {
+ if bytes.Index(subject, pattern) != -1 {
+ b.Fatalf("should match")
+ }
+ }
+}
+
+const match = "Persius maluisset eu per, per ad agam voluptua."
+const noMatch = "This text doesn't match anything."
+const text = `
+Lorem ipsum dolor sit amet, eos cu prima adipiscing interesset, ad accusam
+reformidans pro. Nostrum consequat dissentiet ad sed, nullam graeci voluptua
+usu et. Ea clita percipitur eloquentiam est. Per ea mutat nominavi, probo
+dolorem ex has.
+
+Ut cum saperet quaestio erroribus, ea alii integre cum. Vis cu assum soleat
+dissentiet, pro at modus malorum partiendo. Ei duis reprimique per, consulatu
+cotidieque qui eu. Diceret temporibus sea ne. Pro in alii falli inermis, consul
+mediocrem cu vel.
+
+Omnes viderer invenire vis ut, pro no iuvaret persecuti, hinc omnis aperiri pro
+ne. Eos minim denique pertinacia no, pro eripuit postulant in. Vix ad illud
+ornatus tractatos, aliquam facilis noluisse duo ne, mei aeque viris primis te.
+Diceret expetenda consulatu ea sed, ius mucius essent cu. Ex amet illud falli
+eos, mei eu falli lobortis aliquando.
+
+Eu homero omittam usu. Ut pertinacia disputando pro, vel cu wisi prompta, eu
+sit putant legendos. At sit consul mediocrem voluptatibus, pri assum inani in.
+Eu cum alia ullum dicam, his nemore adipisci inimicus ei, alii numquam
+urbanitas nam ex. Probo sanctus detracto eos ut. Ut sed cibo tacimates, prompta
+utroque sed ad.
+
+Sit ei odio nisl praesent. At democritum deterruisset eos. Ex eligendi
+qualisque eos. Est et elit dicam menandri. Pri adipiscing efficiantur ei,
+labitur feugiat in pri, quo in utroque repudiare.
+
+Postea fabellas recusabo vim ad, dolore singulis eu cum, mei autem gubergren
+vituperatoribus in. Ad aeque dolorem accusam pri, nec error saepe ex. Vim ad
+nostro labores, ea sumo animal salutandi has. Ius facete detraxit in, porro
+atqui his ne. Mel ei mucius hendrerit, nulla mazim posidonium usu cu, mei te
+percipit forensibus disputationi.
+
+Ex sea audire fabellas, est ut iusto constituto, nam at causae tractatos
+consectetuer. Duo dicant iisque suscipit ei, simul legere no has, cu quo vero
+lucilius dissentiet. Ut sed vidit probo dignissim. Melius gubergren ex sed. Usu
+novum ludus melius et. Id clita impetus mea.
+
+Ea mutat timeam feugait nam, no erant nostro similique nam. Et sea nulla
+bonorum, usu torquatos expetendis an. At eum expetenda adversarium, pri ne
+vocent postulant facilisis. Nec solum malorum noluisse cu, et nam sale propriae
+voluptatibus. Te mel dissentias eloquentiam, inani tantas delectus his et, an
+moderatius concludaturque sea. Eam oportere erroribus theophrastus in.
+
+Saepe iriure eripuit ea per. Nobis graeco constituam cu vix. Vim everti
+argumentum scripserit et. In vis accumsan expetendis, denique fabellas in eos.
+
+Est et nullam eligendi. Ei vim explicari abhorreant vituperatoribus, vidit
+dolore consulatu vim ea. Vitae debitis constituto sit ex. Cu soluta epicuri
+est, vix no tincidunt assueverit, per latine nonumes percipitur ea.
+
+Usu primis petentium consequat ne, ei verear eruditi sea. Quo an liber soluta
+molestiae, ne per debet principes, te mea animal feugiat adversarium. Dicunt
+volutpat adversarium an eum, ea epicuri recusabo expetendis quo. Mel causae
+omittam facilisis ea, consul audiam mei an. Ad pri utamur dolorum, mea alia
+noluisse id. Te sit rebum reprimique.
+
+Persius maluisset eu per, per ad agam voluptua. Facilis tincidunt sed eu. Quo
+ne eros nobis. Eros debet honestatis cum at, ius nihil percipit no.
+
+Harum vituperata pro cu, simul feugait an per. Cu vix alii adhuc, usu no omnis
+prodesset comprehensam. Ut vis facete moderatius. Vim posse maiestatis an, vel
+nulla quodsi ne, his ne dictas malorum vulputate. Eos ut quod latine dolorem.
+In singulis eleifend mel, ius ut offendit recteque.
+
+Aliquip oportere ei vix, vel ad integre rationibus. Vis et aliquip maiorum
+intellegat. Nullam doctus deserunt ei has. Ut everti volumus vis.
+`
diff --git a/license.go b/license.go
index d6d0a8f..a84b2f9 100644
--- a/license.go
+++ b/license.go
@@ -289,9 +289,34 @@ func (l *License) GuessType() error {
return nil
}
-// scan is a shortcut function to check for a literal match within a string
-// of text. Any text transformation should be done prior to calling this
-// function so that it need not be repeated for every check.
-func scan(text, match string) bool {
- return strings.Contains(text, match)
+// scan is a fast substring matcher.
+func scan(text, pattern string) bool {
+ // Get the last index in the pattern. Guard passing an empty
+ // pattern string by returning false.
+ last := len(pattern) - 1
+ if last == -1 {
+ return false
+ }
+
+ // The last possible starting index for a match. If out iterator
+ // passes this index, we can safely assume there are no matches.
+ lindex := len(text) - last
+
+OUTER:
+ for i := 0; i < lindex; i++ {
+ // Compare just the first byte of the pattern.
+ if text[i] != pattern[0] {
+ continue
+ }
+
+ // Reverse scan the rest of the chunk which possibly contains
+ // a match. This deprioritizes scanning common prefixes.
+ for n := last; n > 0; n-- {
+ if text[i+n] != pattern[n] {
+ continue OUTER
+ }
+ }
+ return true
+ }
+ return false
}
diff --git a/license_test.go b/license_test.go
index 5849fb0..59d64c3 100644
--- a/license_test.go
+++ b/license_test.go
@@ -226,3 +226,20 @@ func TestLicenseFiles(t *testing.T) {
t.Fatalf("expect %d files, got: %d", n, len(files))
}
}
+
+func TestScan(t *testing.T) {
+ input := "This is a test. Hello, world!"
+ tcases := map[string]bool{
+ "Hello": true, // Matches in the middle
+ "world!": true, // Matches at the end
+ "This": true, // Matches at the beginning
+ "nope": false, // Doesn't match
+ "": false, // Empty pattern guard
+ "This is longer than the input string": false,
+ }
+ for pattern, result := range tcases {
+ if out := scan(input, pattern); out != result {
+ t.Fatalf("bad result: %q (%v)", pattern, out)
+ }
+ }
+}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment