Skip to content

Instantly share code, notes, and snippets.

@oxalica
Last active July 7, 2023 18:54
Show Gist options
  • Save oxalica/c720be3fe00f4880e642cccc705f73bb to your computer and use it in GitHub Desktop.
Save oxalica/c720be3fe00f4880e642cccc705f73bb to your computer and use it in GitHub Desktop.
regex-automata 0.3 performance regression
[package]
name = "regex_automata_regression_test"
version = "0.1.0"
edition = "2021"
[dependencies]
criterion = "0.5.1"
once_cell = "1.18.0"
regex-automata = "0.3.1"
regex-automata_0_2 = { package = "regex-automata", version = "0.2.0" }
[profile.release]
# lto = "fat"
use criterion::Criterion;
use once_cell::sync::Lazy;
/// Compiles `pats` into a single multi-pattern dense DFA using regex-automata 0.3.
///
/// The DFA is minimized and only supports anchored searches. Unicode and UTF-8
/// handling are switched off in the syntax configuration, and the intermediate
/// Thompson NFA is built with UTF-8 mode off and shrinking on.
///
/// # Panics
///
/// Panics if the builder rejects the patterns or its configuration.
fn build_dfa_0_3(pats: &[&str]) -> regex_automata::dfa::dense::DFA<Vec<u32>> {
    use regex_automata::dfa::{dense, StartKind};
    use regex_automata::nfa::thompson::Config as NfaConfig;
    use regex_automata::util::syntax::Config as SyntaxConfig;

    // Spell out each configuration layer so the builder chain below stays flat.
    let dfa_config = dense::Config::new()
        .minimize(true)
        .start_kind(StartKind::Anchored);
    let syntax_config = SyntaxConfig::new().unicode(false).utf8(false);
    let nfa_config = NfaConfig::new().utf8(false).shrink(true);

    dense::Builder::new()
        .configure(dfa_config)
        .syntax(syntax_config)
        .thompson(nfa_config)
        .build_many(pats)
        .unwrap()
}
/// Compiles `pats` into a single multi-pattern dense DFA using regex-automata 0.2.
///
/// The settings parallel the 0.3 builder in this file: the DFA is minimized
/// and anchored, Unicode and UTF-8 handling are switched off in the syntax
/// configuration, and the Thompson NFA is built with UTF-8 mode off and
/// shrinking on.
///
/// # Panics
///
/// Panics if the builder rejects the patterns or its configuration.
fn build_dfa_0_2(pats: &[&str]) -> regex_automata_0_2::dfa::dense::DFA<Vec<u32>> {
    use regex_automata_0_2::dfa::dense;
    use regex_automata_0_2::nfa::thompson::Config as NfaConfig;
    use regex_automata_0_2::SyntaxConfig;

    // Spell out each configuration layer so the builder chain below stays flat.
    let dfa_config = dense::Config::new().minimize(true).anchored(true);
    let syntax_config = SyntaxConfig::new().unicode(false).utf8(false);
    let nfa_config = NfaConfig::new().utf8(false).shrink(true);

    dense::Builder::new()
        .configure(dfa_config)
        .syntax(syntax_config)
        .thompson(nfa_config)
        .build_many(pats)
        .unwrap()
}
/// Declares two lazily initialized statics holding the same pattern set
/// compiled by both library versions.
///
/// Invocation shape: `regex_dfa! { NEW, OLD { LABEL = "pattern", ... } }`.
/// Every entry needs a trailing comma. `NEW` is built with `build_dfa_0_3`
/// and `OLD` with `build_dfa_0_2`. The label on the left of each `=` is
/// accepted by the matcher but never expanded, so it serves purely as
/// documentation for the pattern next to it.
macro_rules! regex_dfa {
    ($new_dfa:ident, $old_dfa:ident { $($label:path = $pattern:literal,)* }) => {
        static $new_dfa: Lazy<regex_automata::dfa::dense::DFA<Vec<u32>>> =
            Lazy::new(|| build_dfa_0_3(&[$($pattern),*]));

        static $old_dfa: Lazy<regex_automata_0_2::dfa::dense::DFA<Vec<u32>>> =
            Lazy::new(|| build_dfa_0_2(&[$($pattern),*]));
    };
}
// Defines the statics `DFA03` and `DFA02` from one shared list of token patterns.
// The name on the left of each `=` is only a label: `regex_dfa!` matches it but
// does not use it in the expansion.
// NOTE(review): entry order presumably determines the pattern index the lexers
// report and which pattern wins when several match at the same position
// (e.g. `LT_EQ` is listed before `LT`) — confirm before reordering.
regex_dfa! {
DFA03, DFA02 {
SPACE = r"[ \r\n\t]+",
COMMENT = r"#.*|/\*([^*]|\*[^/])*\*/",
PATH_START = r"(~|[a-zA-Z0-9._+-]*)/([a-zA-Z0-9._+-][/a-zA-Z0-9._+-]*)?\$\{",
PATH = r"(~|[a-zA-Z0-9._+-]*)/[a-zA-Z0-9._+-][/a-zA-Z0-9._+-]*",
SEARCH_PATH = r"<[a-zA-Z0-9._+-]+(/[a-zA-Z0-9._+-]+)*>",
FLOAT = r"(\d+\.\d*|\.\d+)([Ee][+-]?\d+)?",
INT = r"\d+",
URI = r"[a-zA-Z][a-zA-Z0-9.+-]*:[a-zA-Z0-9%/?:@&=+$,_.!~*'-]+",
IDENT = r"[a-zA-Z_][a-zA-Z0-9_'-]*",
DQUOTE = "\"",
QUOTE2 = r"''",
DOT3 = r"\.\.\.",
MINUS_GT = r"->",
OR2 = r"\|\|",
AND2 = r"&&",
EQ2 = r"==",
NOT_EQ = r"!=",
LT_EQ = r"<=",
GT_EQ = r">=",
SLASH2 = r"//",
PLUS2 = r"\+\+",
DOLLAR_L_CURLY = r"\$\{",
L_CURLY = r"\{",
R_CURLY = r"}",
L_BRACK = r"\[",
R_BRACK = r"]",
L_PAREN = r"\(",
R_PAREN = r"\)",
AT = r"@",
COLON = r":",
SEMICOLON = r";",
COMMA = r",",
QUESTION = r"\?",
PLUS = r"\+",
MINUS = r"-",
STAR = r"\*",
DOT = r"\.",
SLASH = r"/",
LT = r"<",
GT = r">",
BANG = r"!",
EQ = r"=",
}
}
/// Tokenizes `src` with the regex-automata 0.3 DFA (`DFA03`).
///
/// Starting at the beginning of `src`, runs an anchored forward search on the
/// remaining bytes. On a match, records `(pattern index, match end offset)`
/// and advances by that offset; the offset is measured from the current
/// position, not from the start of `src`. When nothing matches at the current
/// position, one byte is skipped and nothing is recorded.
///
/// # Panics
///
/// Panics with "No quit byte" if the search returns an error.
pub fn lex_0_3(src: &[u8]) -> Vec<(u32, u32)> {
    use regex_automata::dfa::Automaton;
    use regex_automata::{Anchored, Input};

    let mut tokens = Vec::new();
    let mut pos = 0;
    while pos < src.len() {
        let search = Input::new(&src[pos..]).anchored(Anchored::Yes);
        match DFA03.try_search_fwd(&search).expect("No quit byte") {
            Some(found) => {
                let end = found.offset();
                tokens.push((found.pattern().as_u32(), end as u32));
                pos += end;
            }
            // No token starts here: step over a single byte and retry.
            None => pos += 1,
        }
    }
    tokens
}
/// Tokenizes `src` with the regex-automata 0.2 DFA (`DFA02`).
///
/// Starting at the beginning of `src`, runs a leftmost forward search on the
/// remaining bytes. On a match, records `(pattern index, match end offset)`
/// and advances by that offset; the offset is measured from the current
/// position, not from the start of `src`. When nothing matches at the current
/// position, one byte is skipped and nothing is recorded.
///
/// # Panics
///
/// Panics with "No quit byte" if the search returns an error.
pub fn lex_0_2(src: &[u8]) -> Vec<(u32, u32)> {
    use regex_automata_0_2::dfa::Automaton;

    let mut tokens = Vec::new();
    let mut pos = 0;
    while pos < src.len() {
        match DFA02.find_leftmost_fwd(&src[pos..]).expect("No quit byte") {
            Some(found) => {
                let end = found.offset();
                tokens.push((found.pattern().as_u32(), end as u32));
                pos += end;
            }
            // No token starts here: step over a single byte and retry.
            None => pos += 1,
        }
    }
    tokens
}
/// Benchmarks both lexers on the file named by the `BENCH_FILE` environment variable.
///
/// Before timing anything, asserts that `lex_0_2` and `lex_0_3` produce the
/// same output for the input, so the two benchmarks measure equivalent work.
///
/// # Panics
///
/// Panics if `BENCH_FILE` is unset, if the file cannot be read as a UTF-8
/// string, or if the two lexers disagree.
fn bench_lex(c: &mut Criterion) {
    let bench_file = std::env::var("BENCH_FILE").expect("missing env var BENCH_FILE");
    let text = std::fs::read_to_string(bench_file).expect("failed to read test file");
    let bytes = text.as_bytes();

    // Sanity check: both library versions must tokenize the input identically.
    assert_eq!(lex_0_2(bytes), lex_0_3(bytes));

    c.bench_function("lex_0_3", |bencher| {
        bencher.iter(|| lex_0_3(bytes));
    });
    c.bench_function("lex_0_2", |bencher| {
        bencher.iter(|| lex_0_2(bytes));
    });
}
// Register `bench_lex` in a criterion group named `benches`, then let
// criterion generate the program entry point that runs that group.
criterion::criterion_group!(benches, bench_lex);
criterion::criterion_main!(benches);
Platform: AMD Ryzen 7 5700G, x86_64-linux
The environment variable `BENCH_FILE` must point to a local copy of the file downloaded from https://raw.githubusercontent.com/NixOS/nixpkgs/f9e94676ce6c7531c44d38da61d2669ebec0f603/pkgs/top-level/all-packages.nix
It has SHA256 8d92a88a402b57099f18d14ece5b3bfcf0e1280dd702f6802526ff2cab2a0d4f
Command: `BENCH_FILE=/path/to/all-packages.nix cargo run --release -- --bench`
Without LTO (`lex_0_3` takes 211% more time than `lex_0_2`):
lex_0_3 time: [16.133 ms 16.170 ms 16.199 ms]
lex_0_2 time: [5.1895 ms 5.2005 ms 5.2110 ms]
With `lto = "fat"` (`lex_0_3` takes 77% more time than `lex_0_2`):
lex_0_3 time: [7.6775 ms 7.6850 ms 7.6944 ms]
lex_0_2 time: [4.3312 ms 4.3333 ms 4.3357 ms]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment