Last active
July 7, 2023 18:54
-
-
Save oxalica/c720be3fe00f4880e642cccc705f73bb to your computer and use it in GitHub Desktop.
regex-automata 0.3 performance regression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[package]
name = "regex_automata_regression_test"
version = "0.1.0"
edition = "2021"
[dependencies]
criterion = "0.5.1"
once_cell = "1.18.0"
regex-automata = "0.3.1"
regex-automata_0_2 = { package = "regex-automata", version = "0.2.0" }
[profile.release]
# lto = "fat"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use criterion::Criterion; | |
use once_cell::sync::Lazy; | |
fn build_dfa_0_3(pats: &[&str]) -> regex_automata::dfa::dense::DFA<Vec<u32>> { | |
use regex_automata::dfa::{dense, StartKind}; | |
use regex_automata::nfa::thompson::Config as NfaConfig; | |
use regex_automata::util::syntax::Config as SyntaxConfig; | |
dense::Builder::new() | |
.configure( | |
dense::Config::new() | |
.minimize(true) | |
.start_kind(StartKind::Anchored), | |
) | |
.syntax(SyntaxConfig::new().unicode(false).utf8(false)) | |
.thompson(NfaConfig::new().utf8(false).shrink(true)) | |
.build_many(pats) | |
.unwrap() | |
} | |
fn build_dfa_0_2(pats: &[&str]) -> regex_automata_0_2::dfa::dense::DFA<Vec<u32>> { | |
use regex_automata_0_2::dfa::dense; | |
use regex_automata_0_2::nfa::thompson::Config as NfaConfig; | |
use regex_automata_0_2::SyntaxConfig; | |
dense::Builder::new() | |
.configure(dense::Config::new().minimize(true).anchored(true)) | |
.syntax(SyntaxConfig::new().unicode(false).utf8(false)) | |
.thompson(NfaConfig::new().utf8(false).shrink(true)) | |
.build_many(pats) | |
.unwrap() | |
} | |
/// Declares two lazily-initialized statics built from one shared pattern list.
///
/// `$new_dfa` is built with `build_dfa_0_3` and `$old_dfa` with
/// `build_dfa_0_2`; both receive the pattern literals in the order written.
/// The name on the left of each `=` is accepted by the matcher but is not
/// used in the expansion. Every entry, including the last one, must end with
/// a comma.
macro_rules! regex_dfa {
    (
        $new_dfa:ident, $old_dfa:ident {
            $($name:path = $pattern:literal,)*
        }
    ) => {
        static $new_dfa: Lazy<regex_automata::dfa::dense::DFA<Vec<u32>>> =
            Lazy::new(|| build_dfa_0_3(&[$($pattern),*]));
        static $old_dfa: Lazy<regex_automata_0_2::dfa::dense::DFA<Vec<u32>>> =
            Lazy::new(|| build_dfa_0_2(&[$($pattern),*]));
    };
}
regex_dfa! { | |
DFA03, DFA02 { | |
SPACE = r"[ \r\n\t]+", | |
COMMENT = r"#.*|/\*([^*]|\*[^/])*\*/", | |
PATH_START = r"(~|[a-zA-Z0-9._+-]*)/([a-zA-Z0-9._+-][/a-zA-Z0-9._+-]*)?\$\{", | |
PATH = r"(~|[a-zA-Z0-9._+-]*)/[a-zA-Z0-9._+-][/a-zA-Z0-9._+-]*", | |
SEARCH_PATH = r"<[a-zA-Z0-9._+-]+(/[a-zA-Z0-9._+-]+)*>", | |
FLOAT = r"(\d+\.\d*|\.\d+)([Ee][+-]?\d+)?", | |
INT = r"\d+", | |
URI = r"[a-zA-Z][a-zA-Z0-9.+-]*:[a-zA-Z0-9%/?:@&=+$,_.!~*'-]+", | |
IDENT = r"[a-zA-Z_][a-zA-Z0-9_'-]*", | |
DQUOTE = "\"", | |
QUOTE2 = r"''", | |
DOT3 = r"\.\.\.", | |
MINUS_GT = r"->", | |
OR2 = r"\|\|", | |
AND2 = r"&&", | |
EQ2 = r"==", | |
NOT_EQ = r"!=", | |
LT_EQ = r"<=", | |
GT_EQ = r">=", | |
SLASH2 = r"//", | |
PLUS2 = r"\+\+", | |
DOLLAR_L_CURLY = r"\$\{", | |
L_CURLY = r"\{", | |
R_CURLY = r"}", | |
L_BRACK = r"\[", | |
R_BRACK = r"]", | |
L_PAREN = r"\(", | |
R_PAREN = r"\)", | |
AT = r"@", | |
COLON = r":", | |
SEMICOLON = r";", | |
COMMA = r",", | |
QUESTION = r"\?", | |
PLUS = r"\+", | |
MINUS = r"-", | |
STAR = r"\*", | |
DOT = r"\.", | |
SLASH = r"/", | |
LT = r"<", | |
GT = r">", | |
BANG = r"!", | |
EQ = r"=", | |
} | |
} | |
pub fn lex_0_3(src: &[u8]) -> Vec<(u32, u32)> { | |
use regex_automata::dfa::Automaton; | |
use regex_automata::{Anchored, Input}; | |
let mut out = Vec::new(); | |
let mut offset = 0; | |
while offset < src.len() { | |
let rest = &src[offset..]; | |
let input = Input::new(rest).anchored(Anchored::Yes); | |
let Some(m) = DFA03.try_search_fwd(&input).expect("No quit byte") else { | |
offset += 1; | |
continue; | |
}; | |
out.push((m.pattern().as_u32(), m.offset() as u32)); | |
offset += m.offset(); | |
} | |
out | |
} | |
pub fn lex_0_2(src: &[u8]) -> Vec<(u32, u32)> { | |
use regex_automata_0_2::dfa::Automaton; | |
let mut out = Vec::new(); | |
let mut offset = 0; | |
while offset < src.len() { | |
let rest = &src[offset..]; | |
let Some(m) = DFA02.find_leftmost_fwd(rest).expect("No quit byte") else { | |
offset += 1; | |
continue; | |
}; | |
out.push((m.pattern().as_u32(), m.offset() as u32)); | |
offset += m.offset(); | |
} | |
out | |
} | |
fn bench_lex(c: &mut Criterion) { | |
let path = std::env::var("BENCH_FILE").expect("missing env var BENCH_FILE"); | |
let src = std::fs::read_to_string(path).expect("failed to read test file"); | |
// Their results are the same. | |
assert_eq!(lex_0_2(src.as_bytes()), lex_0_3(src.as_bytes())); | |
c.bench_function("lex_0_3", |b| { | |
b.iter(|| lex_0_3(src.as_bytes())); | |
}); | |
c.bench_function("lex_0_2", |b| { | |
b.iter(|| lex_0_2(src.as_bytes())); | |
}); | |
} | |
criterion::criterion_group!(benches, bench_lex); | |
criterion::criterion_main!(benches); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Platform: AMD Ryzen 7 5700G, x86_64-linux
Env var `BENCH_FILE` points to the downloaded file from https://raw.githubusercontent.com/NixOS/nixpkgs/f9e94676ce6c7531c44d38da61d2669ebec0f603/pkgs/top-level/all-packages.nix
It has SHA256 8d92a88a402b57099f18d14ece5b3bfcf0e1280dd702f6802526ff2cab2a0d4f
Command: `BENCH_FILE=/path/to/all-packages.nix cargo run --release -- --bench`
Without lto: (+211%)
lex_0_3 time: [16.133 ms 16.170 ms 16.199 ms]
lex_0_2 time: [5.1895 ms 5.2005 ms 5.2110 ms]
With lto="fat": (+77%)
lex_0_3 time: [7.6775 ms 7.6850 ms 7.6944 ms]
lex_0_2 time: [4.3312 ms 4.3333 ms 4.3357 ms]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment