Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Ahead-of-time compiled regular expressions in Rust
name dynamic regexes native regexes
--------------------------------------------------------------------------------------------------------
literal 434 ns/iter (+/- 3) 134 ns/iter (+/- 2)
not_literal 1930 ns/iter (+/- 9) 1051 ns/iter (+/- 6)
match_class 2457 ns/iter (+/- 12) 1391 ns/iter (+/- 7)
match_class_in_range 2707 ns/iter (+/- 12) 1433 ns/iter (+/- 3)
replace_all 3735 ns/iter (+/- 34) 1271 ns/iter (+/- 14)
anchored_short_non_match 900 ns/iter (+/- 6) 454 ns/iter (+/- 2)
anchored_long_non_match 8155 ns/iter (+/- 634) 5410 ns/iter (+/- 64)
anchored_short_match 472 ns/iter (+/- 16) 149 ns/iter (+/- 1)
anchored_long_match 487 ns/iter (+/- 11) 154 ns/iter (+/- 2)
one_pass_short_a 2108 ns/iter (+/- 11) 1329 ns/iter (+/- 16)
one_pass_short_a_not 2577 ns/iter (+/- 81) 1635 ns/iter (+/- 52)
one_pass_short_b 1509 ns/iter (+/- 6) 913 ns/iter (+/- 2)
one_pass_short_b_not 1997 ns/iter (+/- 27) 1141 ns/iter (+/- 2)
one_pass_long_prefix 1133 ns/iter (+/- 7) 395 ns/iter (+/- 8)
one_pass_long_prefix_not 1145 ns/iter (+/- 4) 402 ns/iter (+/- 3)
easy0_32 5199 ns/iter (+/- 111) = 6 MB/s 262 ns/iter (+/- 1) = 122 MB/s
easy0_1K 7533 ns/iter (+/- 87) = 135 MB/s 1281 ns/iter (+/- 159) = 799 MB/s
easy0_32K 86868 ns/iter (+/- 783) = 377 MB/s 33968 ns/iter (+/- 2241) = 964 MB/s
easy1_32 5714 ns/iter (+/- 174) = 5 MB/s 372 ns/iter (+/- 79) = 86 MB/s
easy1_1K 9125 ns/iter (+/- 988) = 112 MB/s 1890 ns/iter (+/- 622) = 541 MB/s
easy1_32K 126966 ns/iter (+/- 4464) = 258 MB/s 51002 ns/iter (+/- 3982) = 642 MB/s
medium_32 6196 ns/iter (+/- 124) = 5 MB/s 809 ns/iter (+/- 38) = 39 MB/s
medium_1K 37968 ns/iter (+/- 637) = 26 MB/s 17165 ns/iter (+/- 210) = 59 MB/s
medium_32K 1053551 ns/iter (+/- 5442) = 31 MB/s 541218 ns/iter (+/- 4247) = 60 MB/s
hard_32 7283 ns/iter (+/- 43) = 4 MB/s 1342 ns/iter (+/- 16) = 23 MB/s
hard_1K 60998 ns/iter (+/- 423) = 16 MB/s 33377 ns/iter (+/- 240) = 30 MB/s
hard_32K 1779833 ns/iter (+/- 5978) = 18 MB/s 1061047 ns/iter (+/- 9319) = 30 MB/s
#![feature(phase)]
#![feature(phase)]
#![no_std]
#![feature(globs)]
#[phase(plugin, link)]
extern crate std = "std#0.11.0-pre";
extern crate native = "native#0.11.0-pre";
extern crate regex;
#[phase(plugin)]
extern crate regex_macros;
use std::prelude::*;
fn main() {
let _ =
{
#[allow(dead_code)]
static CAP_NAMES: &'static [Option<&'static str>] = &[];
#[allow(dead_code)]
fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
start: uint, end: uint) -> Vec<Option<uint>> {
#![allow(unused_imports)]
#![allow(unused_mut)]
use regex::native::{MatchKind, Exists, Location, Submatches,
StepState, StepMatchEarlyReturn,
StepMatch, StepContinue, CharReader,
find_prefix};
return Nfa{which: which,
input: input,
ic: 0,
chars: CharReader::new(input),}.run(start, end);
type Captures = [Option<uint>, ..2u];
struct Nfa<'t> {
which: MatchKind,
input: &'t str,
ic: uint,
chars: CharReader<'t>,
}
impl <'t> Nfa<'t> {
#[allow(unused_variable)]
fn run(&mut self, start: uint, end: uint) ->
Vec<Option<uint>> {
let mut matched = false;
let prefix_bytes: &[u8] = &[104, 105, 112, 112, 111];
let mut clist = &mut Threads::new(self.which);
let mut nlist = &mut Threads::new(self.which);
let mut groups =
[::std::option::None, ::std::option::None];
self.ic = start;
let mut next_ic = self.chars.set(start);
while self.ic <= end {
if clist.size == 0 {
if matched { break }
if clist.size == 0 {
let haystack =
self.input.as_bytes().slice_from(self.ic);
match find_prefix(prefix_bytes, haystack)
{
None => break ,
Some(i) => {
self.ic += i;
next_ic = self.chars.set(self.ic);
}
}
}
}
if clist.size == 0 || (!false && !matched) {
self.add(clist, 0, &mut groups)
}
self.ic = next_ic;
next_ic = self.chars.advance();
match &mut range(0, clist.size) {
__i =>
loop {
match __i.next() {
None => break ,
Some(mut __value) => {
let i = __value;
{
let pc = clist.pc(i);
let step_state =
self.step(&mut groups,
nlist,
clist.groups(i),
pc);
match step_state {
StepMatchEarlyReturn =>
return {
let mut _temp =
::std::vec::Vec::new();
_temp.push(Some(0u));
_temp.push(Some(0u));
_temp
},
StepMatch => {
matched = true;
break
},
StepContinue => { }
}
}
}
}
}
}
::std::mem::swap(&mut clist, &mut nlist);
nlist.empty();
}
match self.which {
Exists if matched => {
let mut _temp = ::std::vec::Vec::new();
_temp.push(Some(0u));
_temp.push(Some(0u));
_temp
},
Exists => {
let mut _temp = ::std::vec::Vec::new();
_temp.push(None);
_temp.push(None);
_temp
},
Location | Submatches =>
groups.iter().map(|x| *x).collect()
}
}
#[allow(unused_variable)]
#[inline]
fn step(&self, groups: &mut Captures, nlist: &mut Threads,
caps: &mut Captures, pc: uint) -> StepState {
match pc {
0u => { },
1u => {
if self.chars.prev == Some('h') {
self.add(nlist, 2u, caps);
}
},
2u => {
if self.chars.prev == Some('i') {
self.add(nlist, 3u, caps);
}
},
3u => {
if self.chars.prev == Some('p') {
self.add(nlist, 4u, caps);
}
},
4u => {
if self.chars.prev == Some('p') {
self.add(nlist, 5u, caps);
}
},
5u => {
if self.chars.prev == Some('o') {
self.add(nlist, 6u, caps);
}
},
6u => {
if self.chars.prev.is_some() {
let c = self.chars.prev.unwrap();
let found =
match c {
'0' ..'9' => true,
'a' ..'f' => true,
'x' ..'z' => true,
_ => false
};
if found { self.add(nlist, 7u, caps); }
}
},
7u => { },
8u => { },
9u => {
match self.which {
Exists => { return StepMatchEarlyReturn },
Location => {
groups[0] = caps[0];
groups[1] = caps[1];
return StepMatch
},
Submatches => {
match &mut groups.mut_iter().zip(caps.iter())
{
__i =>
loop {
match __i.next() {
None => break ,
Some(mut __value) => {
let (slot, val) =
__value;
{ *slot = *val; }
}
}
}
}
return StepMatch
}
}
},
_ => { }
}
StepContinue
}
fn add(&self, nlist: &mut Threads, pc: uint,
groups: &mut Captures) {
if nlist.contains(pc) { return }
match pc {
0u => {
nlist.add_empty(0u);
match self.which {
Submatches | Location => {
let old = groups[0u];
groups[0u] = Some(self.ic);
self.add(nlist, 1u, &mut *groups);
groups[0u] = old;
},
Exists => {
self.add(nlist, 1u, &mut *groups);
}
}
},
1u => nlist.add(1u, &*groups),
2u => nlist.add(2u, &*groups),
3u => nlist.add(3u, &*groups),
4u => nlist.add(4u, &*groups),
5u => nlist.add(5u, &*groups),
6u => nlist.add(6u, &*groups),
7u => {
nlist.add_empty(7u);
self.add(nlist, 6u, &mut *groups);
self.add(nlist, 8u, &mut *groups);
},
8u => {
nlist.add_empty(8u);
match self.which {
Submatches | Location => {
let old = groups[1u];
groups[1u] = Some(self.ic);
self.add(nlist, 9u, &mut *groups);
groups[1u] = old;
},
Exists => {
self.add(nlist, 9u, &mut *groups);
}
}
},
9u => nlist.add(9u, &*groups),
_ => { }
}
}
}
struct Thread {
pc: uint,
groups: Captures,
}
struct Threads {
which: MatchKind,
queue: [Thread, ..10u],
sparse: [uint, ..10u],
size: uint,
}
impl Threads {
fn new(which: MatchKind) -> Threads {
Threads{which: which,
queue: unsafe { ::std::mem::uninitialized() },
sparse:
unsafe { ::std::mem::uninitialized() },
size: 0,}
}
#[inline]
fn add(&mut self, pc: uint, groups: &Captures) {
let t = &mut self.queue[self.size];
t.pc = pc;
match self.which {
Exists => { },
Location => {
t.groups[0] = groups[0];
t.groups[1] = groups[1];
},
Submatches => {
match &mut t.groups.mut_iter().zip(groups.iter())
{
__i =>
loop {
match __i.next() {
None => break ,
Some(mut __value) => {
let (slot, val) = __value;
{ *slot = *val; }
}
}
}
}
}
}
self.sparse[pc] = self.size;
self.size += 1;
}
#[inline]
fn add_empty(&mut self, pc: uint) {
self.queue[self.size].pc = pc;
self.sparse[pc] = self.size;
self.size += 1;
}
#[inline]
fn contains(&self, pc: uint) -> bool {
let s = self.sparse[pc];
s < self.size && self.queue[s].pc == pc
}
#[inline]
fn empty(&mut self) { self.size = 0; }
#[inline]
fn pc(&self, i: uint) -> uint { self.queue[i].pc }
#[inline]
fn groups<'r>(&'r mut self, i: uint) -> &'r mut Captures {
&mut self.queue[i].groups
}
}
}
::regex::native::Native(::regex::native::Native{original:
"hippo[a-fx-z0-9]+",
names: CAP_NAMES,
prog: exec,})
};
}
// Currently, the `phase` feature must be enabled in order to import a crate
// that defines a syntax extension.
#![feature(phase)]
// The `phase` attribute is used here to indicate that the `factorial` crate
// provides a syntax extension.
// It's also possible for `factorial` to provide things other than a syntax
// extension, in which case, `#[phase(plugin, link)]` is required.
#[phase(plugin)] extern crate factorial;
fn main() {
println!("{}", factorial!());
// Output:
// 120
}
#![feature(phase)]
#[phase(plugin)] extern crate factorial_arg;
fn main() {
println!("{}", factorial!(10u));
// Output:
// 3628800
}
#![feature(phase)]
#[phase(plugin)] extern crate factorial_no_quote;
fn main() {
println!("{}", factorial!());
// Output:
// 120
}
#![feature(phase)]
extern crate regex;
#[phase(plugin)] extern crate regex_macros;
fn main() {
let re = regex!(r"\d{4}-\d{2}-\d{2}");
println!("{}", re.find("Today's date is 2014-04-21."));
// Output:
// Some((16, 26))
}
extern crate regex;
use regex::Regex;
fn main() {
let re = match Regex::new(r"\d{4}-\d{2}-\d{2}") {
Ok(re) => re,
Err(err) => fail!("{}", err),
};
println!("{}", re.find("Today's date is 2014-04-21."));
// Output:
// Some((16, 26))
}
#![crate_type = "dylib"]
#![feature(plugin_registrar, managed_boxes, quote)]
extern crate rustc;
extern crate syntax;
use syntax::ast;
use syntax::codemap;
use syntax::ext::base::{ExtCtxt, MacResult, MacExpr};
use rustc::plugin::Registry;
#[plugin_registrar]
pub fn plugin_registrar(reg: &mut Registry) {
reg.register_macro("factorial", expand)
}
fn expand(cx: &mut ExtCtxt, _: codemap::Span, _: &[ast::TokenTree]) -> Box<MacResult> {
let answer = factorial(5 as u64);
MacExpr::new(quote_expr!(cx, $answer))
}
fn factorial(n: u64) -> u64 {
// Brings the 'product' method from the MultiplicativeIterator trait
// into scope.
use std::iter::MultiplicativeIterator;
range(2, n+1).product()
}
#![crate_type = "dylib"]
#![feature(plugin_registrar, quote)]
extern crate rustc;
extern crate syntax;
use syntax::ast;
use syntax::codemap;
use syntax::ext::base::{ExtCtxt, MacResult, MacExpr, DummyResult};
use syntax::parse;
use syntax::parse::token;
use rustc::plugin::Registry;
#[plugin_registrar]
pub fn plugin_registrar(reg: &mut Registry) {
reg.register_macro("factorial", expand)
}
fn expand(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree]) -> Box<MacResult> {
let n = match parse(cx, tts) {
Some(n) => n,
None => return DummyResult::expr(sp),
};
let answer = factorial(n);
MacExpr::new(quote_expr!(cx, $answer))
}
fn factorial(n: u64) -> u64 {
// Brings the 'product' method from the MultiplicativeIterator trait
// into scope.
use std::iter::MultiplicativeIterator;
range(2, n+1).product()
}
fn parse(cx: &mut ExtCtxt, tts: &[ast::TokenTree]) -> Option<u64> {
use syntax::print::pprust;
let mut parser = parse::new_parser_from_tts(cx.parse_sess(), cx.cfg(),
Vec::from_slice(tts));
// The `expand_expr` method is called so that any macro calls in the
// parsed expression are expanded. This for example allows us to write
// `factorial!(some_other_macro!(10u))`.
let arg = cx.expand_expr(parser.parse_expr());
match arg.node {
ast::ExprLit(spanned) => {
match spanned.node {
ast::LitUint(n, _) => {
if !parser.eat(&token::EOF) {
cx.span_err(parser.span,
"expected only one integer literal");
return None
}
return Some(n)
}
_ => {}
}
}
_ => {}
}
let err = format!("expected unsigned integer literal but got `{}`",
pprust::expr_to_str(arg));
cx.span_err(parser.span, err.as_slice());
None
}
// This attribute specifies that the Rust compiler will output a dynamic
// library when this file is compiled.
// Generally, many Rust libraries will *also* have a `#![crate_type = "rlib"]`
// attribute set, which means the Rust compiler will produce a static library.
// However, libraries which provide syntax extensions must be dynamically
// linked with `libsyntax`, so we elide the `rlib` and only produce a dynamic
// library.
#![crate_type = "dylib"]
// Enable the `plugin_registrar` feature (which is the compiler hook).
#![feature(plugin_registrar)]
extern crate rustc;
extern crate syntax;
use std::gc::{Gc, GC};
use syntax::ast;
use syntax::codemap;
use syntax::ext::base::{ExtCtxt, MacResult, MacExpr};
use rustc::plugin::Registry;
#[plugin_registrar]
pub fn plugin_registrar(reg: &mut Registry) {
reg.register_macro("factorial", expand)
}
fn expand(_: &mut ExtCtxt, sp: codemap::Span, _: &[ast::TokenTree]) -> Box<MacResult> {
let answer = factorial(5 as u64);
MacExpr::new(uint_literal(sp, answer))
}
fn factorial(n: u64) -> u64 {
// Brings the 'product' method from the MultiplicativeIterator trait
// into scope.
use std::iter::MultiplicativeIterator;
range(2, n+1).product()
}
fn uint_literal(sp: codemap::Span, n: u64) -> Gc<ast::Expr> {
let lit = ast::LitUint(n as u64, ast::TyU64);
let spanned = box(GC) codemap::respan(sp, lit);
dummy_expr(sp, ast::ExprLit(spanned))
}
fn dummy_expr(sp: codemap::Span, e: ast::Expr_) -> Gc<ast::Expr> {
box(GC) ast::Expr {
id: ast::DUMMY_NODE_ID,
node: e,
span: sp,
}
}
#![allow(dead_code)] fn main() {} struct Captures;
struct Regex {
prog: MaybeNative,
}
enum MaybeNative {
Dynamic(Vec<Inst>),
Native(fn(&str) -> Captures),
}
enum Inst { Match, OneChar, /* ... */ }
#![allow(dead_code)] fn main() {}
struct Regex {
insts: Vec<Inst>,
// (N.B. The real representation has a few more things, like the names of
// capture groups or pre-computed data for optimizations, but they are
// elided here to keep things simple.)
}
enum Inst { Match, OneChar, /* ... */ }
#![allow(dead_code)] fn main() {} struct Captures;
struct Regex {
prog: MaybeNative,
}
enum MaybeNative {
Dynamic(Vec<Inst>),
Native(fn(&str) -> Captures),
}
enum Inst { Match, OneChar, /* ... */ }
fn run_vm_dynamic(_: &[Inst], _: &str) -> Captures { Captures }
fn run_vm(re: &Regex, search: &str) -> Captures {
match re.prog {
Dynamic(ref insts) => run_vm_dynamic(insts.as_slice(), search),
Native(run_vm_native) => run_vm_native(search),
}
}
#![feature(macro_rules)]
macro_rules! try(
($e:expr) => (match $e { Ok(e) => e, Err(e) => return Err(e) })
)
fn test_try() -> Result<uint, &str> {
let num = try!(Err("error!"));
Ok(num) // never reached
}
fn main() {
println!("{}", test_try());
// Output:
// Err(error!)
}
#![feature(phase)]
extern crate regex;
#[phase(plugin)] extern crate regex_macros;
fn main() {
let re = regex!(r"(.*");
println!("{}", re.find("Today's date is 2014-04-21."));
}
#![feature(phase)]
extern crate regex;
#[phase(plugin)]
extern crate regex_macros;
fn main() {
let _ = regex!(r"hippo[a-fx-z0-9]+");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment