Created
January 8, 2017 20:25
-
-
Save gsilvis/f2b85aba60d2b881289be3de2190f633 to your computer and use it in GitHub Desktop.
Regex parsing in rust
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::env; | |
use std::io; | |
use std::io::BufRead; | |
use std::ops::Deref; | |
#[derive(Debug)] | |
enum AST { | |
Alternation(Vec<AST>), | |
Concatenation(Vec<AST>), | |
Star(Box<AST>), | |
Plus(Box<AST>), | |
Question(Box<AST>), | |
Literal(char), | |
Dot, | |
} | |
#[derive(Debug)] | |
#[derive(Copy)] | |
#[derive(Clone)] | |
enum RegexNodeType { | |
Begin, | |
Success, | |
Literal(char), | |
Dot, | |
} | |
#[derive(Debug)] | |
struct RegexNode { | |
ty: RegexNodeType, | |
next: Vec<usize>, | |
} | |
fn main() { | |
let regex = env::args().nth(1).unwrap(); | |
let regex = parse_regex(&*regex); | |
let regex = compile_regex(®ex); | |
let stdin = io::stdin(); | |
for line in stdin.lock().lines() { | |
let line = line.unwrap(); | |
if run_regex(®ex, &line) { | |
println!("{}", &line); | |
} | |
} | |
} | |
// Run the regex | |
fn check_node(ty: RegexNodeType, | |
c: Option<char>) -> bool { | |
match (ty, c) { | |
(RegexNodeType::Begin, _) => false, | |
(RegexNodeType::Success, None) => true, | |
(RegexNodeType::Success, Some(_)) => false, | |
(RegexNodeType::Literal(_), None) => false, | |
(RegexNodeType::Literal(x), Some(y)) => x == y, | |
(RegexNodeType::Dot, None) => false, | |
(RegexNodeType::Dot, Some(_)) => true, | |
} | |
} | |
fn step_regex(len: usize, | |
regex: &Vec<RegexNode>, | |
state: &Vec<bool>, | |
c: Option<char>) -> Vec<bool> { | |
let mut res = vec![false; len]; | |
for i in 0..len { | |
if state[i] { | |
for j in regex[i].next.iter() { | |
if check_node(regex[*j].ty, c) { | |
res[*j] = true; | |
} | |
} | |
} | |
} | |
res | |
} | |
fn run_regex(regex: &Vec<RegexNode>, string: &str) -> bool { | |
let len = regex.len(); | |
let mut state = vec![false; len]; | |
state[0] = true; | |
let state = string.chars().fold(state, |state, c| step_regex(len, regex, &state, Some(c))); | |
let state = step_regex(len, regex, &state, None); | |
state[1] | |
} | |
// Compile the AST | |
struct SubgraphState { | |
prev: Vec<usize>, | |
next: Vec<usize>, | |
connect_through: bool, | |
} | |
fn compile_regex(ast: &AST) -> Vec<RegexNode> { | |
let mut result = vec![(RegexNode { ty: RegexNodeType::Begin, next: vec![] }), | |
(RegexNode { ty: RegexNodeType::Success, next: vec![] })]; | |
let res = compile_regex_helper(&mut result, ast); | |
for from in res.next.iter() { | |
result[*from].next.push(1); | |
} | |
for to in res.prev.iter() { | |
result[0].next.push(*to); | |
} | |
if res.connect_through { | |
result[0].next.push(1); | |
} | |
return result; | |
} | |
fn compile_regex_helper(result: &mut Vec<RegexNode>, ast: &AST) | |
-> SubgraphState { | |
match ast { | |
&AST::Alternation(ref vec) => { | |
let init = SubgraphState { prev: vec![], next: vec![], connect_through: false }; | |
vec.iter().fold(init, |mut l, ast| { | |
let r = compile_regex_helper(result, ast); | |
l.prev.extend(r.prev); | |
l.next.extend(r.next); | |
l.connect_through = l.connect_through || r.connect_through; | |
l | |
}) | |
} | |
&AST::Concatenation(ref vec) => { | |
let init = SubgraphState { prev: vec![], next: vec![], connect_through: true }; | |
vec.iter().fold(init, |l, ast| { | |
let r = compile_regex_helper(result, ast); | |
for from in l.next.iter() { | |
for to in r.prev.iter() { | |
result[*from].next.push(*to); | |
} | |
} | |
let mut res = SubgraphState { | |
prev: l.prev, | |
next: r.next, | |
connect_through: l.connect_through && r.connect_through, | |
}; | |
if l.connect_through { | |
res.prev.extend(r.prev); | |
} | |
if r.connect_through { | |
res.next.extend(l.next); | |
} | |
res | |
}) | |
} | |
&AST::Star(ref inner) => { | |
let mut res = compile_regex_helper(result, inner.deref()); | |
for from in res.next.iter() { | |
for to in res.prev.iter() { | |
result[*from].next.push(*to); | |
} | |
} | |
if res.connect_through { | |
res.next.extend(res.prev.clone()); | |
res.prev.extend(res.next.clone()); | |
} | |
res.connect_through = true; | |
res | |
} | |
&AST::Plus(ref inner) => { | |
let mut res = compile_regex_helper(result, inner.deref()); | |
for from in res.next.iter() { | |
for to in res.prev.iter() { | |
result[*from].next.push(*to); | |
} | |
} | |
if res.connect_through { | |
res.next.extend(res.prev.clone()); | |
res.prev.extend(res.next.clone()); | |
} | |
res | |
} | |
&AST::Question(ref inner) => { | |
let mut res = compile_regex_helper(result, inner.deref()); | |
res.connect_through = true; | |
res | |
} | |
&AST::Literal(c) => { | |
let i = result.len(); | |
result.push(RegexNode { | |
ty: RegexNodeType::Literal(c), | |
next: vec![], | |
}); | |
SubgraphState { | |
prev: vec![i], | |
next: vec![i], | |
connect_through: false | |
} | |
} | |
&AST::Dot => { | |
let i = result.len(); | |
result.push(RegexNode { | |
ty: RegexNodeType::Dot, | |
next: vec![], | |
}); | |
SubgraphState { | |
prev: vec![i], | |
next: vec![i], | |
connect_through: false | |
} | |
} | |
} | |
} | |
// Parse the regex | |
fn parse_regex(s: &str) -> AST { | |
let mut parens = Vec::<(Vec<AST>, Vec<AST>)>::new(); | |
let mut alters = Vec::<AST>::new(); | |
let mut concats = Vec::<AST>::new(); | |
let mut current = None; | |
for c in s.chars() { | |
let prev = match c { | |
'?' => Some(AST::Question(Box::new(current.expect("? in invalid position")))), | |
'*' => Some(AST::Star(Box::new(current.expect("* in invalid position")))), | |
'+' => Some(AST::Plus(Box::new(current.expect("+ in invalid position")))), | |
_ => current | |
}; | |
if let Some(s) = prev { | |
concats.push(s); | |
} | |
current = match c { | |
'(' => { | |
parens.push((alters, concats)); | |
alters = Vec::new(); | |
concats = Vec::new(); | |
None | |
} | |
')' => { | |
alters.push(AST::Concatenation(concats)); | |
let res = Some(AST::Alternation(alters)); | |
match parens.pop() { | |
Some((a, b)) => { | |
alters = a; | |
concats = b; | |
} | |
None => panic!("Extra )"), | |
} | |
res | |
} | |
'|' => { | |
alters.push(AST::Concatenation(concats)); | |
concats = Vec::<AST>::new(); | |
None | |
} | |
'?' => None, | |
'*' => None, | |
'+' => None, | |
'.' => Some(AST::Dot), | |
_ => Some(AST::Literal(c)), | |
} | |
} | |
if !parens.is_empty() { | |
panic!("Unterminated ("); | |
} | |
if let Some(c) = current { | |
concats.push(c); | |
} | |
alters.push(AST::Concatenation(concats)); | |
AST::Alternation(alters) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment