Skip to content

Instantly share code, notes, and snippets.

@gsilvis
Created January 8, 2017 20:25
Show Gist options
  • Save gsilvis/f2b85aba60d2b881289be3de2190f633 to your computer and use it in GitHub Desktop.
Save gsilvis/f2b85aba60d2b881289be3de2190f633 to your computer and use it in GitHub Desktop.
Regex parsing in rust
use std::env;
use std::io;
use std::io::BufRead;
use std::ops::Deref;
#[derive(Debug)]
enum AST {
Alternation(Vec<AST>),
Concatenation(Vec<AST>),
Star(Box<AST>),
Plus(Box<AST>),
Question(Box<AST>),
Literal(char),
Dot,
}
#[derive(Debug)]
#[derive(Copy)]
#[derive(Clone)]
enum RegexNodeType {
Begin,
Success,
Literal(char),
Dot,
}
#[derive(Debug)]
struct RegexNode {
ty: RegexNodeType,
next: Vec<usize>,
}
fn main() {
let regex = env::args().nth(1).unwrap();
let regex = parse_regex(&*regex);
let regex = compile_regex(&regex);
let stdin = io::stdin();
for line in stdin.lock().lines() {
let line = line.unwrap();
if run_regex(&regex, &line) {
println!("{}", &line);
}
}
}
// Run the regex
fn check_node(ty: RegexNodeType,
c: Option<char>) -> bool {
match (ty, c) {
(RegexNodeType::Begin, _) => false,
(RegexNodeType::Success, None) => true,
(RegexNodeType::Success, Some(_)) => false,
(RegexNodeType::Literal(_), None) => false,
(RegexNodeType::Literal(x), Some(y)) => x == y,
(RegexNodeType::Dot, None) => false,
(RegexNodeType::Dot, Some(_)) => true,
}
}
fn step_regex(len: usize,
regex: &Vec<RegexNode>,
state: &Vec<bool>,
c: Option<char>) -> Vec<bool> {
let mut res = vec![false; len];
for i in 0..len {
if state[i] {
for j in regex[i].next.iter() {
if check_node(regex[*j].ty, c) {
res[*j] = true;
}
}
}
}
res
}
fn run_regex(regex: &Vec<RegexNode>, string: &str) -> bool {
let len = regex.len();
let mut state = vec![false; len];
state[0] = true;
let state = string.chars().fold(state, |state, c| step_regex(len, regex, &state, Some(c)));
let state = step_regex(len, regex, &state, None);
state[1]
}
// Compile the AST
struct SubgraphState {
prev: Vec<usize>,
next: Vec<usize>,
connect_through: bool,
}
fn compile_regex(ast: &AST) -> Vec<RegexNode> {
let mut result = vec![(RegexNode { ty: RegexNodeType::Begin, next: vec![] }),
(RegexNode { ty: RegexNodeType::Success, next: vec![] })];
let res = compile_regex_helper(&mut result, ast);
for from in res.next.iter() {
result[*from].next.push(1);
}
for to in res.prev.iter() {
result[0].next.push(*to);
}
if res.connect_through {
result[0].next.push(1);
}
return result;
}
fn compile_regex_helper(result: &mut Vec<RegexNode>, ast: &AST)
-> SubgraphState {
match ast {
&AST::Alternation(ref vec) => {
let init = SubgraphState { prev: vec![], next: vec![], connect_through: false };
vec.iter().fold(init, |mut l, ast| {
let r = compile_regex_helper(result, ast);
l.prev.extend(r.prev);
l.next.extend(r.next);
l.connect_through = l.connect_through || r.connect_through;
l
})
}
&AST::Concatenation(ref vec) => {
let init = SubgraphState { prev: vec![], next: vec![], connect_through: true };
vec.iter().fold(init, |l, ast| {
let r = compile_regex_helper(result, ast);
for from in l.next.iter() {
for to in r.prev.iter() {
result[*from].next.push(*to);
}
}
let mut res = SubgraphState {
prev: l.prev,
next: r.next,
connect_through: l.connect_through && r.connect_through,
};
if l.connect_through {
res.prev.extend(r.prev);
}
if r.connect_through {
res.next.extend(l.next);
}
res
})
}
&AST::Star(ref inner) => {
let mut res = compile_regex_helper(result, inner.deref());
for from in res.next.iter() {
for to in res.prev.iter() {
result[*from].next.push(*to);
}
}
if res.connect_through {
res.next.extend(res.prev.clone());
res.prev.extend(res.next.clone());
}
res.connect_through = true;
res
}
&AST::Plus(ref inner) => {
let mut res = compile_regex_helper(result, inner.deref());
for from in res.next.iter() {
for to in res.prev.iter() {
result[*from].next.push(*to);
}
}
if res.connect_through {
res.next.extend(res.prev.clone());
res.prev.extend(res.next.clone());
}
res
}
&AST::Question(ref inner) => {
let mut res = compile_regex_helper(result, inner.deref());
res.connect_through = true;
res
}
&AST::Literal(c) => {
let i = result.len();
result.push(RegexNode {
ty: RegexNodeType::Literal(c),
next: vec![],
});
SubgraphState {
prev: vec![i],
next: vec![i],
connect_through: false
}
}
&AST::Dot => {
let i = result.len();
result.push(RegexNode {
ty: RegexNodeType::Dot,
next: vec![],
});
SubgraphState {
prev: vec![i],
next: vec![i],
connect_through: false
}
}
}
}
// Parse the regex
fn parse_regex(s: &str) -> AST {
let mut parens = Vec::<(Vec<AST>, Vec<AST>)>::new();
let mut alters = Vec::<AST>::new();
let mut concats = Vec::<AST>::new();
let mut current = None;
for c in s.chars() {
let prev = match c {
'?' => Some(AST::Question(Box::new(current.expect("? in invalid position")))),
'*' => Some(AST::Star(Box::new(current.expect("* in invalid position")))),
'+' => Some(AST::Plus(Box::new(current.expect("+ in invalid position")))),
_ => current
};
if let Some(s) = prev {
concats.push(s);
}
current = match c {
'(' => {
parens.push((alters, concats));
alters = Vec::new();
concats = Vec::new();
None
}
')' => {
alters.push(AST::Concatenation(concats));
let res = Some(AST::Alternation(alters));
match parens.pop() {
Some((a, b)) => {
alters = a;
concats = b;
}
None => panic!("Extra )"),
}
res
}
'|' => {
alters.push(AST::Concatenation(concats));
concats = Vec::<AST>::new();
None
}
'?' => None,
'*' => None,
'+' => None,
'.' => Some(AST::Dot),
_ => Some(AST::Literal(c)),
}
}
if !parens.is_empty() {
panic!("Unterminated (");
}
if let Some(c) = current {
concats.push(c);
}
alters.push(AST::Concatenation(concats));
AST::Alternation(alters)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment