Created
October 20, 2020 04:34
-
-
Save thomcc/e24c14639cffc5eb00de60f790d75901 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::string::String; | |
use std::str::Chars; | |
use std::iter::{Enumerate, Peekable}; | |
// An XML parser that's decent enough if you don't care about parsing performance
// and you completely control the input.
/// A single XML element: tag name, attributes, child elements and text body.
#[derive(Clone, Debug)]
pub struct XmlNode {
    /// Element name.
    pub tag: String,
    /// Attribute `(name, value)` pairs; `set_attr` appends new names at the end.
    pub attributes: Vec<(String, String)>,
    /// Nested child elements.
    pub children: Vec<XmlNode>,
    /// Text content of the element.
    pub body: String,
}

impl XmlNode {
    /// Creates an element named `tag` with no attributes, no children and an empty body.
    pub fn new(tag: String) -> XmlNode {
        XmlNode {
            tag,
            attributes: Vec::new(),
            children: Vec::new(),
            body: String::new(),
        }
    }

    /// Returns the first child whose tag equals `child`, if any.
    pub fn child(&self, child: &str) -> Option<&XmlNode> {
        self.children.iter().find(|c| c.tag == child)
    }

    /// Returns the value of the first attribute named `attr`, if any.
    pub fn attr(&self, attr: &str) -> Option<&str> {
        self.attributes
            .iter()
            .find(|pair| pair.0 == attr)
            .map(|pair| pair.1.as_str())
    }

    /// Sets attribute `attr` to `value`.
    ///
    /// An existing attribute with the same name is overwritten in place;
    /// otherwise the pair is appended.
    pub fn set_attr(&mut self, attr: String, value: String) {
        match self.attributes.iter_mut().find(|a| a.0 == attr) {
            Some(existing) => existing.1 = value,
            None => self.attributes.push((attr, value)),
        }
    }

    /// Looks `name` up as an attribute first and, failing that, as a child
    /// element, in which case the child's body text is returned.
    pub fn child_or_attr(&self, name: &str) -> Option<&str> {
        // `as_str()` rather than `borrow()`: the `Borrow` trait is not in scope here.
        self.attr(name)
            .or_else(|| self.child(name).map(|v| v.body.as_str()))
    }
}
/// Characters treated as insignificant whitespace between tokens.
static WHITESPACE: &[char] = &[' ', '\t', '\n', '\r'];
/// Characters that each form a one-character token and terminate a bare word.
static DELIM: &[char] = &['<', '>', '!', '?', '=', '/'];
/// Returns `true` when `e` is equal to any character in `cs`.
#[inline]
fn is_one_of(e: char, cs: &[char]) -> bool {
    cs.contains(&e)
}
struct XmlParser<'a> { | |
s: Peekable<Enumerate<Chars<'a>>>, | |
line: usize, | |
} | |
impl<'a> XmlParser<'a> { | |
fn parse(&mut self) -> Result<XmlNode, String> { | |
let mut token = self.next_token(); | |
while token != "<" || is_one_of(self.s.peek().map_or(' ', |&v| v.1), &['!', '?']) { | |
token = self.next_token(); | |
} | |
let mut elem = XmlNode::new(self.next_token()); | |
while self.s.peek().is_some() && { token = self.next_token(); token != ">" && token != "/" } { | |
let attr = token; | |
token = self.next_token(); | |
if token != "=" { | |
return Err(format!("Expected '=' between attr and value in {} node on line {} (saw {}).", elem.tag, self.line, token)); | |
} | |
let value = self.next_token(); | |
elem.set_attr(attr, value); | |
} | |
if token == "/" { | |
token = self.next_token(); | |
if token != ">" { | |
return Err(format!("Expected '>' after '/' in {} node on line {}.", elem.tag, self.line)); | |
} | |
return Ok(elem); | |
} | |
if token != ">" { | |
return Err(format!("Expected '>' to close {} node on line {}.", elem.tag, self.line)); | |
} | |
self.eat_whitespace(); | |
let mut l = self.line; | |
let mut t = self.s.clone(); | |
token = self.next_token(); | |
while token != "<" || self.s.peek().map_or(' ', |v| v.1) != '/' { | |
if token == "<" { | |
self.s = t.clone(); | |
self.line = l; | |
elem.children.push(match self.parse() { | |
Ok(node) => node, | |
Err(s) => return Err(s) | |
}); | |
self.eat_whitespace(); | |
} else { | |
let end_idx = self.s.peek().map(|v| v.0).unwrap_or(0); | |
while let Some(&(i, _)) = t.peek() { | |
if i == end_idx { break; } | |
elem.body.push(t.next().unwrap().1); | |
} | |
while let Some(&(_, c)) = self.s.peek() { | |
if c == '<' { break; } | |
elem.body.push(self.s.next().unwrap().1); | |
} | |
if self.s.peek().is_none() { | |
return Err(format!("Unclosed {} element body on line {}", elem.tag, self.line)); | |
} | |
} | |
t = self.s.clone(); | |
l = self.line; | |
token = self.next_token(); | |
} | |
if token != "<" { | |
Err(format!("Unclosed {} element body on line {}", elem.tag, self.line)) | |
} else if { token = self.next_token(); token != "/" } { | |
Err(format!("Expected '/' in closing tag of {} on line {}", elem.tag, self.line)) | |
} else if { token = self.next_token(); token != elem.tag } { | |
Err(format!("Saw closing tag for {} on line {} when it should have been for {}", token, self.line, elem.tag)) | |
} else if { token = self.next_token(); token != ">" } { | |
Err(format!("Expected '>' in closing tag of {} on line {}", elem.tag, self.line)) | |
} else { | |
let s = elem.body.clone(); | |
elem.body = s.trim().to_string(); | |
Ok(elem) | |
} | |
} | |
fn eat_whitespace(&mut self) { | |
while let Some(&(_, c)) = self.s.peek() { | |
if !is_one_of(c, WHITESPACE) { break; } | |
if c == '\n' { self.line += 1; } | |
self.s.next().unwrap(); | |
} | |
} | |
fn next_token(&mut self) -> String { | |
let mut token = String::new(); | |
while let Some(&(_, c)) = self.s.peek() { | |
if !is_one_of(c, WHITESPACE) { break; } | |
if c == '\n' { self.line += 1; } | |
self.s.next().unwrap(); | |
} | |
let ch = match self.s.next() { Some((_, c)) => c, None => return token }; | |
if ch == '"' { | |
while let Some((_, c)) = self.s.next() { | |
if c == '"' { break; } | |
if c == '\n' { self.line += 1; } | |
token.push(c); | |
} | |
} else { | |
token.push(ch); | |
if !is_one_of(ch, DELIM) { | |
while let Some(&(_, c)) = self.s.peek() { | |
if is_one_of(c, DELIM) || is_one_of(c, WHITESPACE) { | |
break; | |
} | |
token.push(self.s.next().unwrap().1); | |
} | |
self.eat_whitespace(); | |
} | |
} | |
token | |
} | |
} | |
pub fn xml_parse(s: &str) -> Result<XmlNode, String> { | |
let mut parser = XmlParser { | |
line: 0, | |
s: s.chars().enumerate().peekable() | |
}; | |
parser.parse() | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment