Skip to content

Instantly share code, notes, and snippets.

@thomcc
Created October 20, 2020 04:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thomcc/e24c14639cffc5eb00de60f790d75901 to your computer and use it in GitHub Desktop.
Save thomcc/e24c14639cffc5eb00de60f790d75901 to your computer and use it in GitHub Desktop.
use std::string::String;
use std::str::Chars;
use std::iter::{Enumerate, Peekable};
// A minimal XML parser: good enough when parsing performance does not matter
// and you completely control the input.
/// A single element of a parsed XML document.
#[derive(Clone, Debug)]
pub struct XmlNode {
    /// Element name, e.g. `item` for `<item>`.
    pub tag: String,
    /// Attributes as `(name, value)` pairs, in the order they were first set.
    pub attributes: Vec<(String, String)>,
    /// Child elements.
    pub children: Vec<XmlNode>,
    /// Text content of the element.
    pub body: String,
}

impl XmlNode {
    /// Creates an element named `tag` with no attributes, no children and an
    /// empty body.
    pub fn new(tag: String) -> XmlNode {
        XmlNode {
            tag,
            attributes: Vec::new(),
            children: Vec::new(),
            body: String::new(),
        }
    }

    /// Returns the first direct child whose tag equals `child`, if any.
    pub fn child<'a>(&'a self, child: &str) -> Option<&'a XmlNode> {
        self.children.iter().find(|c| c.tag == child)
    }

    /// Returns the value of the attribute named `attr`, if present.
    pub fn attr<'a>(&'a self, attr: &str) -> Option<&'a str> {
        self.attributes
            .iter()
            .find(|pair| pair.0 == attr)
            .map(|pair| pair.1.as_str())
    }

    /// Sets attribute `attr` to `value`.
    ///
    /// An existing attribute with the same name is overwritten in place;
    /// otherwise the pair is appended.
    pub fn set_attr(&mut self, attr: String, value: String) {
        if let Some(existing) = self.attributes.iter_mut().find(|pair| pair.0 == attr) {
            existing.1 = value;
            return;
        }
        self.attributes.push((attr, value));
    }

    /// Looks up `name` as an attribute first and, failing that, as a direct
    /// child element, in which case the child's body text is returned.
    pub fn child_or_attr<'a>(&'a self, name: &str) -> Option<&'a str> {
        // `as_str` needs no trait import (the previous `borrow()` call required
        // `std::borrow::Borrow` in scope) and mirrors what `attr` does.
        self.attr(name)
            .or_else(|| self.child(name).map(|node| node.body.as_str()))
    }
}
/// Characters the tokenizer treats as whitespace and skips between tokens.
static WHITESPACE: &[char] = &[' ', '\t', '\n', '\r'];
/// Characters that end a word token and form a one-character token themselves.
static DELIM: &[char] = &['<', '>', '!', '?', '=', '/'];
/// Reports whether `e` is equal to any of the characters in `cs`.
#[inline]
fn is_one_of(e: char, cs: &[char]) -> bool {
    cs.contains(&e)
}
/// Token-based recursive-descent parser over the characters of one input string.
struct XmlParser<'a> {
    /// Remaining input as `(char index, char)` pairs with one item of lookahead.
    s: Peekable<Enumerate<Chars<'a>>>,
    /// Line counter used in error messages; bumped for every `'\n'` consumed.
    line: usize,
}

impl<'a> XmlParser<'a> {
    /// Parses the next element (with its attributes, children and body text)
    /// from the input.
    ///
    /// Anything in front of the element's opening `<` is discarded. That
    /// includes `<!...>` / `<?...>` markup, which is skipped simply by dropping
    /// tokens until a `<` that is not followed by `!` or `?`; it is not parsed
    /// structurally. The returned node's body is trimmed of surrounding
    /// whitespace.
    ///
    /// # Errors
    ///
    /// Returns a human-readable message when the input ends before an element
    /// starts, when an attribute is not written as `name = value`, when a tag
    /// is not terminated by `>`, when the element body is not closed, or when
    /// the closing tag names a different element.
    fn parse(&mut self) -> Result<XmlNode, String> {
        // Find the '<' that opens a real element.
        loop {
            let token = self.next_token();
            // `next_token` yields "" forever once the input is exhausted, so
            // without this check the search would never terminate.
            if token.is_empty() && self.s.peek().is_none() {
                return Err(format!(
                    "Expected an element but reached end of input on line {}.",
                    self.line
                ));
            }
            let upcoming = self.s.peek().map_or(' ', |&(_, c)| c);
            if token == "<" && !is_one_of(upcoming, &['!', '?']) {
                break;
            }
        }

        let mut elem = XmlNode::new(self.next_token());

        // Attributes: `name = value` triples until '>' or the '/' of "/>".
        loop {
            if self.s.peek().is_none() {
                return Err(format!("Expected '>' to close {} node on line {}.", elem.tag, self.line));
            }
            let token = self.next_token();
            if token == ">" {
                break;
            }
            if token == "/" {
                // Self-closing element: no body and no children.
                if self.next_token() != ">" {
                    return Err(format!("Expected '>' after '/' in {} node on line {}.", elem.tag, self.line));
                }
                return Ok(elem);
            }
            let separator = self.next_token();
            if separator != "=" {
                return Err(format!("Expected '=' between attr and value in {} node on line {} (saw {}).", elem.tag, self.line, separator));
            }
            let value = self.next_token();
            elem.set_attr(token, value);
        }

        self.eat_whitespace();
        // Snapshot of the stream and line counter taken just before `token`,
        // so the token can be "un-read" (child element) or re-read verbatim
        // (body text).
        let mut saved_line = self.line;
        let mut saved = self.s.clone();
        let mut token = self.next_token();
        // Continue until we are looking at "</", the start of the closing tag.
        while token != "<" || self.s.peek().map_or(' ', |&(_, c)| c) != '/' {
            if token == "<" {
                // A nested element: rewind to before its '<' and recurse.
                self.s = saved;
                self.line = saved_line;
                elem.children.push(self.parse()?);
                self.eat_whitespace();
            } else {
                // Body text. Tokenizing dropped quotes and trailing whitespace,
                // so copy the raw characters from the snapshot up to where the
                // live stream now stands...
                let end_idx = self.s.peek().map(|&(i, _)| i);
                while let Some(&(i, c)) = saved.peek() {
                    if Some(i) == end_idx {
                        break;
                    }
                    elem.body.push(c);
                    saved.next();
                }
                // ...then take the rest of the text run up to the next '<'.
                while let Some(&(_, c)) = self.s.peek() {
                    if c == '<' {
                        break;
                    }
                    // Keep the line counter accurate for multi-line text.
                    if c == '\n' {
                        self.line += 1;
                    }
                    elem.body.push(c);
                    self.s.next();
                }
                if self.s.peek().is_none() {
                    return Err(format!("Unclosed {} element body on line {}", elem.tag, self.line));
                }
            }
            saved = self.s.clone();
            saved_line = self.line;
            token = self.next_token();
        }

        // The loop only exits with `token == "<"` and '/' as the next char.
        if self.next_token() != "/" {
            return Err(format!("Expected '/' in closing tag of {} on line {}", elem.tag, self.line));
        }
        let closing = self.next_token();
        if closing != elem.tag {
            return Err(format!("Saw closing tag for {} on line {} when it should have been for {}", closing, self.line, elem.tag));
        }
        if self.next_token() != ">" {
            return Err(format!("Expected '>' in closing tag of {} on line {}", elem.tag, self.line));
        }
        elem.body = elem.body.trim().to_string();
        Ok(elem)
    }

    /// Consumes whitespace up to the next non-whitespace character (or end of
    /// input), counting newlines.
    fn eat_whitespace(&mut self) {
        while let Some(&(_, c)) = self.s.peek() {
            if !is_one_of(c, WHITESPACE) {
                break;
            }
            if c == '\n' {
                self.line += 1;
            }
            self.s.next();
        }
    }

    /// Reads the next token, skipping leading whitespace.
    ///
    /// A token is one of:
    /// * the contents of a double-quoted string (quotes stripped),
    /// * a single `DELIM` character,
    /// * a run of characters up to the next delimiter or whitespace; trailing
    ///   whitespace after such a run is consumed as well.
    ///
    /// Returns an empty string when the input is exhausted (and also for an
    /// empty quoted string).
    fn next_token(&mut self) -> String {
        let mut token = String::new();
        self.eat_whitespace();
        let ch = match self.s.next() {
            Some((_, c)) => c,
            None => return token,
        };
        if ch == '"' {
            // Quoted string: everything up to the closing quote (or end of input).
            for (_, c) in self.s.by_ref() {
                if c == '"' {
                    break;
                }
                if c == '\n' {
                    self.line += 1;
                }
                token.push(c);
            }
        } else {
            token.push(ch);
            if !is_one_of(ch, DELIM) {
                while let Some(&(_, c)) = self.s.peek() {
                    if is_one_of(c, DELIM) || is_one_of(c, WHITESPACE) {
                        break;
                    }
                    token.push(c);
                    self.s.next();
                }
                self.eat_whitespace();
            }
        }
        token
    }
}
/// Parses the first element found in `s` and returns it as an [`XmlNode`] tree.
///
/// Input after that element's end is not examined.
///
/// # Errors
///
/// Returns a message describing the problem, including the line it was
/// detected on, when the input is not well-formed as far as this parser
/// checks.
pub fn xml_parse(s: &str) -> Result<XmlNode, String> {
    let mut parser = XmlParser {
        s: s.chars().enumerate().peekable(),
        // Lines are numbered from 1 so reported positions match what editors
        // display; starting at 0 made every message one line too low.
        line: 1,
    };
    parser.parse()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment