Created
September 12, 2024 22:42
Parser Monad Combinators in rust for a json parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Parser Monad Combinators | |
/// https://www.cs.nott.ac.uk/~pszgmh/monparsing.pdf | |
//ploki@blackmilk.fr | |
use std::env; | |
use std::fs; | |
use std::ops::{Add, Deref, Shr}; | |
use std::sync::Arc; | |
#[derive(Clone)] | |
struct Interpretation<A>(A, String); | |
#[derive(Clone)] | |
struct VecInterpretation<A> { | |
result: Vec<Interpretation<A>>, | |
} | |
#[derive(Clone)] | |
struct Parser<'a, A> | |
where | |
A: Clone + 'a, | |
{ | |
parse: Arc<dyn Fn(String) -> VecInterpretation<A> + 'a>, | |
} | |
struct TParser<'a, A, B> | |
where | |
A: Clone + 'a, | |
B: Clone + 'a, | |
{ | |
t_parse: Arc<dyn Fn(A) -> Parser<'a, B> + 'a>, | |
} | |
impl<A> Add for VecInterpretation<A> | |
where | |
A: Clone, | |
{ | |
type Output = VecInterpretation<A>; | |
fn add(self, other: VecInterpretation<A>) -> VecInterpretation<A> { | |
let mut result = self.result.clone(); | |
result.extend(other.result.clone()); | |
VecInterpretation { result } | |
} | |
} | |
impl<'a, A> Add for Parser<'a, A> | |
where | |
A: Clone + 'a, | |
{ | |
type Output = Parser<'a, A>; | |
fn add(self, other: Parser<'a, A>) -> Parser<'a, A> { | |
Parser::new(move |input: String| self(input.clone()) + other(input)) | |
} | |
} | |
impl<'a, A> Parser<'a, A> | |
where | |
A: Clone + 'a, | |
{ | |
fn new<F>(f: F) -> Self | |
where | |
F: Fn(String) -> VecInterpretation<A> + 'a, | |
{ | |
Parser { parse: Arc::new(f) } | |
} | |
} | |
// Parser<A> elements are callable with this | |
impl<'a, A> Deref for Parser<'a, A> | |
where | |
A: Clone + 'a, | |
{ | |
type Target = dyn Fn(String) -> VecInterpretation<A> + 'a; | |
fn deref(&self) -> &Self::Target { | |
&*self.parse | |
} | |
} | |
impl<'a, A, B> TParser<'a, A, B> | |
where | |
A: Clone + 'a, | |
B: Clone + 'a, | |
{ | |
fn new<F>(f: F) -> Self | |
where | |
F: Fn(A) -> Parser<'a, B> + 'a, | |
{ | |
TParser { | |
t_parse: Arc::new(f), | |
} | |
} | |
} | |
// Make TParsers callable | |
impl<'a, A, B> Deref for TParser<'a, A, B> | |
where | |
A: Clone + 'a, | |
B: Clone + 'a, | |
{ | |
type Target = dyn Fn(A) -> Parser<'a, B> + 'a; | |
fn deref(&self) -> &Self::Target { | |
&*self.t_parse | |
} | |
} | |
// monadic bind. then(parser, tparser) | |
fn then<'a, A, B>(p: Parser<'a, A>, rhs: TParser<'a, A, B>) -> Parser<'a, B> | |
where | |
A: Clone + 'a, | |
B: Clone + 'a, | |
{ | |
Parser::new(move |input: String| { | |
let mut res = VecInterpretation { result: vec![] }; | |
for Interpretation(v, unconsumed) in p(input).result { | |
res = res + rhs(v)(unconsumed); | |
} | |
res | |
}) | |
} | |
// syntactic sugar for monadic bind | |
// parser >> tparser | |
impl<'a, A, B, F> Shr<F> for Parser<'a, A> | |
where | |
A: Clone + 'a, | |
B: Clone + 'a, | |
F: Fn(A) -> Parser<'a, B> + 'a, | |
{ | |
type Output = Parser<'a, B>; | |
fn shr(self, f: F) -> Parser<'a, B> { | |
then(self, TParser::new(f)) | |
} | |
} | |
// pure function of the monad | |
fn success<'a, A>(a: A) -> Parser<'a, A> | |
where | |
A: Clone + 'a, | |
{ | |
Parser::new(move |input: String| VecInterpretation { | |
result: vec![Interpretation(a.clone(), input)], | |
}) | |
} | |
// discards input and gives nothing | |
fn zero<'a, A>() -> Parser<'a, A> | |
where | |
A: Clone + 'a, | |
{ | |
Parser::new(|_| VecInterpretation { result: vec![] }) | |
} | |
// get an item from the input | |
fn item<'a>() -> Parser<'a, char> { | |
Parser::new(|input: String| { | |
if let Some(c) = input.chars().next() { | |
VecInterpretation { | |
result: vec![Interpretation(c, input.chars().skip(1).collect())], | |
} | |
} else { | |
zero()(input) | |
} | |
}) | |
} | |
fn satisfies<'a, P>(p: P) -> Parser<'a, char> | |
where | |
P: Fn(char) -> bool + Clone + 'a, | |
{ | |
item() >> move |x| if p(x) { success(x) } else { zero() } | |
} | |
fn character<'a>(x: char) -> Parser<'a, char> { | |
satisfies(move |c| c == x) | |
} | |
fn digit<'a>() -> Parser<'a, char> { | |
satisfies(|c| c.is_ascii_digit()) | |
} | |
#[allow(dead_code)] | |
fn lowercase<'a>() -> Parser<'a, char> { | |
satisfies(|c| c.is_lowercase()) | |
} | |
#[allow(dead_code)] | |
fn uppercase<'a>() -> Parser<'a, char> { | |
satisfies(|c| c.is_uppercase()) | |
} | |
#[allow(dead_code)] | |
fn alphanumeric<'a>() -> Parser<'a, char> { | |
satisfies(|c| c.is_alphanumeric()) | |
} | |
#[allow(dead_code)] | |
fn letter<'a>() -> Parser<'a, char> { | |
lowercase() + uppercase() | |
} | |
fn blank<'a>() -> Parser<'a, ()> { | |
many(satisfies(|c| c.is_whitespace())) >> |_| success(()) | |
} | |
fn many<'a, A>(p: Parser<'a, A>) -> Parser<'a, Vec<A>> | |
where | |
A: Clone + 'a, | |
{ | |
many1(p) + success(vec![]) | |
} | |
fn many1<'a, A>(p: Parser<'a, A>) -> Parser<'a, Vec<A>> | |
where | |
A: Clone + 'a, | |
{ | |
p.clone() >> move |x: A| | |
many(p.clone()) >> move |xs: Vec<A>| { | |
let mut xxs: Vec<A> = vec![x.clone()]; | |
xxs.extend(xs.clone()); | |
success(xxs) | |
} | |
} | |
fn many_string(p: Parser<char>) -> Parser<String> { | |
many1_string(p) + success("".to_string()) | |
} | |
fn many1_string(p: Parser<char>) -> Parser<String> { | |
p.clone() >> move |x: char| | |
many(p.clone()) >> move |xs: Vec<char>| { | |
let mut s: String = xs.into_iter().collect(); | |
s.insert(0, x); | |
success(s) | |
} | |
} | |
fn sequence1<'a, A, B>(p: Parser<'a, A>, sep: Parser<'a, B>) -> Parser<'a, Vec<A>> | |
where | |
A: Clone + 'a, | |
B: Clone + 'a, | |
{ | |
p.clone() >> move |x: A| | |
many(sep.clone() >> { | |
let p = p.clone(); | |
move |_| p.clone() >> move |y: A| success(y) | |
}) >> move |xs: Vec<A>| { | |
let mut xxs: Vec<A> = vec![x.clone()]; | |
xxs.extend(xs.clone()); | |
success(xxs) | |
} | |
} | |
fn sequence<'a, A, B>(p: Parser<'a, A>, sep: Parser<'a, B>) -> Parser<'a, Vec<A>> | |
where | |
A: Clone + 'a, | |
B: Clone + 'a, | |
{ | |
sequence1(p, sep) + success(vec![]) | |
} | |
fn block<'a, Opening, Content, Closing>( | |
open: Parser<'a, Opening>, | |
p: Parser<'a, Content>, | |
close: Parser<'a, Closing>, | |
) -> Parser<'a, Content> | |
where | |
Opening: Clone + 'a, | |
Content: Clone + 'a, | |
Closing: Clone + 'a, | |
{ | |
open >> move |_| { | |
let close = close.clone(); | |
p.clone() >> move |x: Content| close.clone() >> move |_| success(x.clone()) | |
} | |
} | |
#[allow(dead_code)] | |
fn natural<'a>() -> Parser<'a, i64> { | |
many1_string(digit()) | |
>> move |xs: String| { | |
let n: i64 = xs.parse().unwrap(); | |
success(n) | |
} | |
} | |
#[allow(dead_code)] | |
fn integer<'a>() -> Parser<'a, i64> { | |
natural() + (character('-') >> move |_| natural() >> move |y: i64| success(-y)) | |
} | |
fn word<'a>(s: String) -> Parser<'a, ()> { | |
Parser::new( | |
move |input: String| | |
if input.starts_with(&s.clone()) { | |
VecInterpretation { | |
result: vec![Interpretation((), input[s.len()..].to_string())], | |
} | |
} else { | |
zero()(input) | |
} | |
) | |
} | |
// JSON Value Types | |
#[derive(Debug, Clone)] | |
#[allow(dead_code)] | |
enum JsonValue { | |
String(String), | |
Number(f64), | |
Object(Vec<(String, JsonValue)>), | |
Array(Vec<JsonValue>), | |
Bool(bool), | |
Null, | |
} | |
// Parsers of JSON Value Types | |
fn json_null<'a>() -> Parser<'a, JsonValue> { | |
word("null".to_string()) >> |_| success(JsonValue::Null) | |
} | |
fn json_bool<'a>() -> Parser<'a, JsonValue> { | |
(word("false".to_string()) >> |_| success(JsonValue::Bool(false))) | |
+ (word("true".to_string()) >> |_| success(JsonValue::Bool(true))) | |
} | |
fn json_number<'a>() -> Parser<'a, JsonValue> { | |
(character('-') + success('+')) >> move |minus_sign: char| | |
many1_string(digit()) >> move |int_part: String| { | |
let integer_part = |minus_sign, int_part| { | |
if minus_sign == '-' { | |
format!("-{}", int_part) | |
} else { | |
int_part | |
} | |
}; | |
((character('.') >> move |_| | |
(many1_string(digit()) >> move |frac_part: String| | |
success(format!(".{}", frac_part)) | |
) + success(String::new()) | |
) + success(String::new())) >> move |frac: String| { | |
let int_part = int_part.clone(); | |
let frac = frac.clone(); | |
( (character('e') + character('E')) >> move |_| { | |
(character('+') + character('-') + success('+')) >> move |sign: char| | |
many1_string(digit()) >> move |exp_part: String| | |
success(format!("e{}{}", sign, exp_part)) | |
} + success(String::new())) >> move |exponent: String| { | |
let number_str = format!("{}{}{}", integer_part(minus_sign, | |
int_part.clone()), | |
frac.clone(), exponent); | |
match number_str.parse::<f64>() { | |
Ok(number) => success(JsonValue::Number(number)), | |
Err(_) => zero(), | |
} | |
} | |
} + (match format!("{}{}", | |
integer_part(minus_sign, | |
int_part.clone()), | |
frac.clone()).parse::<f64>() { | |
Ok(number) => success(JsonValue::Number(number)), | |
Err(_) => zero(), | |
}) | |
} | |
} | |
/// decode \uXXXX | |
fn hex4<'a>() -> Parser<'a, char> { | |
let hex_digit = || satisfies(|c| c.is_ascii_hexdigit()); | |
hex_digit() >> move |d1| | |
hex_digit() >> move |d2| | |
hex_digit() >> move |d3| | |
hex_digit() >> move |d4| | |
u32::from_str_radix(&format!("{}{}{}{}", d1, d2, d3, d4), 16) | |
.ok() | |
.and_then(std::char::from_u32) | |
.map_or_else(zero, success) | |
} | |
fn json_string<'a>() -> Parser<'a, JsonValue> { | |
// Parser to handle escape sequences | |
let escape = character('\\') >> |_| | |
satisfies(|c| "\"\\/bfnrtu".contains(c)) >> move |esc| { | |
let escaped_char = match esc { | |
'"' => '"', | |
'\\' => '\\', | |
'/' => '/', | |
'b' => '\x08', | |
'f' => '\x0C', | |
'n' => '\n', | |
'r' => '\r', | |
't' => '\t', | |
'u' => return hex4(), | |
_ => return zero(), | |
}; | |
success(escaped_char) | |
}; | |
// Parser to handle a regular character or an escape sequence | |
let char_or_escape = escape + satisfies(|c| c != '"' && c != '\\'); | |
block(character('"'), many_string(char_or_escape), character('"')) | |
>> |s: String| success(JsonValue::String(s)) | |
} | |
fn json_array<'a>() -> Parser<'a, JsonValue> { | |
block( | |
character('['), | |
sequence(json_value().clone(), | |
character(',')), | |
character(']'), | |
) >> |values: Vec<JsonValue>| | |
success(JsonValue::Array(values)) | |
} | |
fn json_object<'a>() -> Parser<'a, JsonValue> { | |
let key_value_pair = json_string() | |
>> move |json_key| match json_key { | |
JsonValue::String(key) => { | |
blank() >> move |_| | |
character(':') >> { | |
let key = key.clone(); | |
move |_| json_value() >> { | |
let key = key.clone(); | |
move |value| success((key.clone(), value)) | |
} | |
} | |
} | |
_ => zero(), | |
}; | |
block( | |
character('{') >> |_| blank(), | |
sequence(key_value_pair, character(',') >> |_| blank()), | |
character('}'), | |
) >> |pairs: Vec<(String, JsonValue)>| | |
success(JsonValue::Object(pairs)) | |
} | |
fn json_value<'a>() -> Parser<'a, JsonValue> { | |
blank() >> |_| | |
( json_null() | |
+ json_bool() | |
+ json_number() | |
+ json_string() | |
+ json_array() | |
+ json_object()) >> |v| | |
blank() >> move |_| | |
success(v.clone()) | |
} | |
fn main() { | |
let json_parse = json_value(); | |
let json_payload = if let Some(filename) = env::args().nth(1) { | |
fs::read_to_string(filename) | |
} else { | |
Ok(r###"{ | |
"Hello," : "Wolrd!", | |
"number": -123.45, | |
"number2": -123.45e85, | |
"number3": -123e85, | |
"number4": -123.E85, | |
"bool": true, | |
"array": [1 , 2, 3, "m\"eh\u0111 31\u00f8C", false], | |
"null": null, | |
"baz": { | |
"a": true, | |
"b": false | |
}, | |
"plop": {}, | |
"gr8k": [] | |
}"###.to_string()) | |
}; | |
match json_payload { | |
Ok(payload) => { | |
let rs = json_parse(payload); | |
for Interpretation(c, s) in rs.result { | |
println!("{:?} -> input remainder \"{}\"", c, s); | |
} | |
} | |
Err(err) => println!("Err: {}", err), | |
} | |
println!("END"); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment