JSON Parser in Rust
#![feature(slice_patterns)]
use std::io;
use std::fmt;
use std::str;
use std::result;
use std::iter::Iterator;
// TODO: take deparse option from command-line arguments
fn main() {
    // NOTE: read_line() reads a single line, so the input JSON must fit on one line.
    let mut input = String::new();
    io::stdin().read_line(&mut input)
        .expect("Failed to read json");
    let result =
        tokenize(&input)
            .and_then(|tokens| parse(&tokens[..]))
            .map(|json| deparse(&json, DEFAULT_DEPARSE_OPTION, 0));
    match result {
        Ok(output) => println!("{}", output),
        Err(msg) => println!("error: {}", msg),
    };
}
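// Example invocation (illustrative, not part of the original gist; assumes a
// nightly toolchain because of the `slice_patterns` feature):
//
//   $ echo '{"key": [1, 2]}' | cargo +nightly run
//
// prints the value again, pretty-printed with DEFAULT_DEPARSE_OPTION.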
#[derive(Clone, Debug)]
enum Token {
    Number(f64),
    String(std::string::String),
    True,
    False,
    Null,
    Colon,
    Comma,
    LeftBrace,
    RightBrace,
    LeftBracket,
    RightBracket,
}
// TODO: https://stackoverflow.com/a/50334049/6321251
impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}", match self {
            Token::Number(f) => ["number(", f.to_string().as_str(), ")"].concat(),
            Token::String(s) => ["string(", s, ")"].concat(),
            Token::True => "true".to_string(),
            Token::False => "false".to_string(),
            Token::Null => "null".to_string(),
            Token::Colon => "colon".to_string(),
            Token::Comma => "comma".to_string(),
            Token::LeftBrace => "left brace".to_string(),
            Token::RightBrace => "right brace".to_string(),
            Token::LeftBracket => "left bracket".to_string(),
            Token::RightBracket => "right bracket".to_string(),
        })
    }
}
impl Eq for Token {}
impl PartialEq for Token {
    fn eq(&self, other: &Token) -> bool {
        // FIXME: comparing via the Display output is imprecise for Token::Number
        self.to_string() == other.to_string()
    }
}
// I think a lexer is not strictly necessary for JSON...
// but this is practice :)
type TokenizeResult<T> = result::Result<T, String>;
// Scan the whole input and turn it into a flat list of tokens.
fn tokenize(input: &str) -> TokenizeResult<Vec<Token>> {
    // Work on a Vec<char> so that positions count characters, not bytes.
    let chars: Vec<char> = input.chars().collect();
    let mut pos = 0;
    let mut tokens = Vec::new();
    let end = chars.len();
    while pos < end {
        match chars.get(pos..end) {
            // skip whitespace: space, CR, LF, tab
            Some(&[c, ..]) if c == '\u{0020}' || c == '\u{000D}' || c == '\u{000A}' || c == '\u{0009}' =>
                pos += 1,
            Some(&[':', ..]) => {
                tokens.push(Token::Colon);
                pos += 1
            },
            Some(&[',', ..]) => {
                tokens.push(Token::Comma);
                pos += 1
            },
            Some(&['{', ..]) => {
                tokens.push(Token::LeftBrace);
                pos += 1
            },
            Some(&['}', ..]) => {
                tokens.push(Token::RightBrace);
                pos += 1
            },
            Some(&['[', ..]) => {
                tokens.push(Token::LeftBracket);
                pos += 1
            },
            Some(&[']', ..]) => {
                tokens.push(Token::RightBracket);
                pos += 1
            },
            Some(&[d, ..]) if is_digit(d) || d == '-' =>
                match tokenize_number(&chars, &mut pos) {
                    Ok(tok) => tokens.push(tok),
                    Err(msg) => return Err(msg),
                },
            Some(&['"', ..]) =>
                match tokenize_string(&chars, &mut pos) {
                    Ok(tok) => tokens.push(tok),
                    Err(msg) => return Err(msg),
                },
            Some(&['t', 'r', 'u', 'e', ..]) => {
                tokens.push(Token::True);
                pos += 4
            },
            Some(&['f', 'a', 'l', 's', 'e', ..]) => {
                tokens.push(Token::False);
                pos += 5
            },
            Some(&['n', 'u', 'l', 'l', ..]) => {
                tokens.push(Token::Null);
                pos += 4
            },
            // TODO: show also the text at error
            _ =>
                return Err(format!("tokenizer: unknown token at pos {}", pos + 1)),
        }
    }
    Ok(tokens)
}
fn is_digit(c: char) -> bool {
    '0' <= c && c <= '9'
}
// Tokenize a number: an optional minus, an integer part, then optional
// fraction and exponent.  `pos` is advanced past the number.
fn tokenize_number(chars: &[char], pos: &mut usize) -> result::Result<Token, String> {
    let begin = *pos;
    // integer part (optional leading minus)
    if let Some(&['-']) = chars.get(*pos..*pos+1) {
        *pos += 1
    }
    let digit_begin = *pos;
    while let Some(&[d]) = chars.get(*pos..*pos+1) {
        if !is_digit(d) {
            break
        }
        *pos += 1
    }
    if *pos - digit_begin > 1 {
        let digits: String = chars.get(begin..*pos).unwrap_or(&[]).into_iter().collect();
        if let Some(&['0']) = chars.get(digit_begin..digit_begin+1) {
            return Err(format!("invalid number syntax: a number with 2 or more digits must not start with zero: {}", digits))
        }
    }
    // fraction
    if let Some(&['.']) = chars.get(*pos..*pos+1) {
        *pos += 1;
        while let Some(&[d]) = chars.get(*pos..*pos+1) {
            if !is_digit(d) {
                break
            }
            *pos += 1
        }
    }
    // exponent
    if let Some(&['E']) | Some(&['e']) = chars.get(*pos..*pos+1) {
        *pos += 1;
        if let Some(&['+']) | Some(&['-']) = chars.get(*pos..*pos+1) {
            *pos += 1;
        }
        while let Some(&[d]) = chars.get(*pos..*pos+1) {
            if !is_digit(d) {
                break
            }
            *pos += 1
        }
    }
    let digits: String = chars.get(begin..*pos).unwrap_or(&[]).into_iter().collect();
    // A bare "-" (or any other non-number) ends up here, so report an error
    // instead of panicking.
    match digits.parse::<f64>() {
        Ok(n) => Ok(Token::Number(n)),
        Err(_) => Err(format!("tokenizer: invalid number syntax: {}", digits)),
    }
}
// Tokenize a string literal.  The returned token keeps the raw source form,
// including the surrounding double quotes and any escape sequences.
fn tokenize_string(chars: &[char], pos: &mut usize) -> result::Result<Token, String> {
    let begin = *pos;
    *pos += 1; // skip the opening quote
    let end = chars.len();
    let mut closed = false;
    while *pos < end {
        match chars.get(*pos..end) {
            Some(&['"', ..]) => {
                *pos += 1;
                closed = true;
                break
            },
            Some(&['\\', '"', ..]) => *pos += 2,
            Some(&['\\', '\\', ..]) => *pos += 2,
            Some(&['\\', '/', ..]) => *pos += 2,
            Some(&['\\', 'b', ..]) => *pos += 2,
            Some(&['\\', 'f', ..]) => *pos += 2,
            Some(&['\\', 'n', ..]) => *pos += 2,
            Some(&['\\', 'r', ..]) => *pos += 2,
            Some(&['\\', 't', ..]) => *pos += 2,
            Some(&['\\', 'u', h1, h2, h3, h4, ..]) if is_hex(h1) && is_hex(h2) && is_hex(h3) && is_hex(h4) => *pos += 6,
            Some(&[c, ..]) if '\u{0020}' <= c && c <= '\u{10ffff}' && c != '"' && c != '\\' => *pos += 1,
            _ => return Err(format!("tokenizer: invalid character or escape inside string at pos {}", *pos + 1)),
        };
    }
    if !closed {
        return Err(format!("tokenizer: unexpected EOF inside string starting at pos {}", begin + 1))
    }
    // Collect from the char slice rather than byte-indexing the input &str,
    // so multi-byte characters cannot break the slice boundaries.
    let s: String = chars.get(begin..*pos).unwrap_or(&[]).iter().collect();
    Ok(Token::String(s))
}
fn is_hex(c: char) -> bool {
    ('0' <= c && c <= '9') || ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f')
}
#[derive(Clone)]
enum JSON {
    Object(Vec<(JSON, JSON)>),
    Array(Vec<JSON>),
    Number(f64),
    String(std::string::String),
    True,
    False,
    Null,
}
impl fmt::Display for JSON {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}", deparse(self, DEFAULT_DEPARSE_OPTION, 0))
    }
}
type ParseResult<T> = result::Result<T, String>;
// Parse a full JSON value; it is an error if any tokens are left over.
fn parse(tokens: &[Token]) -> ParseResult<JSON> {
    let (result, rest) = do_parse(tokens);
    if let Err(_) = result {
        return result
    }
    if !rest.is_empty() {
        // TODO
        // return Err(format!("parser: unexpected tokens after {} at {}"))
        return Err(format!("parser: unexpected tokens: {:?}", rest))
    }
    result
}
// Parse one JSON value from the front of `tokens` and return the remaining tokens.
fn do_parse(tokens: &[Token]) -> (ParseResult<JSON>, &[Token]) {
    if tokens.is_empty() {
        return (Err("no json input".to_string()), tokens)
    }
    match &tokens[0] {
        Token::LeftBrace => parse_object(&tokens[1..]),
        Token::LeftBracket => parse_array(&tokens[1..]),
        Token::Number(n) => (Ok(JSON::Number(*n)), &tokens[1..]),
        Token::String(s) => (Ok(JSON::String(s.to_string())), &tokens[1..]),
        Token::True => (Ok(JSON::True), &tokens[1..]),
        Token::False => (Ok(JSON::False), &tokens[1..]),
        Token::Null => (Ok(JSON::Null), &tokens[1..]),
        token => (Err(format!("unknown token {}", token)), tokens),
    }
}
fn parse_object(tokens: &[Token]) -> (ParseResult<JSON>, &[Token]) {
    let mut pairs: Vec<(JSON, JSON)> = Vec::new();
    match parse_object_rec(tokens, &mut pairs) {
        Ok(rest) => (Ok(JSON::Object(pairs)), rest),
        Err(msg) => (Err(msg), tokens),
    }
}
// Parse the members of an object after the opening brace; each key/value pair
// is pushed into `pairs` and the tokens after the closing brace are returned.
fn parse_object_rec<'a, 'b>(tokens: &'a [Token], pairs: &'b mut Vec<(JSON, JSON)>) -> ParseResult<&'a [Token]> {
    match tokens {
        [] =>
            return Err("unexpected EOF while parsing object".to_string()),
        [Token::RightBrace, ..] =>
            return Ok(&tokens[1..]),
        [ref key @ Token::String(_), Token::Colon, ..] => {
            let key_result = parse(&[key.clone()]);
            let (value_result, rest) = do_parse(&tokens[2..]);
            match [key_result, value_result] {
                [Ok(k), Ok(v)] => pairs.push((k, v)),
                [Err(msg), _] | [_, Err(msg)] => return Err(msg),
            };
            match rest {
                [Token::RightBrace, ..] =>
                    Ok(&rest[1..]),
                [Token::Comma, tok, ..] =>
                    if *tok == Token::RightBrace {
                        Err("invalid object syntax: got right brace after comma".to_string())
                    } else {
                        parse_object_rec(&rest[1..], pairs)
                    },
                [] =>
                    Err("invalid object syntax: expected right brace or comma but got EOF".to_string()),
                _ =>
                    Err(format!("invalid object syntax: expected right brace or comma but got {}", rest[0])), // TODO: more readable message
            }
        },
        _ =>
            Err("invalid object syntax: a key string followed by a colon is expected".to_string()) // TODO: more readable message
    }
}
fn parse_array(tokens: &[Token]) -> (ParseResult<JSON>, &[Token]) {
    let mut elems: Vec<JSON> = Vec::new();
    match parse_array_rec(tokens, &mut elems) {
        Ok(rest) => (Ok(JSON::Array(elems)), rest),
        Err(msg) => (Err(msg), tokens),
    }
}
// Parse the elements of an array after the opening bracket; each element is
// pushed into `elems` and the tokens after the closing bracket are returned.
fn parse_array_rec<'a, 'b>(tokens: &'a [Token], elems: &'b mut Vec<JSON>) -> ParseResult<&'a [Token]> {
    match tokens {
        [] =>
            Err("unexpected EOF while parsing array".to_string()),
        [Token::RightBracket, ..] =>
            Ok(&tokens[1..]),
        _ => {
            let (result, rest) = do_parse(tokens);
            match result {
                Ok(v) => elems.push(v),
                Err(msg) => return Err(msg),
            };
            match rest {
                [Token::RightBracket, ..] =>
                    Ok(&rest[1..]),
                [Token::Comma, tok, ..] =>
                    if *tok == Token::RightBracket {
                        Err("invalid array syntax: got right bracket after comma".to_string())
                    } else {
                        parse_array_rec(&rest[1..], elems)
                    },
                [] =>
                    Err("invalid array syntax: expected right bracket or comma but got EOF".to_string()), // TODO: more readable message
                _ =>
                    Err(format!("invalid array syntax: expected right bracket or comma but got {}", rest[0])), // TODO: more readable message
            }
        },
    }
}
const DEFAULT_DEPARSE_OPTION: &DeparseOption = &DeparseOption {
    left_brace: "{",
    right_brace: "}",
    left_bracket: "[",
    right_bracket: "]",
    indent: " ",
    colon: ": ",
    comma: ",",
    newline: "\n",
};
struct DeparseOption<'a> {
    left_brace: &'a str,
    right_brace: &'a str,
    left_bracket: &'a str,
    right_bracket: &'a str,
    indent: &'a str,
    colon: &'a str,
    comma: &'a str,
    newline: &'a str,
}
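// Illustrative sketch, not part of the original gist: the TODO in main()
// mentions taking a deparse option from the command line.  A hypothetical
// compact variant could simply drop all whitespace, e.g.:
#[allow(dead_code)]
const COMPACT_DEPARSE_OPTION: &DeparseOption = &DeparseOption {
    left_brace: "{",
    right_brace: "}",
    left_bracket: "[",
    right_bracket: "]",
    indent: "",
    colon: ":",
    comma: ",",
    newline: "",
};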
// Serialize a JSON value back to text.  `nest` is the current nesting depth,
// used to repeat `option.indent`.
fn deparse(json: &JSON, option: &DeparseOption, nest: usize) -> String {
    match json {
        JSON::Object(pairs) =>
            [option.left_brace.to_string(),
             option.newline.to_string(),
             pairs.iter()
                 .map(|(k, v)|
                     [
                         option.indent.to_string().repeat(nest + 1),
                         deparse(k, option, nest + 1),
                         option.colon.to_string(),
                         deparse(v, option, nest + 1)
                     ].concat()
                 )
                 .collect::<Vec<String>>()
                 .join([option.comma, option.newline].concat().as_str()),
             option.newline.to_string(),
             option.indent.to_string().repeat(nest),
             option.right_brace.to_string()].concat(),
        JSON::Array(elems) =>
            [option.left_bracket.to_string(),
             option.newline.to_string(),
             elems.iter()
                 .map(|v|
                     [
                         option.indent.to_string().repeat(nest + 1),
                         deparse(v, option, nest + 1)
                     ].concat()
                 )
                 .collect::<Vec<String>>()
                 .join([option.comma, option.newline].concat().as_str()),
             option.newline.to_string(),
             option.indent.to_string().repeat(nest),
             option.right_bracket.to_string()].concat(),
        JSON::Number(n) =>
            n.to_string(),
        JSON::String(s) =>
            s.to_string(),
        JSON::True =>
            "true".to_string(),
        JSON::False =>
            "false".to_string(),
        JSON::Null =>
            "null".to_string(),
    }
}
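// A minimal test sketch (not part of the original gist) showing how the
// tokenize -> parse -> deparse pipeline could be exercised; the test names
// and inputs below are illustrative.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn round_trips_a_small_object() {
        let input = r#"{"a": [1, 2], "b": null}"#;
        let tokens = tokenize(input).expect("tokenize should succeed");
        let json = parse(&tokens[..]).expect("parse should succeed");
        // The pretty-printed output should itself tokenize cleanly.
        let output = deparse(&json, DEFAULT_DEPARSE_OPTION, 0);
        assert!(tokenize(&output).is_ok());
    }

    #[test]
    fn rejects_trailing_comma_in_array() {
        let tokens = tokenize("[1, ]").expect("tokenize should succeed");
        assert!(parse(&tokens[..]).is_err());
    }
}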