Skip to content

Instantly share code, notes, and snippets.

@sethhall
Last active July 7, 2024 18:37
Show Gist options
  • Save sethhall/386c941a0f778d8b79be03c7fbfd47d0 to your computer and use it in GitHub Desktop.
Save sethhall/386c941a0f778d8b79be03c7fbfd47d0 to your computer and use it in GitHub Desktop.
JSON parser in Spicy
module JSON;
import spicy;
# This supports jsonc (json with comments)
# Module-level skip pattern: optional whitespace (space, tab, CR, LF), then zero
# or more `//` line comments (each extending up to, but not including, the next
# newline), then optional whitespace again.
# Only `//` comments are matched; `/* ... */` block comments are not.
# NOTE(review): within a single match, a second `//` comment has to begin
# immediately where the previous one ended, so comment lines separated by a
# newline do not appear to be covered by one match -- confirm whether the skip
# pattern is re-applied in that case.
%skip = /[ \t\r\n]*(\/\/[^\n]*)*[ \t\r\n]*/;
# Public top-level unit: parses the input as a sequence of JSONValue elements,
# collected in `values`.
public type File = unit {
values: JSONValue[];
};
# A single JSON value. The expression-less `switch` with `->` alternatives picks
# one of the listed units, so exactly one of the fields below is parsed per value:
#   obj - JSONObject, arr - JSONArray, str - JSONString,
#   bol - JSONBool,   nul - JSONNull,  num - JSONNumber
# Each field holds the result of the corresponding unit's `&convert` expression.
type JSONValue = unit {
switch {
-> obj : JSONObject;
-> arr : JSONArray;
-> str : JSONString;
-> bol : JSONBool;
-> nul : JSONNull;
-> num : JSONNumber;
};
};
# A JSON object: an opening `{`, a sequence of JSONPair entries stored in
# `fields`, and a closing `}`. The unit's value is replaced by the result of
# json_object_to_map(), which is called with the parsed unit.
type JSONObject = unit {
: /\{/;
fields: JSONPair[];
: /\}/;
} &convert=json_object_to_map($$);
# One `key : value` entry of an object.
#   key   - a JSONString
#   value - any JSONValue
# The `:` separator and a following `,` are consumed but not stored. The comma
# pattern is `,?`, i.e. optional, so this unit does not itself require a comma
# between entries and does not reject one after the last entry.
type JSONPair = unit {
key: JSONString;
: skip /:/;
value: JSONValue;
: skip /,?/;
};
# One element of an array: a JSONValue followed by an optional `,` (pattern
# `,?`), which is consumed but not stored. The unit converts to the contained
# value itself.
type JSONArrayElement = unit {
value: JSONValue;
: skip /,?/;
} &convert=self.value;
# A JSON array: a literal `[`, a sequence of JSONArrayElement entries stored in
# `values`, and a literal `]`. Both brackets are skipped rather than stored.
# The unit converts to the `values` vector.
type JSONArray = unit {
: skip b"[";
values: JSONArrayElement[];
: skip b"]";
} &convert=self.values;
# A JSON string: an opening double quote followed by raw bytes collected in
# `value`. Parsing of `value` stops at a double quote (34) that is either the
# very first byte or whose preceding collected byte is not a backslash (92).
# The collected bytes are handed to vec_to_str(), which resolves the backslash
# escapes and produces the final string.
# NOTE(review): the stop condition only inspects the single previous byte, so a
# string whose content ends in an escaped backslash (e.g. "a\\") looks like it
# would have its closing quote treated as escaped -- confirm.
type JSONString = unit {
: /\"/;
value: uint8[] &until=($$ == 34 && (|self.value| == 0 || self.value.back() != 92));
} &convert=vec_to_str(self.value);
# A JSON boolean: matches the literal `false` or `true` and converts the matched
# bytes to a bool through str_to_bool().
type JSONBool = unit {
value: /false|true/;
} &convert=str_to_bool(self.value);
# JSON `null`: matches the literal `null`; the unit converts to whatever
# get_null() returns (an `optional<bool>` declared without a value).
type JSONNull = unit {
value: /null/;
} &convert=get_null();
# A JSON number: optional leading `-`, one or more digits, an optional
# fractional part (`.` plus digits) and an optional exponent (`e`/`E`, optional
# sign, digits). The matched bytes are converted with `to_real()`, so every
# number -- including integers -- becomes a real.
# NOTE(review): the integer part is `[0-9]+`, so leading zeros such as `007`
# are accepted by this pattern.
type JSONNumber = unit {
value: /-?[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?/ &nosub;
} &convert=self.value.to_real();
#######################
## Utility Functions ##
#######################
# Produces the value used to represent JSON `null`: an `optional<bool>` that is
# declared but never assigned.
function get_null(): optional<bool> {
    # Declaring the optional without an initializer and returning it as-is is
    # the way this module expresses "no value".
    local unset_value: optional<bool>;
    return unset_value;
}
# Builds a key -> value map from the pairs of a parsed JSONObject.
#
# jo: the parsed object whose `fields` are iterated in order.
#
# Returns a map from each pair's key to its JSONValue. Entries are assigned in
# field order, so a later pair with the same key replaces an earlier one.
function json_object_to_map(jo: JSONObject): map<string, JSONValue> {
    local result: map<string, JSONValue>;

    for ( pair in jo.fields ) {
        result[pair.key] = pair.value;
    }

    return result;
}
# Returns the numeric value (0-15) of an ASCII hexadecimal digit, or 16 when
# `c` is not a hexadecimal digit.
function json_hex_digit_value(c: uint8): uint64 {
    local v = cast<uint64>(c);

    if ( v >= 0x30 && v <= 0x39 )
        # "0".."9"
        return v - 0x30;

    if ( v >= 0x41 && v <= 0x46 )
        # "A".."F"
        return v - 0x41 + 10;

    if ( v >= 0x61 && v <= 0x66 )
        # "a".."f"
        return v - 0x61 + 10;

    return 16;
}

# Encodes a Unicode code point (0 .. 0x10FFFF) as UTF-8 and returns the bytes.
function json_utf8_encode(cp: uint64): bytes {
    local enc = b"";

    if ( cp < 0x80 ) {
        enc += pack(cast<uint8>(cp), spicy::ByteOrder::Network);
    } else if ( cp < 0x800 ) {
        enc += pack(cast<uint8>(0xC0 | (cp >> 6)), spicy::ByteOrder::Network);
        enc += pack(cast<uint8>(0x80 | (cp & 0x3F)), spicy::ByteOrder::Network);
    } else if ( cp < 0x10000 ) {
        enc += pack(cast<uint8>(0xE0 | (cp >> 12)), spicy::ByteOrder::Network);
        enc += pack(cast<uint8>(0x80 | ((cp >> 6) & 0x3F)), spicy::ByteOrder::Network);
        enc += pack(cast<uint8>(0x80 | (cp & 0x3F)), spicy::ByteOrder::Network);
    } else {
        enc += pack(cast<uint8>(0xF0 | (cp >> 18)), spicy::ByteOrder::Network);
        enc += pack(cast<uint8>(0x80 | ((cp >> 12) & 0x3F)), spicy::ByteOrder::Network);
        enc += pack(cast<uint8>(0x80 | ((cp >> 6) & 0x3F)), spicy::ByteOrder::Network);
        enc += pack(cast<uint8>(0x80 | (cp & 0x3F)), spicy::ByteOrder::Network);
    }

    return enc;
}

# Converts the raw bytes of a JSON string body (without the surrounding quotes)
# into a string, resolving backslash escapes.
#
# vec: the bytes between the quotes, escapes still in their textual form.
#
# Handled escapes: \b \f \n \r \t map to their control characters; \uXXXX is
# decoded to the code point it names and emitted as UTF-8, with a UTF-16
# surrogate pair (\uD800-\uDBFF followed by \uDC00-\uDFFF) combined into a
# single code point. Any other escaped byte (e.g. \" \\ \/) is passed through
# unchanged. Malformed \u escapes (non-hex digits, a surrogate without its
# partner, or an escape cut short by the end of the input) produce U+FFFD, the
# Unicode replacement character.
#
# Returns the accumulated bytes decoded as a string.
function vec_to_str(vec: vector<uint8>): string {
    # UTF-8 encoding of U+FFFD.
    local replacement = b"\xEF\xBF\xBD";

    local out = b"";
    local escaping = False;

    # State of a \uXXXX escape that is currently being read.
    local hex_remaining = 0;
    local code_unit: uint64 = 0;
    local code_unit_valid = True;

    # A high surrogate waiting for its low partner; 0 means "none pending".
    local pending_high: uint64 = 0;

    for ( i in vec ) {
        local x = i;

        if ( hex_remaining > 0 ) {
            --hex_remaining;

            local digit = json_hex_digit_value(x);
            if ( digit > 15 )
                code_unit_valid = False;
            else
                code_unit = (code_unit << 4) | digit;

            if ( hex_remaining == 0 ) {
                if ( ! code_unit_valid ) {
                    # Bad hex digit somewhere in the escape.
                    if ( pending_high != 0 ) {
                        out += replacement;
                        pending_high = 0;
                    }
                    out += replacement;
                } else if ( code_unit >= 0xD800 && code_unit <= 0xDBFF ) {
                    # High surrogate: hold it until the next escape shows up.
                    if ( pending_high != 0 )
                        out += replacement;
                    pending_high = code_unit;
                } else if ( code_unit >= 0xDC00 && code_unit <= 0xDFFF ) {
                    # Low surrogate: only meaningful right after a high one.
                    if ( pending_high != 0 ) {
                        out += json_utf8_encode(0x10000 + ((pending_high - 0xD800) << 10) + (code_unit - 0xDC00));
                        pending_high = 0;
                    } else {
                        out += replacement;
                    }
                } else {
                    if ( pending_high != 0 ) {
                        out += replacement;
                        pending_high = 0;
                    }
                    out += json_utf8_encode(code_unit);
                }
            }

            continue;
        }

        if ( ! escaping && x == 92 ) {
            # Backslash: the next byte selects the escape.
            escaping = True;
            continue;
        }

        if ( escaping ) {
            escaping = False;

            switch ( x ) {
                case 0x62: {
                    # "b" - backspace
                    x = 0x08;
                }
                case 0x66: {
                    # "f" - formfeed
                    x = 0x0C;
                }
                case 0x6E: {
                    # "n" - newline
                    x = 0x0A;
                }
                case 0x72: {
                    # "r" - carriage return
                    x = 0x0D;
                }
                case 0x74: {
                    # "t" - horizontal tab
                    x = 0x09;
                }
                case 0x75: {
                    # "u" - unicode escape: the next 4 bytes are hex digits.
                    hex_remaining = 4;
                    code_unit = 0;
                    code_unit_valid = True;
                    continue;
                }
                default: {
                    # Any other escaped byte is passed straight through.
                    x = x;
                }
            }
        }

        # A high surrogate that was not followed by another \u escape is lone.
        if ( pending_high != 0 ) {
            out += replacement;
            pending_high = 0;
        }

        out += pack(x, spicy::ByteOrder::Network);
    }

    # Input ended with an unpaired high surrogate.
    if ( pending_high != 0 )
        out += replacement;

    # Input ended in the middle of a \uXXXX escape.
    if ( hex_remaining > 0 )
        out += replacement;

    return out.decode();
}
# Maps the matched JSON boolean literal to a bool.
#
# str: the bytes `true` or `false`.
#
# Returns True for b"true" and False for b"false"; any other input trips the
# assertion below.
function str_to_bool(str: bytes): bool {
    if ( str == b"false" ) {
        return False;
    }

    if ( str == b"true" ) {
        return True;
    }

    assert False : "Something neither true nor false was fed to 'to_bool'";
}
@bbannier
Copy link

@sethhall, do you have a compact reproducer of parser+input for this? I could imagine that lookahead parsing with possibly empty matches might have weird edge cases (though e.g., it seems your parser should always consume some input).

@sethhall
Copy link
Author

Sorry. Don't have a minimal reproducer yet but I'll try to make one today.

@sethhall
Copy link
Author

I just published another set of changes after chatting with Robin. The parser now works correctly, but we did identify a bug in the lookahead parsing. I'll still try to create a minimal reproducer today. (It has something to do with multiple regex fields in a structure after a lookahead when the %skip option is set.)

@sethhall
Copy link
Author

sethhall commented Jul 6, 2024

Ok, just published some more changes to this. String escaping is working now (except for the \uXXXX thing they do in json, that's a bit more complicated and I don't feel like working on it).

I also updated it to use a Spicy version that includes the bug fix that just went in for multiple regular expressions in a row. It also now uses the new built-in to_real function on byte arrays.

@sethhall
Copy link
Author

sethhall commented Jul 7, 2024

And now another change set that makes this support jsonc (json with comments)

@sethhall
Copy link
Author

sethhall commented Jul 7, 2024

Fixed an issue with vector handling during string parsing.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment