lutter/00-digest.md

## 00-digest.md

      
    Raw
  

              00-digest.md
            
          
    Digesting JSON data

This is a description of how we can compute a Digest of the subset of JSON
data that we care about.
The JSON subset puts the following restrictions on JSON values:

the names of object properties must be printable ASCII characters (i.e.,
UTF-8 code points 0x20 ' ' through 0x7e '~')
JSON numbers must be representable as 32 bit signed integers

We assume an accumulating digester, as the ones provided by the
digest crate in
Rust. These digesters expect their input as a byte array. For each JSON
construct, we therefore need to explain how that is converted into a byte
array.
To finalize the specification, we need to settle on a specific digest
algorithm, e.g., SHA256 or SHA512
Primitive JSON types are converted into a byte array in the following way:

string: the byte sequence of the UTF-8 encoding of the string, enclosed
in ". The characters " and \\ are escaped by prefixing them with a
\\
number: i followed by the byte sequence of the little-endian signed 64bit
representation of the number
bool: f for false and t for true
null: n

JSON arrays are converted into a byte sequence by starting with a [,
followed by the byte sequences for each array entry seperated by ,, and
closed with a ]
JSON objects/maps are converted into a byte sequence by starting with a
{, then for each key/value pair in the map in ascending order by key,
appending the byte array for the key, a :, and the byte array for the
value; the digests for key/value pairs are separated by , and the digest
of the entire map is finished with a }
As an example, the JSON object
{
  "id": "thing",
  "count": 12,
  "children": ["c1", "c2", "c3"]
}
would get digested as the bytes corresponding to the ASCII values in the
following string (using \xnm as the hexadecimal notation of a single byte)
{"children":["c1","c2","c3"],"count":i\x00\x00\x00\x00\x00\x00\x00\x0c,"id":"thing"}

The Augeas lens proves that there are no two JSON values that
map to the same byte sequence up to 7 levels of nesting of arrays and maps.
JSON Representation of Graph-specific Values

The GraphQL type system for the Graph uses a few types that can not be
represented as JSON, and are instead represented as strings. The conversion
of these values to JSON strings is as follows:

Bytes: 0x followed by the hexadecimal notation of the bytes in the
value, using only the lowercase hex characters a-f
BigInt: representation as a base-10 integer without leading zeroes
BigDecimal: represented as the integer part plus a . plus the
fractional part in base 10. If there is no integral part, use 0. If there is
no fractional part, omit the ..

TODO: Define the representation of BigInt and BigDecimal in
exponential notation to avoid huge strings of 0 in them (mostly following
what we do today)
Transmitting of digest

When we transmit a digest, we do it in the form scheme:digest:value where
the scheme describes how we arrived at the representation of the data
(let's call the description in this document graph-1), the digest
describes how a digest was computed from the byte representation,
e.g. sha256, and value is the actual digest value, e.g. 2b1e2b...0bfd
so that the complete digest value would be graph-1:sha256:2b1e2b...0bfd
Including the encoding scheme and the digest algorithm makes it possible to
update our JSON digest algorithm in the network by introducing new schemes.
Open Questions


Can we base string encoding on UTF-8? This draft
spec
seems to imply that that is not good for JavaScript clients, and we might
have to use UTF-16 instead


## digest.aug
module Digest =

(* Any byte *)
let any = /.|\n/

(* A string is any byte, enclosed in double quotes, except that the
   double quote and backslash characters need to be escaped with
   a backslash *)
let str =
  let str_char = /[^"\]|\\\\["\]/ in
  "\"" . str_char* . "\""
let _ = print_regexp str; print_endline ""

let number = "i" . any . any . any . any . any . any . any . any
let bool = /[tf]/
let null = "n"

(* Helper to handle our individual marker characters *)
let char(s:string) = del s s
let lbrack = char "["
let rbrack = char "]"
let lbrace = char "{"
let rbrace = char "}"
let comma  = char ","
let colon  = char ":"

(* The encodings of a scalar JSON value *)
let scalar = [ label "str" . store str ]
           | [ label "number" . store number ]
           | [ label "bool" . store bool ]
           | [ label "null" . store null ]


(* An array of values where val expresses what the values are *)
let array (val: lens) =
  [ label "array"
    . lbrack
    . (val . (comma . val)* )?
    . rbrack ]

(* A map/object of values *)
let map (val: lens) =
  (* We require the same escaping for the characters in the key
     as we do with general strings *)
  let key_char = (/[ -~]/ - /["\]/) | /\\\\["\]/ in
  let key = store (/"/ . key_char+ ./"/) in
  (* A key/value pair in a map *)
  let kv = [ label "kv" . (key . colon . val) ] in
  [ label "map" .
    lbrace . kv . (comma . kv) * . rbrace ]

(* Helper to deal with JSON values with increasing depth of nesting *)
let json(v:lens) = scalar | (array v) | (map v)

(* json1 accepts all encodings where we only allow scalars, and maps and
   objects of scalars, json2 adds one more level of nesting etc. *)
let json1 = json scalar
let json2 = json json1
let json3 = json json2
let json4 = json json3
let _ = print_endline "json4 checked"
let json5 = json json4
let json6 = json json5
let json7 = json json6


(* Sanity check that we really do handle nesting *)
test json3 get "[t,[t],[[t]]]" =
  { "array"
    { "bool" = "t" }
    { "array"
      { "bool" = "t" } }
    { "array"
      { "array"
        { "bool" = "t" } } } }
	module Digest =

	(* Any byte *)
	let any = /.\|\n/

	(* A string is any byte, enclosed in double quotes, except that the
	double quote and backslash characters need to be escaped with
	a backslash *)
	let str =
	let str_char = /[^"\]\|\\\\["\]/ in
	"\"" . str_char* . "\""
	let _ = print_regexp str; print_endline ""

	let number = "i" . any . any . any . any . any . any . any . any
	let bool = /[tf]/
	let null = "n"

	(* Helper to handle our individual marker characters *)
	let char(s:string) = del s s
	let lbrack = char "["
	let rbrack = char "]"
	let lbrace = char "{"
	let rbrace = char "}"
	let comma = char ","
	let colon = char ":"

	(* The encodings of a scalar JSON value *)
	let scalar = [ label "str" . store str ]
	\| [ label "number" . store number ]
	\| [ label "bool" . store bool ]
	\| [ label "null" . store null ]


	(* An array of values where val expresses what the values are *)
	let array (val: lens) =
	[ label "array"
	. lbrack
	. (val . (comma . val)* )?
	. rbrack ]

	(* A map/object of values *)
	let map (val: lens) =
	(* We require the same escaping for the characters in the key
	as we do with general strings *)
	let key_char = (/[ -~]/ - /["\]/) \| /\\\\["\]/ in
	let key = store (/"/ . key_char+ ./"/) in
	(* A key/value pair in a map *)
	let kv = [ label "kv" . (key . colon . val) ] in
	[ label "map" .
	lbrace . kv . (comma . kv) * . rbrace ]

	(* Helper to deal with JSON values with increasing depth of nesting *)
	let json(v:lens) = scalar \| (array v) \| (map v)

	(* json1 accepts all encodings where we only allow scalars, and maps and
	objects of scalars, json2 adds one more level of nesting etc. *)
	let json1 = json scalar
	let json2 = json json1
	let json3 = json json2
	let json4 = json json3
	let _ = print_endline "json4 checked"
	let json5 = json json4
	let json6 = json json5
	let json7 = json json6


	(* Sanity check that we really do handle nesting *)
	test json3 get "[t,[t],[[t]]]" =
	{ "array"
	{ "bool" = "t" }
	{ "array"
	{ "bool" = "t" } }
	{ "array"
	{ "array"
	{ "bool" = "t" } } } }