calraith/json.awk

## json.awk
#!/usr/bin/gawk -f

# Example usage: LC_ALL=C ./json.awk datafile.json

# Accumulate the input: every record is appended to "json" with nothing in
# between, so the whole document ends up in a single string.
{
	json = json $0
}

END {
	# Parse the accumulated text into "obj", a true multidimensional array.
	deserialize(json, obj)

	# Example lookup. Numerically indexed arrays start at 1, not 0.
	# print obj[1]["name"]

	# Write the structure back out as JSON, one tab per indent level.
	if (isarray(obj)) {
		print serialize(obj, "\t")
	}
}

# === FUNCTIONS ===

function join(arr, sep, _p, i) {
	# syntax: join(array, string separator)
	# returns a string: every element of arr, visited in "for (i in arr)" order
	# (so PROCINFO["sorted_in"] applies), with sep between neighbours.
	# sep may be omitted, in which case the elements are simply concatenated.
	# _p and i are locals.

	for (i in arr) {
		# Track "an element was already emitted" with an explicit flag instead of
		# inspecting the text built so far; that way empty or non-printable
		# leading elements are kept rather than silently discarded.
		if (_p["started"]) {
			_p["result"] = _p["result"] sep arr[i]
		} else {
			_p["result"] = arr[i]
			_p["started"] = 1
		}
	}
	return _p["result"]
}

function quote(str) {
	# syntax: quote(string)
	# returns str wrapped in double quotes, with backslash, double quote, CR, LF,
	# tab, form feed and backspace replaced by their JSON escape sequences

	# Backslashes go first so that the backslashes introduced by the later
	# substitutions are not doubled again.
	gsub(/\\/, "\\\\", str)
	# An unescaped double quote would terminate the JSON string early.
	gsub(/"/, "\\\"", str)
	gsub(/\r/, "\\r", str)
	gsub(/\n/, "\\n", str)
	gsub(/\t/, "\\t", str)
	gsub(/\f/, "\\f", str)
	gsub(/\b/, "\\b", str)
	return "\"" str "\""
}

function serialize(arr, indent_with, depth, _p, i, idx) {
	# syntax: serialize(array of arrays, indent string)
	# returns a JSON formatted string
	#
	#   arr          array to render; elements that are arrays are rendered recursively
	#   indent_with  string emitted once per nesting level in front of object members
	#   depth        nesting level handed down by the recursive calls; callers omit it
	#   _p, i, idx   locals
	#
	# An array whose keys are exactly 1..n becomes [...]; any other key set becomes {...}.
	# Scalar values are copied before quoting, so arr itself is left unmodified.
	# Side effect: PROCINFO["sorted_in"] is set to "@ind_num_asc" when it has no value.

	# sort arrays on key, ensures [...] values remain properly ordered
	if (!PROCINFO["sorted_in"]) PROCINFO["sorted_in"] = "@ind_num_asc"

	# determine whether array is indexed or associative: count 1, 2, ... once per
	# element and flag the array as associative when one of those numbers is not a key
	for (i in arr) {
		_p["assoc"] = or(_p["assoc"], !(++_p["idx"] in arr))
	}

	# if associative, indent. After the loop "indent" holds one copy of indent_with
	# per level including this one (used for members) and "end" holds one copy
	# fewer (used for the closing brace).
	if (_p["assoc"]) {
		for (i = ++depth; i--;) {
			_p["end"] = _p["indent"]; _p["indent"] = _p["indent"] indent_with
		}
	}

	for (i in arr) {
		# If key length is 0, assume it is an empty object
		if (!length(i)) return "{}"

		# quote key if not already quoted
		_p["key"] = i !~ /^".*"$/ ? quote(i) : i

		if (isarray(arr[i])) {
			if (_p["assoc"]) {
				_p["json"][++idx] = _p["indent"] _p["key"] ": " \
					serialize(arr[i], indent_with, depth)
			} else {
				# if indexed array, don't print keys
				_p["json"][++idx] = serialize(arr[i], indent_with, depth)
			}
		} else {
			# Work on a copy so the caller's array is not rewritten.
			_p["val"] = arr[i]

			# Quote unless the value is a JSON number, one of the literals
			# true / false / null, or already wrapped in double quotes. Values
			# longer than 1000 characters are always quoted. The alternation
			# is parenthesised so that ^ and $ apply to every alternative;
			# otherwise any value merely containing "false" or "null" would be
			# emitted bare.
			if (!(_p["val"] ~ /^-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][+-]?[0-9]+)?$/ ||
				_p["val"] ~ /^(true|false|null|".*")$/) || length(_p["val"]) > 1000)
				_p["val"] = quote(_p["val"])

			_p["json"][++idx] = _p["assoc"] ? _p["indent"] _p["key"] ": " _p["val"] : _p["val"]
		}
	}

	# Nothing was collected: render an empty list without handing a
	# never-assigned _p["json"] to join().
	if (!idx) return "[]"

	# gawk cannot distinguish between a value of null and no value, so an indexed
	# array whose only content is one empty string is rendered as an empty list.
	# Known limitation: [""] therefore becomes [].
	if (!_p["assoc"] && join(_p["json"]) == "\"\"") return "[]"

	# surround with curly braces if object, square brackets if array
	return _p["assoc"] ? "{\n" join(_p["json"], ",\n") "\n" _p["end"] "}" \
		: "[" join(_p["json"], ", ") "]"
}

function trim(str) {
	# syntax: trim(string)
	# returns str with leading and trailing whitespace removed
	gsub(/^\s+|\s+$/, "", str)
	return str
}

function unquote(str, _p) {
	# syntax: unquote(JSON token)
	# returns the token's value: whitespace around the token is dropped, the
	# surrounding quotes are removed and backslash escapes are decoded.
	# _p is a local.
	#
	# Decoded escapes: \" \\ \/ \b \f \n \r \t. Any other backslash sequence
	# (for example \uXXXX) is copied through unchanged.

	# Whitespace outside the quotes is not part of the value (same pattern as
	# trim()). Whitespace inside the quotes is kept.
	gsub(/^\s+|\s+$/, "", str)

	gsub(/^'|'$/, "", str)
	gsub(/^"|"$/, "", str)

	# Decode in one left-to-right pass so that every backslash is consumed
	# together with the character it escapes; running one substitution per
	# escape would misread an escaped backslash followed by "n" as a newline.
	while ((_p["at"] = index(str, "\\")) > 0) {
		_p["out"] = _p["out"] substr(str, 1, _p["at"] - 1)
		_p["esc"] = substr(str, _p["at"] + 1, 1)

		if (_p["esc"] == "n") _p["out"] = _p["out"] "\n"
		else if (_p["esc"] == "r") _p["out"] = _p["out"] "\r"
		else if (_p["esc"] == "t") _p["out"] = _p["out"] "\t"
		else if (_p["esc"] == "b") _p["out"] = _p["out"] "\b"
		else if (_p["esc"] == "f") _p["out"] = _p["out"] "\f"
		else if (_p["esc"] == "\"" || _p["esc"] == "\\" || _p["esc"] == "/")
			_p["out"] = _p["out"] _p["esc"]
		else
			_p["out"] = _p["out"] "\\" _p["esc"]

		str = substr(str, _p["at"] + 2)
	}
	return _p["out"] str
}

function tokenize(str, arr, splitchar, _p) {
	# syntax: tokenize(JSON-formatted string, array to populate, char to split on)
	# Splits str on every splitchar that sits outside brackets, braces and
	# double-quoted text, and stores the trimmed pieces in arr[1], arr[2], ...
	# returns the number of pieces stored
	# Used instead of FPAT / patsplit, which cannot reliably group mated
	# bracket pairs.
	#
	# State kept in _p: "[" and "{" are open-bracket / open-brace counters,
	# "\"" is the inside-a-string flag, "\\" is the "previous character was an
	# unescaped backslash" flag.

	_p["len"] = length(str)

	for (_p["pos"] = 1; _p["pos"] <= _p["len"]; _p["pos"]++) {

		_p["char"] = substr(str, _p["pos"], 1)

		if (_p["char"] == "\\") {
			# a backslash toggles the escape flag
			_p["\\"] = !_p["\\"]
		} else {
			if (!_p["\\"]) {
				if (_p["char"] == "\"") {
					_p["\""] = !_p["\""]
				} else if (!_p["\""]) {
					if (_p["char"] == "[") _p["["]++
					else if (_p["char"] == "]") _p["["]--
					else if (_p["char"] == "{") _p["{"]++
					else if (_p["char"] == "}") _p["{"]--
				}
			}
			# any character other than a backslash ends the escape
			_p["\\"] = 0
		}

		if (_p["char"] == splitchar && !_p["["] && !_p["{"] && !_p["\""] && !_p["\\"]) {
			# top-level separator: close the current piece
			arr[++_p["idx"]] = trim(_p["segment"])
			_p["segment"] = ""
		} else {
			_p["segment"] = _p["segment"] _p["char"]
		}
	}

	# whatever follows the last separator is the final piece
	arr[++_p["idx"]] = trim(_p["segment"])
	return _p["idx"]
}

function deserialize(json, arr, _p, _parts, _values, _keyval, i, j) {
	# syntax: deserialize (JSON-formatted string, array to populate)
	# Fills arr as a true multidimensional array (arr[idx][idx][etc...]),
	# not a concatenated index (arr[idx,idx,etc...]).
	# Everything after arr in the parameter list is a local.

	# Peel off the outer bracket / brace pair with substr();
	# match() failed here with very large JSON data.
	json = trim(json)
	_parts[1] = substr(json, 1, 1)
	_parts[2] = substr(json, 2, length(json) - 2)

	# split the interior on unbracketed, unquoted commas
	_p["outie"] = tokenize(trim(_parts[2]), _values, ",")

	# the opening character decides between object and list handling
	_p["is_obj"] = (_parts[1] ~ "{")

	for (i = 1; i <= _p["outie"]; i++) {

		if (!_p["is_obj"]) {
			# numerically indexed array: advance to the next unused index
			while (++_p["idx"] in arr) {}

			_p["item"] = trim(_values[i])

			if (_p["item"] ~ /^[\[\{]/) {
				# nested container: create the element as an array first
				# (it would default to a scalar otherwise), then recurse
				arr[_p["idx"]][0]; delete arr[_p["idx"]][0]
				deserialize(_p["item"], arr[_p["idx"]])
			} else {
				arr[_p["idx"]] = unquote(_values[i])
			}
			continue
		}

		# associative array: split the member on unbracketed, unquoted colons
		_p["innie"] = tokenize(trim(_values[i]), _keyval, ":")

		for (j = 1; j <= _p["innie"]; j += 2) {
			_p["key"] = unquote(_keyval[j])

			if (trim(_keyval[j+1]) ~ /^[\[\{]/) {
				# nested container: create the element as an array first
				# (it would default to a scalar otherwise), then recurse
				arr[_p["key"]][0]; delete arr[_p["key"]][0]
				deserialize(_keyval[j+1], arr[_p["key"]])
			} else {
				arr[_p["key"]] = unquote(_keyval[j+1])
			}
		}
	}
}
	#!/usr/bin/gawk -f

	# Example usage: LC_ALL=C ./json.awk datafile.json

	# Accumulate the input: every record is appended to "json" with nothing in
	# between, so the whole document ends up in a single string.
	{
		json = json $0
	}

	END {
		# Parse the accumulated text into "obj", a true multidimensional array.
		deserialize(json, obj)

		# Example lookup. Numerically indexed arrays start at 1, not 0.
		# print obj[1]["name"]

		# Write the structure back out as JSON, one tab per indent level.
		if (isarray(obj)) {
			print serialize(obj, "\t")
		}
	}

	# === FUNCTIONS ===

	function join(arr, sep, _p, i) {
		# syntax: join(array, string separator)
		# returns a string: every element of arr, visited in "for (i in arr)" order
		# (so PROCINFO["sorted_in"] applies), with sep between neighbours.
		# sep may be omitted, in which case the elements are simply concatenated.
		# _p and i are locals.

		for (i in arr) {
			# Track "an element was already emitted" with an explicit flag instead of
			# inspecting the text built so far; that way empty or non-printable
			# leading elements are kept rather than silently discarded.
			if (_p["started"]) {
				_p["result"] = _p["result"] sep arr[i]
			} else {
				_p["result"] = arr[i]
				_p["started"] = 1
			}
		}
		return _p["result"]
	}

	function quote(str) {
		# syntax: quote(string)
		# returns str wrapped in double quotes, with backslash, double quote, CR, LF,
		# tab, form feed and backspace replaced by their JSON escape sequences

		# Backslashes go first so that the backslashes introduced by the later
		# substitutions are not doubled again.
		gsub(/\\/, "\\\\", str)
		# An unescaped double quote would terminate the JSON string early.
		gsub(/"/, "\\\"", str)
		gsub(/\r/, "\\r", str)
		gsub(/\n/, "\\n", str)
		gsub(/\t/, "\\t", str)
		gsub(/\f/, "\\f", str)
		gsub(/\b/, "\\b", str)
		return "\"" str "\""
	}

	function serialize(arr, indent_with, depth, _p, i, idx) {
		# syntax: serialize(array of arrays, indent string)
		# returns a JSON formatted string
		#
		#   arr          array to render; elements that are arrays are rendered recursively
		#   indent_with  string emitted once per nesting level in front of object members
		#   depth        nesting level handed down by the recursive calls; callers omit it
		#   _p, i, idx   locals
		#
		# An array whose keys are exactly 1..n becomes [...]; any other key set becomes {...}.
		# Scalar values are copied before quoting, so arr itself is left unmodified.
		# Side effect: PROCINFO["sorted_in"] is set to "@ind_num_asc" when it has no value.

		# sort arrays on key, ensures [...] values remain properly ordered
		if (!PROCINFO["sorted_in"]) PROCINFO["sorted_in"] = "@ind_num_asc"

		# determine whether array is indexed or associative: count 1, 2, ... once per
		# element and flag the array as associative when one of those numbers is not a key
		for (i in arr) {
			_p["assoc"] = or(_p["assoc"], !(++_p["idx"] in arr))
		}

		# if associative, indent. After the loop "indent" holds one copy of indent_with
		# per level including this one (used for members) and "end" holds one copy
		# fewer (used for the closing brace).
		if (_p["assoc"]) {
			for (i = ++depth; i--;) {
				_p["end"] = _p["indent"]; _p["indent"] = _p["indent"] indent_with
			}
		}

		for (i in arr) {
			# If key length is 0, assume it is an empty object
			if (!length(i)) return "{}"

			# quote key if not already quoted
			_p["key"] = i !~ /^".*"$/ ? quote(i) : i

			if (isarray(arr[i])) {
				if (_p["assoc"]) {
					_p["json"][++idx] = _p["indent"] _p["key"] ": " \
						serialize(arr[i], indent_with, depth)
				} else {
					# if indexed array, don't print keys
					_p["json"][++idx] = serialize(arr[i], indent_with, depth)
				}
			} else {
				# Work on a copy so the caller's array is not rewritten.
				_p["val"] = arr[i]

				# Quote unless the value is a JSON number, one of the literals
				# true / false / null, or already wrapped in double quotes. Values
				# longer than 1000 characters are always quoted. The alternation
				# is parenthesised so that ^ and $ apply to every alternative;
				# otherwise any value merely containing "false" or "null" would be
				# emitted bare.
				if (!(_p["val"] ~ /^-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][+-]?[0-9]+)?$/ ||
					_p["val"] ~ /^(true|false|null|".*")$/) || length(_p["val"]) > 1000)
					_p["val"] = quote(_p["val"])

				_p["json"][++idx] = _p["assoc"] ? _p["indent"] _p["key"] ": " _p["val"] : _p["val"]
			}
		}

		# Nothing was collected: render an empty list without handing a
		# never-assigned _p["json"] to join().
		if (!idx) return "[]"

		# gawk cannot distinguish between a value of null and no value, so an indexed
		# array whose only content is one empty string is rendered as an empty list.
		# Known limitation: [""] therefore becomes [].
		if (!_p["assoc"] && join(_p["json"]) == "\"\"") return "[]"

		# surround with curly braces if object, square brackets if array
		return _p["assoc"] ? "{\n" join(_p["json"], ",\n") "\n" _p["end"] "}" \
			: "[" join(_p["json"], ", ") "]"
	}

	function trim(str) {
		# syntax: trim(string)
		# returns str with leading and trailing whitespace removed
		#
		# The alternation must be a bare "|": written as "\|" the pattern asks
		# for a literal pipe character between two whitespace runs and nothing
		# is ever trimmed.
		gsub(/^\s+|\s+$/, "", str)
		return str
	}

	function unquote(str, _p) {
		# syntax: unquote(JSON token)
		# returns the token's value: whitespace around the token is dropped, the
		# surrounding quotes are removed and backslash escapes are decoded.
		# _p is a local.
		#
		# Decoded escapes: \" \\ \/ \b \f \n \r \t. Any other backslash sequence
		# (for example \uXXXX) is copied through unchanged.

		# Whitespace outside the quotes is not part of the value. Whitespace
		# inside the quotes is kept.
		gsub(/^\s+|\s+$/, "", str)

		# The alternations use a bare "|" so that a leading or a trailing quote
		# is matched; "\|" would look for a literal pipe character instead.
		gsub(/^'|'$/, "", str)
		gsub(/^"|"$/, "", str)

		# Decode in one left-to-right pass so that every backslash is consumed
		# together with the character it escapes; running one substitution per
		# escape would misread an escaped backslash followed by "n" as a newline.
		while ((_p["at"] = index(str, "\\")) > 0) {
			_p["out"] = _p["out"] substr(str, 1, _p["at"] - 1)
			_p["esc"] = substr(str, _p["at"] + 1, 1)

			if (_p["esc"] == "n") _p["out"] = _p["out"] "\n"
			else if (_p["esc"] == "r") _p["out"] = _p["out"] "\r"
			else if (_p["esc"] == "t") _p["out"] = _p["out"] "\t"
			else if (_p["esc"] == "b") _p["out"] = _p["out"] "\b"
			else if (_p["esc"] == "f") _p["out"] = _p["out"] "\f"
			else if (_p["esc"] == "\"" || _p["esc"] == "\\" || _p["esc"] == "/")
				_p["out"] = _p["out"] _p["esc"]
			else
				_p["out"] = _p["out"] "\\" _p["esc"]

			str = substr(str, _p["at"] + 2)
		}
		return _p["out"] str
	}

	function tokenize(str, arr, splitchar, _p) {
		# syntax: tokenize(JSON-formatted string, array to populate, char to split on)
		# Splits str on every splitchar that sits outside brackets, braces and
		# double-quoted text, and stores the trimmed pieces in arr[1], arr[2], ...
		# returns the number of pieces stored
		# Used instead of FPAT / patsplit, which cannot reliably group mated
		# bracket pairs.
		#
		# State kept in _p: "[" and "{" are open-bracket / open-brace counters,
		# "\"" is the inside-a-string flag, "\\" is the "previous character was an
		# unescaped backslash" flag.

		_p["len"] = length(str)

		for (_p["pos"] = 1; _p["pos"] <= _p["len"]; _p["pos"]++) {

			_p["char"] = substr(str, _p["pos"], 1)

			if (_p["char"] == "\\") {
				# a backslash toggles the escape flag
				_p["\\"] = !_p["\\"]
			} else {
				if (!_p["\\"]) {
					if (_p["char"] == "\"") {
						_p["\""] = !_p["\""]
					} else if (!_p["\""]) {
						if (_p["char"] == "[") _p["["]++
						else if (_p["char"] == "]") _p["["]--
						else if (_p["char"] == "{") _p["{"]++
						else if (_p["char"] == "}") _p["{"]--
					}
				}
				# any character other than a backslash ends the escape
				_p["\\"] = 0
			}

			if (_p["char"] == splitchar && !_p["["] && !_p["{"] && !_p["\""] && !_p["\\"]) {
				# top-level separator: close the current piece
				arr[++_p["idx"]] = trim(_p["segment"])
				_p["segment"] = ""
			} else {
				_p["segment"] = _p["segment"] _p["char"]
			}
		}

		# whatever follows the last separator is the final piece
		arr[++_p["idx"]] = trim(_p["segment"])
		return _p["idx"]
	}

	function deserialize(json, arr, _p, _parts, _values, _keyval, i, j) {
		# syntax: deserialize (JSON-formatted string, array to populate)
		# Fills arr as a true multidimensional array (arr[idx][idx][etc...]),
		# not a concatenated index (arr[idx,idx,etc...]).
		# Everything after arr in the parameter list is a local.

		# Peel off the outer bracket / brace pair with substr();
		# match() failed here with very large JSON data.
		json = trim(json)
		_parts[1] = substr(json, 1, 1)
		_parts[2] = substr(json, 2, length(json) - 2)

		# split the interior on unbracketed, unquoted commas
		_p["outie"] = tokenize(trim(_parts[2]), _values, ",")

		# the opening character decides between object and list handling
		_p["is_obj"] = (_parts[1] ~ "{")

		for (i = 1; i <= _p["outie"]; i++) {

			if (!_p["is_obj"]) {
				# numerically indexed array: advance to the next unused index
				while (++_p["idx"] in arr) {}

				_p["item"] = trim(_values[i])

				if (_p["item"] ~ /^[\[\{]/) {
					# nested container: create the element as an array first
					# (it would default to a scalar otherwise), then recurse
					arr[_p["idx"]][0]; delete arr[_p["idx"]][0]
					deserialize(_p["item"], arr[_p["idx"]])
				} else {
					arr[_p["idx"]] = unquote(_values[i])
				}
				continue
			}

			# associative array: split the member on unbracketed, unquoted colons
			_p["innie"] = tokenize(trim(_values[i]), _keyval, ":")

			for (j = 1; j <= _p["innie"]; j += 2) {
				_p["key"] = unquote(_keyval[j])

				if (trim(_keyval[j+1]) ~ /^[\[\{]/) {
					# nested container: create the element as an array first
					# (it would default to a scalar otherwise), then recurse
					arr[_p["key"]][0]; delete arr[_p["key"]][0]
					deserialize(_keyval[j+1], arr[_p["key"]])
				} else {
					arr[_p["key"]] = unquote(_keyval[j+1])
				}
			}
		}
	}