Skip to content

Instantly share code, notes, and snippets.

@petersen-poul
Last active May 12, 2016 16:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save petersen-poul/c4f974773f460051ba8cbcaf42eb914f to your computer and use it in GitHub Desktop.
Save petersen-poul/c4f974773f460051ba8cbcaf42eb914f to your computer and use it in GitHub Desktop.
{
"name": "Simple JSON key/val extraction",
"description": "Given a dataset field containing JSON documents and a key, this WhizzML script creates a new feature with the JSON values. This is a hack and *NOT* a valid JSON parser",
"inputs": [
{
"name": "dataset-in",
"type": "dataset-id",
"description": "Dataset to transform by extracting JSON values."
},
{
"name": "json-col",
"type": "string",
"description": "The name of the column/feature in the dataset which contains JSON records."
},
{
"name": "key",
"type": "string",
"description": "The JSON key to extract from the records. The new values will be placed into a field named {json-col}.{key}."
}
],
"outputs": [
{
"name": "dataset-out",
"type": "dataset-id",
"description": "The extended dataset."
}
]
}
; This is NOT a valid JSON parser and will not work for everything. In
; particular, the behavior with multi-level keys will be "surprising".
; For example, in the record:
;
; { "foo": { "bar": { 1:2 }, "baz": {3:4} } }
;
; The value {1:2} can be extracted with only the key "bar" - you do not
; need to specify a multi-level key including "foo". That might be kind
; of neat, but trying to extract "foo" from a similar record with more
; subkeys will give you a broken result if there are more than 9 subkeys.
; Or a list of lists with more than 9 sublists, etc.
;; json-extract: creates a new dataset from `dataset` with one extra field,
;; named "<json-col>.<key>", holding the value found for `key` in the JSON
;; text stored in the field `json-col`.
;;
;;   dataset   id of the dataset to extend
;;   json-col  name of the field holding the JSON text
;;   key       JSON key to look up; it is spliced into a regular expression
;;             verbatim, so regex metacharacters in it are NOT escaped
;;
;; Returns the result of `create-and-wait-dataset` for the extended dataset.
;;
;; This is regex matching, not JSON parsing: nested maps/lists are cut at the
;; first closing brace/bracket and escaped quotes inside strings are not
;; understood.
(define (json-extract dataset json-col key)
  (let (;; Matches the whole record up to and including `key` and its colon,
        ;; capturing everything to the right of it. The leading `.*` is
        ;; greedy, so the last occurrence of the key in the record wins.
        re-key (str ".*[\"\\s,\\{]" key "[\"\\s]*:\\s*(.*)")
        ;; Every replacement keeps only the first capture group.
        re-to "$1"
        ;; Map value, key:{ ... } - up to the first closing brace. `*`
        ;; (rather than `+`) lets an empty map "{}" match at the start.
        re-brace "(\\{[^\\}]*\\}).*"
        ;; List value, key:[ ... ] - up to the first closing bracket.
        re-brack "(\\[[^\\]]*\\]).*"
        ;; Quoted value, including the empty string "".
        re-quote "(\"[^\"]*\").*"
        ;; Bare value (number, true, false, null): everything up to the first
        ;; comma, closing brace/bracket or whitespace. A greedy "(.*)" here
        ;; would run to the LAST comma of the record and would leave the
        ;; final value of a record with its trailing "}".
        re-bare "([^,\\}\\]\\s]*).*"
        ;; Single characters are bound here and interpolated, which works
        ;; around the difficulty of escaping them inside flatline strings.
        lbrace "{"
        lbrack "["
        lquote "\"")
    (create-and-wait-dataset
      {"origin_dataset" dataset
       "new_fields"
       [{"name" (str json-col "." key)
         ;; `right` is the text following the key; when the key is absent
         ;; the replace is a no-op, `orig` equals `right`, and the `if` has
         ;; no else branch. Otherwise the first character of `right`
         ;; selects which extraction regex to apply.
         "field" (flatline "(let ("
                           "orig (field {{json-col}}) "
                           "right (replace orig {{re-key}} {{re-to}}) "
                           "char (subs right 0 1))"
                           "( if ( != orig right ) (cond "
                           "(= char {{lbrace}})"
                           "(replace right {{re-brace}} {{re-to}})"
                           "(= char {{lbrack}})"
                           "(replace right {{re-brack}} {{re-to}})"
                           "(= char {{lquote}})"
                           "(replace right {{re-quote}} {{re-to}})"
                           "(replace right {{re-bare}} {{re-to}}))))")}]})))
;; Script output: the dataset extended with the extracted "<json-col>.<key>"
;; field, built from the script inputs dataset-in, json-col and key.
(define dataset-out
  (json-extract dataset-in
                json-col
                key))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment