Last active
May 12, 2016 16:48
-
-
Save petersen-poul/504c62ceaace76227cc6d8e0c5f1704b to your computer and use it in GitHub Desktop.
JSON key/val extraction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "JSON key/val extraction", | |
"description": "Given a dataset field containing JSON documents and a key, this WhizzML script creates a new feature with the JSON values. This is a hack and *NOT* a valid JSON parser", | |
"inputs": [ | |
{ | |
"name": "dataset-in", | |
"type": "dataset-id", | |
"description": "Dataset to transform by extracting JSON values." | |
}, | |
{ | |
"name": "json-col", | |
"type": "string", | |
"description": "The name of the column/feature in the dataset which contains JSON records." | |
}, | |
{ | |
"name": "key", | |
"type": "string", | |
"description": "The JSON key to extract from the records. The new values will be placed into a field name {json-col}.{key}." | |
} | |
], | |
"outputs": [ | |
{ | |
"name": "dataset-out", | |
"type": "dataset-id", | |
"description": "The extended dataset." | |
} | |
] | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; This is NOT a valid JSON parser and will not work for everything. In | |
; particular, the behavior with multi-level keys will be "surprising". | |
; For example, in the record: | |
; | |
; { "foo": { "bar": { 1:2 }, "baz": {3:4} } | |
; | |
; The value {1:2} can be extracted with only the key "bar" - you do not | |
; need to specify a multi-level key including "foo". That might be kind | |
; of neat, but trying to extract "foo" from a similar record with more | |
; subkeys will give you a broken result if there are more than 9 subkeys. | |
; Or a list of lists with more than 9 sublists, etc. | |
(define (json-extract dataset json-col key) | |
(let ( | |
; This regex splits the JSON string at the key, leaving everything to the right | |
re-key (str ".*[\"\\s,\\{]" key "[\"\\s]*:\\s*(.*)") | |
re-to "$1" | |
; This regex extracts map values, that is key:{ ... } | |
re-1brace "(\\{([^\\}]+\\}){1}).*" | |
re-2brace "(\\{([^\\}]+\\}){2}).*" | |
re-3brace "(\\{([^\\}]+\\}){3}).*" | |
re-4brace "(\\{([^\\}]+\\}){4}).*" | |
re-5brace "(\\{([^\\}]+\\}){5}).*" | |
re-6brace "(\\{([^\\}]+\\}){6}).*" | |
re-7brace "(\\{([^\\}]+\\}){7}).*" | |
re-8brace "(\\{([^\\}]+\\}){8}).*" | |
re-9brace "(\\{([^\\}]+\\}){9}).*" | |
; This regex extracts lists, key:[ ... ] | |
re-1brack "(\\[([^\\]]+\\]){1}).*" | |
re-2brack "(\\[([^\\]]+\\]){2}).*" | |
re-3brack "(\\[([^\\]]+\\]){3}).*" | |
re-4brack "(\\[([^\\]]+\\]){4}).*" | |
re-5brack "(\\[([^\\]]+\\]){5}).*" | |
re-6brack "(\\[([^\\]]+\\]){6}).*" | |
re-7brack "(\\[([^\\]]+\\]){7}).*" | |
re-8brack "(\\[([^\\]]+\\]){8}).*" | |
re-9brack "(\\[([^\\]]+\\]){9}).*" | |
; Extracts quoted values | |
re-lquote "(\"[^\"]+\").*" | |
; Extracts numbers | |
re-space "(.*)\\s*,.*" | |
; We need these to work around the difficulty of escaping in flatline | |
re-lbrace "\\{" | |
lbrace "{" | |
rbrace "}" | |
re-rbrace "\\}" | |
lbrack "[" | |
re-lbrack "\\[" | |
rbrack "]" | |
re-rbrack "\\]" | |
lquote "\"") | |
(create-and-wait-dataset { | |
"origin_dataset" dataset | |
"new_fields" [ { | |
"name" (str json-col "." key) | |
"field" (flatline "(let (" | |
"orig (field {{json-col}}) " | |
"right (replace orig {{re-key}} {{re-to}}) " | |
"char (subs right 0 1))" | |
"( if ( != orig right ) (cond " | |
"(= char {{lbrace}}) (let (" | |
"s-1brace (replace right {{re-1brace}} {{re-to}})" | |
"s-1brace-lcount (- (length s-1brace) (length (replace s-1brace {{re-lbrace}} \"\")))" | |
"s-1brace-rcount (- (length s-1brace) (length (replace s-1brace {{re-rbrace}} \"\")))" | |
"s-2brace (replace right {{re-2brace}} {{re-to}})" | |
"s-2brace-lcount (- (length s-2brace) (length (replace s-2brace {{re-lbrace}} \"\")))" | |
"s-2brace-rcount (- (length s-2brace) (length (replace s-2brace {{re-rbrace}} \"\")))" | |
"s-3brace (replace right {{re-3brace}} {{re-to}})" | |
"s-3brace-lcount (- (length s-3brace) (length (replace s-3brace {{re-lbrace}} \"\")))" | |
"s-3brace-rcount (- (length s-3brace) (length (replace s-3brace {{re-rbrace}} \"\")))" | |
"s-4brace (replace right {{re-4brace}} {{re-to}})" | |
"s-4brace-lcount (- (length s-4brace) (length (replace s-4brace {{re-lbrace}} \"\")))" | |
"s-4brace-rcount (- (length s-4brace) (length (replace s-4brace {{re-rbrace}} \"\")))" | |
"s-5brace (replace right {{re-5brace}} {{re-to}})" | |
"s-5brace-lcount (- (length s-5brace) (length (replace s-5brace {{re-lbrace}} \"\")))" | |
"s-5brace-rcount (- (length s-5brace) (length (replace s-5brace {{re-rbrace}} \"\")))" | |
"s-6brace (replace right {{re-6brace}} {{re-to}})" | |
"s-6brace-lcount (- (length s-6brace) (length (replace s-6brace {{re-lbrace}} \"\")))" | |
"s-6brace-rcount (- (length s-6brace) (length (replace s-6brace {{re-rbrace}} \"\")))" | |
"s-7brace (replace right {{re-7brace}} {{re-to}})" | |
"s-7brace-lcount (- (length s-7brace) (length (replace s-7brace {{re-lbrace}} \"\")))" | |
"s-7brace-rcount (- (length s-7brace) (length (replace s-7brace {{re-rbrace}} \"\")))" | |
"s-8brace (replace right {{re-8brace}} {{re-to}})" | |
"s-8brace-lcount (- (length s-8brace) (length (replace s-8brace {{re-lbrace}} \"\")))" | |
"s-8brace-rcount (- (length s-8brace) (length (replace s-8brace {{re-rbrace}} \"\")))" | |
"s-9brace (replace right {{re-9brace}} {{re-to}})" | |
"s-9brace-lcount (- (length s-9brace) (length (replace s-9brace {{re-lbrace}} \"\")))" | |
"s-9brace-rcount (- (length s-9brace) (length (replace s-9brace {{re-rbrace}} \"\"))))" | |
"(cond " | |
"(= s-1brace-lcount s-1brace-rcount ) s-1brace" | |
"(= s-2brace-lcount s-2brace-rcount ) s-2brace" | |
"(= s-3brace-lcount s-3brace-rcount ) s-3brace" | |
"(= s-4brace-lcount s-4brace-rcount ) s-4brace" | |
"(= s-5brace-lcount s-5brace-rcount ) s-5brace" | |
"(= s-6brace-lcount s-6brace-rcount ) s-6brace" | |
"(= s-7brace-lcount s-7brace-rcount ) s-7brace" | |
"(= s-8brace-lcount s-8brace-rcount ) s-8brace" | |
"(= s-9brace-lcount s-9brace-rcount ) s-9brace" | |
"FAIL))" | |
"(= char {{lbrack}}) (let (" | |
"s-1brack (replace right {{re-1brack}} {{re-to}})" | |
"s-1brack-lcount (- (length s-1brack) (length (replace s-1brack {{re-lbrack}} \"\")))" | |
"s-1brack-rcount (- (length s-1brack) (length (replace s-1brack {{re-rbrack}} \"\")))" | |
"s-2brack (replace right {{re-2brack}} {{re-to}})" | |
"s-2brack-lcount (- (length s-2brack) (length (replace s-2brack {{re-lbrack}} \"\")))" | |
"s-2brack-rcount (- (length s-2brack) (length (replace s-2brack {{re-rbrack}} \"\")))" | |
"s-3brack (replace right {{re-3brack}} {{re-to}})" | |
"s-3brack-lcount (- (length s-3brack) (length (replace s-3brack {{re-lbrack}} \"\")))" | |
"s-3brack-rcount (- (length s-3brack) (length (replace s-3brack {{re-rbrack}} \"\")))" | |
"s-4brack (replace right {{re-4brack}} {{re-to}})" | |
"s-4brack-lcount (- (length s-4brack) (length (replace s-4brack {{re-lbrack}} \"\")))" | |
"s-4brack-rcount (- (length s-4brack) (length (replace s-4brack {{re-rbrack}} \"\")))" | |
"s-5brack (replace right {{re-5brack}} {{re-to}})" | |
"s-5brack-lcount (- (length s-5brack) (length (replace s-5brack {{re-lbrack}} \"\")))" | |
"s-5brack-rcount (- (length s-5brack) (length (replace s-5brack {{re-rbrack}} \"\")))" | |
"s-6brack (replace right {{re-6brack}} {{re-to}})" | |
"s-6brack-lcount (- (length s-6brack) (length (replace s-6brack {{re-lbrack}} \"\")))" | |
"s-6brack-rcount (- (length s-6brack) (length (replace s-6brack {{re-rbrack}} \"\")))" | |
"s-7brack (replace right {{re-7brack}} {{re-to}})" | |
"s-7brack-lcount (- (length s-7brack) (length (replace s-7brack {{re-lbrack}} \"\")))" | |
"s-7brack-rcount (- (length s-7brack) (length (replace s-7brack {{re-rbrack}} \"\")))" | |
"s-8brack (replace right {{re-8brack}} {{re-to}})" | |
"s-8brack-lcount (- (length s-8brack) (length (replace s-8brack {{re-lbrack}} \"\")))" | |
"s-8brack-rcount (- (length s-8brack) (length (replace s-8brack {{re-rbrack}} \"\")))" | |
"s-9brack (replace right {{re-9brack}} {{re-to}})" | |
"s-9brack-lcount (- (length s-9brack) (length (replace s-9brack {{re-lbrack}} \"\")))" | |
"s-9brack-rcount (- (length s-9brack) (length (replace s-9brack {{re-rbrack}} \"\"))))" | |
"(cond " | |
"(= s-1brack-lcount s-1brack-rcount ) s-1brack" | |
"(= s-2brack-lcount s-2brack-rcount ) s-2brack" | |
"(= s-3brack-lcount s-3brack-rcount ) s-3brack" | |
"(= s-4brack-lcount s-4brack-rcount ) s-4brack" | |
"(= s-5brack-lcount s-5brack-rcount ) s-5brack" | |
"(= s-6brack-lcount s-6brack-rcount ) s-6brack" | |
"(= s-7brack-lcount s-7brack-rcount ) s-7brack" | |
"(= s-8brack-lcount s-8brack-rcount ) s-8brack" | |
"(= s-9brack-lcount s-9brack-rcount ) s-9brack" | |
"FAIL))" | |
"(= char {{lquote}}) (replace right {{re-lquote}} {{re-to}})" | |
"(replace right {{re-space}} {{re-to}})) ""))") | |
}]}))) | |
(define dataset-out (json-extract dataset-in json-col key)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment