Skip to content

Instantly share code, notes, and snippets.

@petersen-poul
Last active May 12, 2016 16:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save petersen-poul/504c62ceaace76227cc6d8e0c5f1704b to your computer and use it in GitHub Desktop.
Save petersen-poul/504c62ceaace76227cc6d8e0c5f1704b to your computer and use it in GitHub Desktop.
JSON key/val extraction
{
"name": "JSON key/val extraction",
"description": "Given a dataset field containing JSON documents and a key, this WhizzML script creates a new feature with the JSON values. This is a hack and *NOT* a valid JSON parser",
"inputs": [
{
"name": "dataset-in",
"type": "dataset-id",
"description": "Dataset to transform by extracting JSON values."
},
{
"name": "json-col",
"type": "string",
"description": "The name of the column/feature in the dataset which contains JSON records."
},
{
"name": "key",
"type": "string",
"description": "The JSON key to extract from the records. The new values will be placed into a field name {json-col}.{key}."
}
],
"outputs": [
{
"name": "dataset-out",
"type": "dataset-id",
"description": "The extended dataset."
}
]
}
; This is NOT a valid JSON parser and will not work for everything. In
; particular, the behavior with multi-level keys will be "surprising".
; For example, in the record:
;
; { "foo": { "bar": { 1:2 }, "baz": {3:4} }
;
; The value {1:2} can be extracted with only the key "bar" - you do not
; need to specify a multi-level key including "foo". That might be kind
; of neat, but trying to extract "foo" from a similar record with more
; subkeys will give you a broken result if there are more than 9 subkeys.
; Or a list of lists with more than 9 sublists, etc.
(define (json-extract dataset json-col key)
(let (
; This regex splits the JSON string at the key, leaving everything to the right
re-key (str ".*[\"\\s,\\{]" key "[\"\\s]*:\\s*(.*)")
re-to "$1"
; This regex extracts map values, that is key:{ ... }
re-1brace "(\\{([^\\}]+\\}){1}).*"
re-2brace "(\\{([^\\}]+\\}){2}).*"
re-3brace "(\\{([^\\}]+\\}){3}).*"
re-4brace "(\\{([^\\}]+\\}){4}).*"
re-5brace "(\\{([^\\}]+\\}){5}).*"
re-6brace "(\\{([^\\}]+\\}){6}).*"
re-7brace "(\\{([^\\}]+\\}){7}).*"
re-8brace "(\\{([^\\}]+\\}){8}).*"
re-9brace "(\\{([^\\}]+\\}){9}).*"
; This regex extracts lists, key:[ ... ]
re-1brack "(\\[([^\\]]+\\]){1}).*"
re-2brack "(\\[([^\\]]+\\]){2}).*"
re-3brack "(\\[([^\\]]+\\]){3}).*"
re-4brack "(\\[([^\\]]+\\]){4}).*"
re-5brack "(\\[([^\\]]+\\]){5}).*"
re-6brack "(\\[([^\\]]+\\]){6}).*"
re-7brack "(\\[([^\\]]+\\]){7}).*"
re-8brack "(\\[([^\\]]+\\]){8}).*"
re-9brack "(\\[([^\\]]+\\]){9}).*"
; Extracts quoted values
re-lquote "(\"[^\"]+\").*"
; Extracts numbers
re-space "(.*)\\s*,.*"
; We need these to work around the difficulty of escaping in flatline
re-lbrace "\\{"
lbrace "{"
rbrace "}"
re-rbrace "\\}"
lbrack "["
re-lbrack "\\["
rbrack "]"
re-rbrack "\\]"
lquote "\"")
(create-and-wait-dataset {
"origin_dataset" dataset
"new_fields" [ {
"name" (str json-col "." key)
"field" (flatline "(let ("
"orig (field {{json-col}}) "
"right (replace orig {{re-key}} {{re-to}}) "
"char (subs right 0 1))"
"( if ( != orig right ) (cond "
"(= char {{lbrace}}) (let ("
"s-1brace (replace right {{re-1brace}} {{re-to}})"
"s-1brace-lcount (- (length s-1brace) (length (replace s-1brace {{re-lbrace}} \"\")))"
"s-1brace-rcount (- (length s-1brace) (length (replace s-1brace {{re-rbrace}} \"\")))"
"s-2brace (replace right {{re-2brace}} {{re-to}})"
"s-2brace-lcount (- (length s-2brace) (length (replace s-2brace {{re-lbrace}} \"\")))"
"s-2brace-rcount (- (length s-2brace) (length (replace s-2brace {{re-rbrace}} \"\")))"
"s-3brace (replace right {{re-3brace}} {{re-to}})"
"s-3brace-lcount (- (length s-3brace) (length (replace s-3brace {{re-lbrace}} \"\")))"
"s-3brace-rcount (- (length s-3brace) (length (replace s-3brace {{re-rbrace}} \"\")))"
"s-4brace (replace right {{re-4brace}} {{re-to}})"
"s-4brace-lcount (- (length s-4brace) (length (replace s-4brace {{re-lbrace}} \"\")))"
"s-4brace-rcount (- (length s-4brace) (length (replace s-4brace {{re-rbrace}} \"\")))"
"s-5brace (replace right {{re-5brace}} {{re-to}})"
"s-5brace-lcount (- (length s-5brace) (length (replace s-5brace {{re-lbrace}} \"\")))"
"s-5brace-rcount (- (length s-5brace) (length (replace s-5brace {{re-rbrace}} \"\")))"
"s-6brace (replace right {{re-6brace}} {{re-to}})"
"s-6brace-lcount (- (length s-6brace) (length (replace s-6brace {{re-lbrace}} \"\")))"
"s-6brace-rcount (- (length s-6brace) (length (replace s-6brace {{re-rbrace}} \"\")))"
"s-7brace (replace right {{re-7brace}} {{re-to}})"
"s-7brace-lcount (- (length s-7brace) (length (replace s-7brace {{re-lbrace}} \"\")))"
"s-7brace-rcount (- (length s-7brace) (length (replace s-7brace {{re-rbrace}} \"\")))"
"s-8brace (replace right {{re-8brace}} {{re-to}})"
"s-8brace-lcount (- (length s-8brace) (length (replace s-8brace {{re-lbrace}} \"\")))"
"s-8brace-rcount (- (length s-8brace) (length (replace s-8brace {{re-rbrace}} \"\")))"
"s-9brace (replace right {{re-9brace}} {{re-to}})"
"s-9brace-lcount (- (length s-9brace) (length (replace s-9brace {{re-lbrace}} \"\")))"
"s-9brace-rcount (- (length s-9brace) (length (replace s-9brace {{re-rbrace}} \"\"))))"
"(cond "
"(= s-1brace-lcount s-1brace-rcount ) s-1brace"
"(= s-2brace-lcount s-2brace-rcount ) s-2brace"
"(= s-3brace-lcount s-3brace-rcount ) s-3brace"
"(= s-4brace-lcount s-4brace-rcount ) s-4brace"
"(= s-5brace-lcount s-5brace-rcount ) s-5brace"
"(= s-6brace-lcount s-6brace-rcount ) s-6brace"
"(= s-7brace-lcount s-7brace-rcount ) s-7brace"
"(= s-8brace-lcount s-8brace-rcount ) s-8brace"
"(= s-9brace-lcount s-9brace-rcount ) s-9brace"
"FAIL))"
"(= char {{lbrack}}) (let ("
"s-1brack (replace right {{re-1brack}} {{re-to}})"
"s-1brack-lcount (- (length s-1brack) (length (replace s-1brack {{re-lbrack}} \"\")))"
"s-1brack-rcount (- (length s-1brack) (length (replace s-1brack {{re-rbrack}} \"\")))"
"s-2brack (replace right {{re-2brack}} {{re-to}})"
"s-2brack-lcount (- (length s-2brack) (length (replace s-2brack {{re-lbrack}} \"\")))"
"s-2brack-rcount (- (length s-2brack) (length (replace s-2brack {{re-rbrack}} \"\")))"
"s-3brack (replace right {{re-3brack}} {{re-to}})"
"s-3brack-lcount (- (length s-3brack) (length (replace s-3brack {{re-lbrack}} \"\")))"
"s-3brack-rcount (- (length s-3brack) (length (replace s-3brack {{re-rbrack}} \"\")))"
"s-4brack (replace right {{re-4brack}} {{re-to}})"
"s-4brack-lcount (- (length s-4brack) (length (replace s-4brack {{re-lbrack}} \"\")))"
"s-4brack-rcount (- (length s-4brack) (length (replace s-4brack {{re-rbrack}} \"\")))"
"s-5brack (replace right {{re-5brack}} {{re-to}})"
"s-5brack-lcount (- (length s-5brack) (length (replace s-5brack {{re-lbrack}} \"\")))"
"s-5brack-rcount (- (length s-5brack) (length (replace s-5brack {{re-rbrack}} \"\")))"
"s-6brack (replace right {{re-6brack}} {{re-to}})"
"s-6brack-lcount (- (length s-6brack) (length (replace s-6brack {{re-lbrack}} \"\")))"
"s-6brack-rcount (- (length s-6brack) (length (replace s-6brack {{re-rbrack}} \"\")))"
"s-7brack (replace right {{re-7brack}} {{re-to}})"
"s-7brack-lcount (- (length s-7brack) (length (replace s-7brack {{re-lbrack}} \"\")))"
"s-7brack-rcount (- (length s-7brack) (length (replace s-7brack {{re-rbrack}} \"\")))"
"s-8brack (replace right {{re-8brack}} {{re-to}})"
"s-8brack-lcount (- (length s-8brack) (length (replace s-8brack {{re-lbrack}} \"\")))"
"s-8brack-rcount (- (length s-8brack) (length (replace s-8brack {{re-rbrack}} \"\")))"
"s-9brack (replace right {{re-9brack}} {{re-to}})"
"s-9brack-lcount (- (length s-9brack) (length (replace s-9brack {{re-lbrack}} \"\")))"
"s-9brack-rcount (- (length s-9brack) (length (replace s-9brack {{re-rbrack}} \"\"))))"
"(cond "
"(= s-1brack-lcount s-1brack-rcount ) s-1brack"
"(= s-2brack-lcount s-2brack-rcount ) s-2brack"
"(= s-3brack-lcount s-3brack-rcount ) s-3brack"
"(= s-4brack-lcount s-4brack-rcount ) s-4brack"
"(= s-5brack-lcount s-5brack-rcount ) s-5brack"
"(= s-6brack-lcount s-6brack-rcount ) s-6brack"
"(= s-7brack-lcount s-7brack-rcount ) s-7brack"
"(= s-8brack-lcount s-8brack-rcount ) s-8brack"
"(= s-9brack-lcount s-9brack-rcount ) s-9brack"
"FAIL))"
"(= char {{lquote}}) (replace right {{re-lquote}} {{re-to}})"
"(replace right {{re-space}} {{re-to}})) ""))")
}]})))
(define dataset-out (json-extract dataset-in json-col key))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment