Skip to content

Instantly share code, notes, and snippets.

@jaor
Last active December 22, 2017 02:39
Show Gist options
  • Save jaor/672d1aacb8599f1b3ca54d9eb20f6b0c to your computer and use it in GitHub Desktop.
Save jaor/672d1aacb8599f1b3ca54d9eb20f6b0c to your computer and use it in GitHub Desktop.
Remove text field terms
{
"name": "Prune terms",
"kind": "script",
"description": "Removes from a text field all entries with low coverage",
"source_code": "script.whizzml",
"inputs":[
{
"name": "dataset-id",
"type": "dataset-id",
"description": "The original dataset"
},
{
"name": "field",
"type": "string",
"description": "The field id or name to prune"
},
{
"name": "threshold",
"type": "number",
"description": "The minimum number of occurrences to keep a category",
"default": 10
},
{
"name": "low-name",
"type": "string",
"description": "Name of the category for low-coverage terms",
"default": "low-coverage"
}
],
"outputs":[
{
"name": "result",
"type": "dataset-id",
"description": "output dataset with pruned field"
}]
}
;; Builds the Flatline expression used to rewrite a text field so that only
;; terms with enough occurrences are kept.
;;
;; Arguments:
;;   fd       - field descriptor map; its "id" entry is read here, and the
;;              whole map is handed to `text-field?` and `field-distribution`.
;;   low-name - value the generated expression falls back to when the field
;;              value equals none of the kept terms.
;;
;; Also reads `threshold`, which is NOT a parameter of this function; the
;; script metadata declares an input with that name.
;;
;; Raises "Specified field is not a text field" when `text-field?` rejects fd.
;;
;; Each distribution entry is indexed as a sequence whose head is the term and
;; whose element at position 1 is compared with `threshold`. The threshold is
;; documented as the *minimum* number of occurrences needed to keep a term, so
;; the comparison is inclusive (>=): a term seen exactly `threshold` times is
;; kept rather than pruned.
;;
;; NOTE(review): the generated expression compares the full field value `x`
;; with each kept term -- confirm that is the intended matching for text
;; fields.
(define (fl-trans fd low-name)
  (when (not (text-field? fd))
    (raise "Specified field is not a text field"))
  (let (cnts (filter (lambda (x) (>= (nth x 1) threshold))
                     (field-distribution fd))
        hi-tags (map head cnts)
        cond-terms (map (lambda (tag) (flatline " (= {{tag}} x) x ")) hi-tags)
        id (fd "id"))
    (flatline "(let (x (f {{id}})) (cond @{cond-terms} {{low-name}}))")))
;; Creates a new dataset from `dataset-id` in which the field identified by
;; `field-id` is replaced by a generated field of the same name.
;;
;; Arguments:
;;   dataset-id - dataset to fetch and to derive the new dataset from.
;;   field-id   - value passed to `find-field` to locate the field among the
;;                dataset's "fields".
;;   low-name   - forwarded unchanged to `fl-trans`.
;;
;; Returns whatever `create-dataset` returns.
;; Raises "Field not found" when `find-field` yields no field.
(define (categorize-field dataset-id field-id low-name)
  (let (all-fields ((fetch dataset-id) "fields")
        target (find-field all-fields field-id))
    (when (not target)
      (raise "Field not found"))
    (let (old-id (target "id")
          expr (fl-trans target low-name)
          replacement {"field" expr "name" (target "name")})
      ;; Drop the original field and add the generated one under its name.
      (create-dataset dataset-id
                      {"all_but" [old-id]
                       "new_fields" [replacement]}))))
;; Script output: the dataset built by `categorize-field` from the script
;; inputs (`dataset-id`, `field`, `low-name`), passed through `wait`.
(define result
  (let (pruned (categorize-field dataset-id field low-name))
    (wait pruned)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment