Skip to content

Instantly share code, notes, and snippets.

@whizzmler
Forked from osroca/README.md
Last active May 15, 2016 03:50
Show Gist options
  • Save whizzmler/9b74355ea3e1f688378a9c379eabedb9 to your computer and use it in GitHub Desktop.
Save whizzmler/9b74355ea3e1f688378a9c379eabedb9 to your computer and use it in GitHub Desktop.
Remove anomalies from dataset
{
"name": "Normalize Dataset",
"description": "Remove the top n anomalies from a dataset",
"inputs": [
{"name": "dataset-id", "type": "dataset-id", "description": "Dataset Id"},
{"name": "top-n", "type": "number", "description": "Top N Anomalies to Remove"}
],
"outputs": [
{"name": "normalized-dataset", "type": "dataset-id", "description": "Normalized Dataset"}
],
"total_resources_created": 1,
"resource_to_apply": "dataset"
}
;; Remove the top n anomalies from a dataset
;; Collect the "row_number" of every entry listed under
;; ["model" "top_anomalies"] in the given anomaly resource map `a`.
;; Returns the row numbers in the same order as the entries.
(define (anomalous-rows a)
  (let (top-entries (get-in a ["model" "top_anomalies"])
        row-of (lambda (entry) (get entry "row_number")))
    (map row-of top-entries)))
;; Build the Flatline predicate that rejects the given rows: one
;; equality test against (row-number) per element of `rows`, all of
;; them OR-ed together and the result negated.
;; The local names `n` and `eqs` are referenced by name from inside the
;; flatline templates, so they must not be renamed.
(define (row-filter rows)
  (let (same-row (lambda (n) (flatline "(= (row-number) {n})"))
        eqs (map same-row rows))
    (flatline "(not (or @{eqs}))")))
;; Given a dataset id and a number of anomalies `n`, create a new
;; dataset that omits the rows an anomaly detector reports as its
;; top `n` anomalies, and return the id of the new dataset.
;;
;; Steps:
;;   1. create an anomaly detector on the dataset with "top_n" = n;
;;   2. fetch it (excluding "trees" and "fields") and read the row
;;      numbers of its top anomalies;
;;   3. delete the detector, which is only needed for those row numbers;
;;   4. create a dataset from the original, filtered by `row-filter`.
;;
;; If the detector reports no anomalous rows, the filter would be an
;; `or` with no operands, so the dataset is copied unfiltered instead.
(define (normalize-dataset dataset-id n)
  (let (a-id (create-and-wait-anomaly {"dataset" dataset-id "top_n" n})
        anomaly (fetch a-id {"exclude" "trees,fields"})
        rows (anomalous-rows anomaly))
    (delete a-id) ;; or we could keep it
    (if (empty? rows)
      (prog
        (log-info "No anomalous rows found; copying dataset without a filter")
        (create-and-wait-dataset {"origin_dataset" dataset-id}))
      ;; named row-flt to avoid shadowing the standard `filter` procedure
      (let (row-flt (row-filter rows))
        (log-info "Deleting rows " rows)
        (log-info "Using filter " row-flt)
        (create-and-wait-dataset {"origin_dataset" dataset-id
                                  "lisp_filter" row-flt})))))
;; Script entry point. `dataset-id` and `top-n` are the script inputs
;; declared in the JSON metadata; the result of normalize-dataset is
;; bound to `normalized-dataset`, the name declared there as the
;; script's output.
(define normalized-dataset (normalize-dataset dataset-id top-n))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment