Skip to content

Instantly share code, notes, and snippets.

@sritchie
Created February 27, 2011 01:21
Show Gist options
  • Save sritchie/845813 to your computer and use it in GitHub Desktop.
Save sritchie/845813 to your computer and use it in GitHub Desktop.
;; ## Example of timeseries aggregation in cascalog.
;;
;; (copy paste all of this in at the REPL!)
(use 'cascalog.api)
;; Sample in-memory source for the queries below. Each row is
;; [dataset tileid tperiod chunk], matching the field order used when
;; the queries bind it as (tseries ?dataset ?tileid ?tperiod ?chunk).
;; The "ndvi" / tile 1 rows are deliberately listed out of period
;; order so that the effect of (:sort ?tperiod) is visible.
(def tseries
  [["ndvi" 1 0 [1 2 3 4]]
   ["ndvi" 1 2 [2 3 4 5]]
   ["ndvi" 1 1 [4 3 2 1]]
   ["ndvi" 1 4 [1 2 3 4]]
   ["ndvi" 1 3 [1 2 3 4]]

   ["evi" 2 0 [1 2 3 4]]
   ["evi" 2 1 [1 2 3 4]]
   ["evi" 2 2 [1 2 3 4]]
   ["evi" 2 3 [1 2 3 4]]
   ["evi" 2 4 [1 2 3 4]]

   ["ndvi" 2 0 [1 2 3 4]]
   ["ndvi" 2 1 [1 2 3 4]]
   ["ndvi" 2 2 [1 2 3 4]]
   ["ndvi" 2 3 [1 2 3 4]]
   ["ndvi" 2 4 [1 2 3 4]]

   ["evi" 3 0 [1 2 3 4]]
   ["evi" 3 1 [1 2 3 4]]
   ["evi" 3 2 [1 2 3 4]]
   ["evi" 3 3 [1 2 3 4]]
   ["evi" 3 4 [1 2 3 4]]])
;; Buffer op: stringifies every tuple in the group, concatenates the
;; pieces in the order received, and emits the result as a single
;; 1-tuple.
(defbufferop tuples->string
  [tuples]
  (->> tuples
       (map str)
       (apply str)
       vector))
(defn tester-strings
  "Runs a query over `tseries` that sorts by `?tperiod`, feeds the
  <?tperiod, ?chunk> pairs through `tuples->string`, and writes the
  resulting <?dataset, ?tileid, ?tuples> rows to stdout."
  []
  (?- (stdout)
      (<- [?dataset ?tileid ?tuples]
          (tseries ?dataset ?tileid ?tperiod ?chunk)
          (:sort ?tperiod)
          (tuples->string ?tperiod ?chunk :> ?tuples))))
;; RESULTS
;; -----------------------
;; ndvi 1 (0 [1 2 3 4])(1 [4 3 2 1])(2 [2 3 4 5])(3 [1 2 3 4])(4 [1 2 3 4])
;; evi 2 (0 [1 2 3 4])(1 [1 2 3 4])(2 [1 2 3 4])(3 [1 2 3 4])(4 [1 2 3 4])
;; ndvi 2 (0 [1 2 3 4])(1 [1 2 3 4])(2 [1 2 3 4])(3 [1 2 3 4])(4 [1 2 3 4])
;; evi 3 (0 [1 2 3 4])(1 [1 2 3 4])(2 [1 2 3 4])(3 [1 2 3 4])(4 [1 2 3 4])
;; -----------------------
(defbufferop
  ^{:doc "Transposes a group of <t-period, modis-chunk> tuples, which
  must arrive sorted ascending by time period, into one 4-tuple per
  chunk position: <pixel-idx, t-start, t-end, t-series>. `t-start`
  and `t-end` are the first and last periods of the group, and
  `t-series` is a vector holding that position's value from every
  chunk, in arrival order."}
  timeseries [tuples]
  (let [;; Transpose the <period, chunk> pairs into two parallel vectors.
        [periods chunks] (apply map vector tuples)
        t-start (first periods)
        t-end (last periods)
        pixel-row (fn [& pixel-vals]
                    [t-start t-end (apply vector pixel-vals)])]
    (->> chunks
         ;; Walk all chunks in lockstep: one row per pixel position.
         (apply map pixel-row)
         ;; Prepend the pixel index to each row.
         (map-indexed cons))))
(defn tester-tseries
  "Runs a query over `tseries` that sorts by `?tperiod`, feeds the
  <?tperiod, ?chunk> pairs through the `timeseries` buffer op, and
  writes the resulting
  <?dataset, ?tileid, ?pix-idx, ?t-start, ?t-end, ?tseries> rows to
  stdout."
  []
  (?- (stdout)
      (<- [?dataset ?tileid ?pix-idx ?t-start ?t-end ?tseries]
          (tseries ?dataset ?tileid ?tperiod ?chunk)
          (:sort ?tperiod)
          (timeseries ?tperiod ?chunk :> ?pix-idx ?t-start ?t-end ?tseries))))
;; RESULTS
;; -----------------------
;; ndvi 1 0 0 4 [1 4 2 1 1]
;; ndvi 1 1 0 4 [2 3 3 2 2]
;; ndvi 1 2 0 4 [3 2 4 3 3]
;; ndvi 1 3 0 4 [4 1 5 4 4]
;; evi 2 0 0 4 [1 1 1 1 1]
;; evi 2 1 0 4 [2 2 2 2 2]
;; evi 2 2 0 4 [3 3 3 3 3]
;; evi 2 3 0 4 [4 4 4 4 4]
;; ndvi 2 0 0 4 [1 1 1 1 1]
;; ndvi 2 1 0 4 [2 2 2 2 2]
;; ndvi 2 2 0 4 [3 3 3 3 3]
;; ndvi 2 3 0 4 [4 4 4 4 4]
;; evi 3 0 0 4 [1 1 1 1 1]
;; evi 3 1 0 4 [2 2 2 2 2]
;; evi 3 2 0 4 [3 3 3 3 3]
;; evi 3 3 0 4 [4 4 4 4 4]
;; -----------------------
;; The following version produces int-arrays instead of vectors. These
;; serialize to a much smaller size, so this is the version we'll use.
(defbufferop
  ^{:doc "Transposes a group of <t-period, modis-chunk> tuples, which
  must arrive sorted ascending by time period, into one 4-tuple per
  chunk position: <pixel-idx, t-start, t-end, t-series>. `t-start`
  and `t-end` are the first and last periods of the group, and
  `t-series` is an int-array holding that position's value from every
  chunk, in arrival order."}
  timeseries [tuples]
  (let [;; Transpose the <period, chunk> pairs into two parallel vectors.
        [periods chunks] (apply map vector tuples)
        t-start (first periods)
        ;; `periods` is a vector, so `peek` returns its last element.
        t-end (peek periods)
        pixel-row (fn [& pixel-vals]
                    [t-start t-end (int-array (apply vector pixel-vals))])]
    (map-indexed cons
                 ;; Walk all chunks in lockstep: one row per position.
                 (apply map pixel-row chunks))))
;; More complex version, with sparse-expander.
;; You'll need the following in your thrift file: https://gist.github.com/1058480
;; And these functions (this is the io namespace referenced below): https://gist.github.com/1058476
(defn sparse-expander
  "Takes in a sequence of 2-tuples of the form `<idx, val>`, sorted by
  ascending `idx`, and generates a dense vector with each `val`
  inserted at its corresponding `idx`. Missing values will be set to
  the supplied `placeholder`.

  Options (keyword arguments):

    :start  - index represented by the first slot of the result. If
              not supplied, counting begins with the `idx` of the first
              `<idx, val>` pair (or 0 when `tuples` is empty).
    :length - exact number of slots to produce. If not supplied,
              expansion stops once every tuple has been consumed.

  Tuples whose `idx` falls before the current position (because
  `:start` lies past them, or because an index is repeated) are
  skipped without consuming an output slot, so the values that follow
  still land at their own index.

  Example:

    (sparse-expander 0 [[10 1] [12 1] [15 1]] :start 8)
    ;=> [0 0 1 0 1 0 0 1]"
  [placeholder tuples & {:keys [start length]}]
  (let [;; Fall back to 0 so that `:length` alone works on empty input
        ;; instead of failing on (+ nil length).
        start (or start (ffirst tuples) 0)
        halt? (fn [idx tup-seq]
                (if length
                  (>= idx (+ start length))
                  (empty? tup-seq)))]
    (loop [idx start
           tup-seq tuples
           acc (transient [])]
      (let [[[pos value] & more] tup-seq]
        (cond
          (halt? idx tup-seq)
          (persistent! acc)

          ;; No tuple available (input exhausted while `:length` still
          ;; demands slots): pad with the placeholder.
          (nil? pos)
          (recur (inc idx) tup-seq (conj! acc placeholder))

          ;; Stale tuple: drop it WITHOUT advancing `idx`, otherwise the
          ;; tuples behind it would be pushed out of alignment.
          (> idx pos)
          (recur idx more acc)

          (= idx pos)
          (recur (inc idx) more (conj! acc value))

          ;; Gap before the next tuple: pad and keep the tuple queued.
          :else
          (recur (inc idx) tup-seq (conj! acc placeholder)))))))
(defbufferop [timeseries [missing-val]]
  "Transposes a group of `<t-period, modis-chunk>` tuples, which must
  arrive sorted ascending by `t-period`, into one 4-tuple per chunk
  position: <pixel-idx, t-start, t-end, t-series>. `t-start` and
  `t-end` are the first and last periods of the group, and `t-series`
  is whatever `io/to-struct` builds from that position's values
  (an instance of `forma.schema.DoubleArray`).

  Periods absent from the group are filled in by `sparse-expander`
  with a chunk made entirely of `missing-val`, sized to match the
  first chunk received.

  `modis-chunk` tuple fields must be vectors or instances of
  `forma.schema.DoubleArray` or `forma.schema.IntArray`, as dictated
  by the Thriftable interface in `forma.hadoop.io`."
  [tuples]
  (let [;; Only the first chunk is needed here, to size the filler.
        [periods [sample-chunk]] (apply map vector tuples)
        t-start (first periods)
        ;; `periods` is a vector, so `peek` returns its last element.
        t-end (peek periods)
        filler (io/to-struct
                (repeat (io/count-vals sample-chunk) missing-val))
        dense-chunks (sparse-expander filler tuples :start t-start)
        pixel-row (fn [& pixel-vals]
                    [t-start t-end (io/to-struct (apply vector pixel-vals))])]
    (map-indexed cons
                 ;; Walk all chunks in lockstep: one row per position.
                 (apply map pixel-row
                        (map io/get-vals dense-chunks)))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment