Skip to content

Instantly share code, notes, and snippets.

View sritchie's full-sized avatar
🎯
Focusing

Sam Ritchie sritchie

🎯
Focusing
View GitHub Profile
@sritchie
sritchie / WholeFile.java
Created February 2, 2011 17:32
Hadoop input format for swallowing entire files.
package forma;
import forma.WholeFileInputFormat;
import cascading.scheme.Scheme;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import java.io.IOException;
import org.apache.hadoop.mapred.JobConf;
(defn whole-file
  "Custom scheme for dealing with entire files. Converts `field-name`
  with `w/fields` and hands the result to the WholeFile constructor."
  [field-name]
  (-> field-name
      w/fields
      WholeFile.))
(defn hfs-wholefile
  "Creates a tap on HDFS at `path` using the wholefile format, with a
  single field named \"file\". Guaranteed not to chop files up! Required
  for unsupported compression formats like HDF."
  [path]
  (let [scheme (whole-file ["file"])]
    (w/hfs-tap scheme path)))
<?xml version="1.0"?>
<!-- The XML declaration above must be the very first thing in the file;
     a comment placed before it makes the document ill-formed. -->
<!-- this one fails. -->
<!-- NOTE(review): the failure above was recorded while the declaration
     came after a comment; re-test to confirm whether it still applies. -->
<!-- core-site.xml -->
<configuration>
<!-- Sets io.serializations to a single class,
     cascading.tuple.hadoop.BytesSerialization. -->
<property>
<name>io.serializations</name>
<value>cascading.tuple.hadoop.BytesSerialization</value>
</property>
</configuration>
;; Set of [x y] pairs, for x in 0..17, covering the first (offsets x)
;; values of y and the last (offsets x) values of y below 36.
;; Presumably these are the MODIS grid positions that hold no data,
;; judging by the name -- confirm against callers.
;; NOTE(review): `offsets` has 19 entries but only indices 0..17 are
;; read, so the final 14 is never used -- confirm the table is as intended.
(def modis-blanks
  (let [offsets [14 11 9 6 4 2 1 0 0 0 0 0 1 2 4 6 9 11 14]
        xs (range 18)
        leading (for [x xs
                      y (range (offsets x))]
                  [x y])
        trailing (for [x xs
                       :let [width (offsets x)]
                       y (range (- 36 width) 36)]
                   [x y])]
    (union (set leading)
           (set trailing))))
@sritchie
sritchie / avg.clj
Created February 9, 2011 07:00
moving averages.
;; We define simpler functions first, so we know here that this'll get used below...
(defn average
  "Returns the arithmetic mean of the numbers in `coll`: their sum
  divided by their count."
  [coll]
  (let [total (reduce + coll)
        n (count coll)]
    (/ total n)))
;; here's a nice, effectively one line solution. (It's often the case that the docstring will be longer than the function itself.)
(defn moving-average
"Returns a moving average of windows into the supplied
;; This one?
(defn delta
  "Returns the change in `f` between `start` and `end`, i.e. (f end)
  minus (f start)."
  [f start end]
  (let [at-end (f end)
        at-start (f start)]
    (- at-end at-start)))
;; or this one?
(defn delta
  "Applies `f` to `end` and `start` (in that order) and subtracts the
  second result from the first."
  [f start end]
  (->> [end start]
       (map f)
       (apply -)))
;; Okay, here's some practice. This might be a good candidate for a macro.
(defn little-int
"Converts four input bits to an int, in little endian format."
[b0 b1 b2 b3]
(bit-or
(bit-shift-left b3 24)
(bit-or
(bit-shift-left (bit-and b2 0xff) 16)
(bit-or
;; ## Example of timeseries aggregation in cascalog.
;;
;; (copy paste all of this in at the REPL!)
(use 'cascalog.api)
(def tseries [["ndvi" 1 0 [1 2 3 4]]
["ndvi" 1 2 [2 3 4 5]]
["ndvi" 1 1 [4 3 2 1]]
["ndvi" 1 4 [1 2 3 4]]
;; The goal here is to write an aggregator that takes in a sequence of
;; tuples of the form <tperiod, pixel-vector>, and returns tuples of
;; the form <pixel-index, min-time, max-time, timeseries>.
;;
;; We assume that we're receiving chunks for every month within the
;; range. We measure pixel-index as the position inside the chunk.
;;
;; Example:
;; (timeseries [[1 [7 4 2 1]]
;; [2 [1 2 3 4]]
;;old!
(ns forma.core
(:use cascalog.api
(clj-time [format :only (unparse formatters)]
[core :only (now)])
(forma [hadoop :only (all-files
template-seqfile
globhfs-seqfile)]))
(:require (cascalog [ops :as c])