mjwillson/mapreduce.clj

## mapreduce.clj
;; I'm sure I'm wrong here -- please correct me!
;; (Although note I'm not trying to capture the exact algorithms and dataflow complexities of hadoop here, just the logical structure of MapReduce computations and a rough sketch of how they're distributed.)

;; not just

(->> data
     (pmap mapper)
     (reduce reducer))

;; but something more like this.

(->> input-key-value-pairs
     ;; partition the input data (a sequence of key-value pairs) into a
     ;; chunk for each node
     (partition-by partitioner)
     ;; On each node:
     (pmap (fn [key-value-pairs]
             (->> key-value-pairs
                  ;; apply the mapper. mapper yields sequences of
                  ;; key-value pairs so it's a mapcat rather than a
                  ;; plain map
                  (mapcat mapper)
                  ;; group results locally by key (I think hadoop does
                  ;; this via a sort?)
                  (group-by :key)

                  ;; OPTIONALLY:
                  ;; reduce each pair of
                  ;;   [key, values-from-mapper]
                  ;; locally using a combiner.
                  ;; The fold isn't done in a flat sequential fashion, the combiner may be
                  ;; called again with outputs from previous combine steps. So should be
                  ;; associative (and the combiner/reducer operation should be commutative if using
                  ;; combiners, because there are no guarantees about the order in which
                  ;; combine results from different nodes are made available to the reducer node)
                  ;;
                  ;; (TODO: what about when a combiner produces multiple results for a key?)
                  (map (fn [[key values]]
                        [key (reducers/fold combiner values)])))

     ;; Join together the results from the separate nodes and again
     ;; group them by a hash of the key, which determines the node
     ;; responsible for reducing values of that key.
     ;;
     ;; (This distributing of map results to reducer
     ;; nodes doesn't actually go via a centralised grouping phase,
     ;; the dataflow is more distributed than this, but conceptually
     ;; speaking at least.)
     (apply concat)
     (group-by (fn [[key value]] (hash key)))

     ;; For each key hash bin, on a separate node:
     (pmap (fn [[hash key-value-pairs]]
             (->> key-value-pairs
                  ;; group results locally by key (or has this already
                  ;; been done?)
                  (group-by :key)
                  ;; Apply the reducer to each pair of
                  ;;  [key, values-from-combiners-on-all-nodes]
                  ;; The fold isn't done in a flat sequential fashion, reducer may be
                  ;; called again with outputs from previous reductions (and so should be
                  ;; associative?)
                  ;; (TODO: what about when a reducer produces multiple results for a key?)
                  (map (fn [[key values]]
                        [key (reducers/fold reducer values)])))))

     ;; Join together reducer results from all nodes (or can just
     ;; leave them as separate files in HDFS)
     (apply concat))
	;; I'm sure I'm wrong here -- please correct me!
	;; (Although note I'm not trying to capture the exact algorithms and dataflow complexities of hadoop here, just the logical structure of MapReduce computations and a rough sketch of how they're distributed.)

	;; not just

	(->> data
	(pmap mapper)
	(reduce reducer))

	;; but something more like this.

	(->> input-key-value-pairs
	;; partition the input data (a sequence of key-value pairs) into a
	;; chunk for each node
	(partition-by partitioner)
	;; On each node:
	(pmap (fn [key-value-pairs]
	(->> key-value-pairs
	;; apply the mapper. mapper yields sequences of
	;; key-value pairs so it's a mapcat rather than a
	;; plain map
	(mapcat mapper)
	;; group results locally by key (I think hadoop does
	;; this via a sort?)
	(group-by :key)

	;; OPTIONALLY:
	;; reduce each pair of
	;; [key, values-from-mapper]
	;; locally using a combiner.
	;; The fold isn't done in a flat sequential fashion, the combiner may be
	;; called again with outputs from previous combine steps. So should be
	;; associative (and the combiner/reducer operation should be commutative if using
	;; combiners, because there are no guarantees about the order in which
	;; combine results from different nodes are made available to the reducer node)
	;;
	;; (TODO: what about when a combiner produces multiple results for a key?)
	(map (fn [[key values]]
	[key (reducers/fold combiner values)])))

	;; Join together the results from the separate nodes and again
	;; group them by a hash of the key, which determines the node
	;; responsible for reducing values of that key.
	;;
	;; (This distributing of map results to reducer
	;; nodes doesn't actually go via a centralised grouping phase,
	;; the dataflow is more distributed than this, but conceptually
	;; speaking at least.)
	(apply concat)
	(group-by (fn [[key value]] (hash key)))

	;; For each key hash bin, on a separate node:
	(pmap (fn [[hash key-value-pairs]]
	(->> key-value-pairs
	;; group results locally by key (or has this already
	;; been done?)
	(group-by :key)
	;; Apply the reducer to each pair of
	;; [key, values-from-combiners-on-all-nodes]
	;; The fold isn't done in a flat sequential fashion, reducer may be
	;; called again with outputs from previous reductions (and so should be
	;; associative?)
	;; (TODO: what about when a reducer produces multiple results for a key?)
	(map (fn [[key values]]
	[key (reducers/fold reducer values)])))))

	;; Join together reducer results from all nodes (or can just
	;; leave them as separate files in HDFS)
	(apply concat))