joinr/iotaproblem.clj

## iotaproblem.clj
(ns sometest
  (:require [iota :as iota]
	    [clojure.core.reducers :as r]))

;;==Synthetic data==
(def fillrecord
  "Schema for a synthetic fill record: a map of field keyword -> value
  type, where each type is one of :text, :boolean, :int or :float."
  {;; text fields
   :Unit                   :text
   :category               :text
   :DemandGroup            :text
   :SRC                    :text
   :FillType               :text
   :name                   :text
   :Component              :text
   :operation              :text
   :DeployDate             :text
   :AtomicPolicy           :text
   :Category               :text
   :fill-type              :text
   :FillPath               :text
   :Period                 :text
   :Demand                 :text
   :OITitle                :text
   :DemandType             :text
   :Location               :text
   :location               :text
   :compo                  :text
   :Policy                 :text
   ;; boolean fields
   :FollowOn               :boolean
   :dwell-plot?            :boolean
   :sampled                :boolean
   ;; integer fields
   :start                  :int
   :DeploymentID           :int
   :duration               :int
   :FollowOnCount          :int
   :DeployInterval         :int
   :unitid                 :int
   :deltat                 :int
   :PathLength             :int
   :BogBudget              :int
   :CycleTime              :int
   :DeploymentCount        :int
   :quantity               :int
   :end                    :int
   :FillCount              :int
   :DwellBeforeDeploy      :int
   ;; floating-point fields
   :DwellYearsBeforeDeploy :float})

(defn fld->val
  "Produce a random value for field `fld` whose schema type is `t`.

  Supported types:
    :text    - string of a fresh gensym that uses `fld` as its prefix
    :boolean - true or false, picked at random
    :int     - random integer in [0, 50000)
    :float   - random double in [0, 50000)

  Throws IllegalArgumentException when `t` is none of the above."
  [fld t]
  (condp = t
    :text    (str (gensym fld))
    :boolean (rand-nth [true false])
    :int     (rand-int 50000)
    :float   (* 50000 (rand))))
;;Generate a random record from the schema.
(defn fake-record
  "Build one synthetic record: a map with the same keys as `fillrecord`,
  where each value is a random value of the type the schema declares
  for that key (as produced by `fld->val`)."
  []
  (into {}
        (for [[fld typ] fillrecord]
          [fld (fld->val fld typ)])))
;;Using the preceding functions, we dump 4.395605x10^6 records to
;;the file at the path below.
(def testfile "c:/path/to/bigfile.txt")
;;Dumping elided for brevity... the end result is a ~2GB tab-delimited
;;text file with headers in the first line.
;;NOTE(review): the `...` body is a placeholder for the elided
;;implementation, not working code.
(defn records->file [path] ...)


;;Testfile has these characteristics:
;;4.3x10^6 newline-separated lines of
;;490 utf-8 Chars /line
;;40 tab-delimited fields (although it doesn't matter for this).
;;2GB in size.

;;==Simple Test - Count the lines of the file==

;;Fold a counter over the collection, printing the running count
;;whenever it is a multiple of 100000 (including 0) for status.
(defn count-lines
  "Count the items of `r` by reducing over it with an incrementing
  counter. Before each item is counted, the running count is printed if
  it is a multiple of 100000 (so 0 is printed ahead of the first item).
  Returns the total number of items; an empty `r` yields 0 and prints
  nothing."
  [r]
  (letfn [(step [seen _line]
            (when (zero? (rem seen 100000))
              (println seen))
            (unchecked-inc seen))]
    (reduce step 0 r)))
;;Time how long it takes us to traverse and count the file.
;;Overhead from printing is infinitesimal, so this should
;;simply walk the file (seq).
;;NOTE: this name shadows clojure.core/test inside this namespace; it is
;;kept as-is so the existing calls keep working.
(defn test
  "Time a full traversal of the file at `path`, counting its lines with
  `count-lines`.

  Options (keyword args):
    :n - when supplied, passed to iota/seq as its second argument (the
         buffer size, per the notes in this file); when omitted,
         iota/seq is called with its default.

  Prints the elapsed time (via `time`) and returns the line count."
  [path & {:keys [n]}]
  ;; Branch on whether :n was supplied. The previous version used
  ;; (iota/seq path n) itself as the `if` condition, so the sized seq was
  ;; built and thrown away and the default-buffer seq was always walked.
  (time (count-lines (if n
                       (iota/seq path n)
                       (iota/seq path)))))

;;==specs==
;;Machine (laptop) is underspecced:
;;Intel Core2 T7200 @ 2.00 GHz
;;2GB RAM (this hurts :( )
;;Windows 7 64-bit, on a 64-bit JVM
;;NOT running -server in JVM opts (clients can't have a JDK currently).

;;==results==
;;With the default arguments this chokes entirely... it doesn't even
;;print the initial 0 within minutes.
(test testfile)
;;I saw the default buffer size in FileSeq is pretty big,
;;so try again with a 4KB (4096) value for :n:
(test testfile :n (* 4 1024))
;;Cooks along nicely, right up until about 3x10^6, then
;;starts chugging.  Looks like GC goes through the roof, although
;;we never grow the heap....(never go above 490mb or so, despite
;;using -Xmx1g)

;;run completes in 291443 ms

;;visualvm shows most of the time is spent grinding in MMap/get
;;(understandable from glancing at the source).

;;GC behavior is really weird to me though; I can't help but think it'd
;;be fine if iota would take advantage of the available heap.  Maybe
;;this has something to do with FileChannel? We get a jagged garbage
;;collection trail, hovering under 500MB without ever growing the heap.


;;==Alternatives==
;;--FastUtil
;;In contrast, using it.unimi.dsi.fastutil.io.FastBufferedReader
;;from FastUtil library, wrapped in a clojure compatible reducer,
;;I churn through the same file pretty easily.
;;It's a little slower out the gates (in terms of visible printing/traversal)
;;BUT it's consistent, and completes in 63145 ms.
;;Notable differences: FastBufferedReader is unsynchronized (dunno if this
;;affects the aforementioned observations).  I'd have to line-by-line the
;;source to see any other differences.  FastBufferedReader is not using
;;file channels by default.

;;We get a nice saw-tooth GC collection profile, peaking at max usage of
;;~583mb.  Heap is grown to 715mb.

;;--MMap
;;Also, I implemented an mmap version based off Eric Rochester's example at:
;;http://www.ericrochester.com/pages/code/parallel-io-with-mmap/
;;This version is a bit slower than the FastBufferedReader, but it works.
;;It uses nio/mmap and a (* 10 1024 1024) buffer.  Job completes in 84472ms.
;;Expands the heap during processing beyond 500mb to 779mb.
;;We get a higher-frequency gc wave, with peak usage at 668mb.

;;Both the mmap and fastutil implementations are actually returning strings
;;as part of their .readLine implementation, so some of the gc is probably
;;due to string creation.

;;==Summary==
;;Both alternative implementations finish the job much faster than iota/seq.
;;Both actually take advantage of the available heap (although they don't
;;come near exhausting it).  They are also coercing lines to strings in the
;;process.

;;Why is iota/seq sucking wind on this?
	(ns sometest
	(:require [iota :as iota]
	[clojure.core.reducers :as r]))

	;;==Synthetic data==
	(def fillrecord
	  "Schema for a synthetic fill record: a map of field keyword -> value
	  type, where each type is one of :text, :boolean, :int or :float."
	  {;; text fields
	   :Unit                   :text
	   :category               :text
	   :DemandGroup            :text
	   :SRC                    :text
	   :FillType               :text
	   :name                   :text
	   :Component              :text
	   :operation              :text
	   :DeployDate             :text
	   :AtomicPolicy           :text
	   :Category               :text
	   :fill-type              :text
	   :FillPath               :text
	   :Period                 :text
	   :Demand                 :text
	   :OITitle                :text
	   :DemandType             :text
	   :Location               :text
	   :location               :text
	   :compo                  :text
	   :Policy                 :text
	   ;; boolean fields
	   :FollowOn               :boolean
	   :dwell-plot?            :boolean
	   :sampled                :boolean
	   ;; integer fields
	   :start                  :int
	   :DeploymentID           :int
	   :duration               :int
	   :FollowOnCount          :int
	   :DeployInterval         :int
	   :unitid                 :int
	   :deltat                 :int
	   :PathLength             :int
	   :BogBudget              :int
	   :CycleTime              :int
	   :DeploymentCount        :int
	   :quantity               :int
	   :end                    :int
	   :FillCount              :int
	   :DwellBeforeDeploy      :int
	   ;; floating-point fields
	   :DwellYearsBeforeDeploy :float})

	(defn fld->val
	  "Produce a random value for field `fld` whose schema type is `t`.

	  Supported types:
	    :text    - string of a fresh gensym that uses `fld` as its prefix
	    :boolean - true or false, picked at random
	    :int     - random integer in [0, 50000)
	    :float   - random double in [0, 50000)

	  Throws IllegalArgumentException when `t` is none of the above."
	  [fld t]
	  (condp = t
	    :text    (str (gensym fld))
	    :boolean (rand-nth [true false])
	    :int     (rand-int 50000)
	    :float   (* 50000 (rand))))
	;;Generate a random record from the schema.
	(defn fake-record
	  "Build one synthetic record: a map with the same keys as `fillrecord`,
	  where each value is a random value of the type the schema declares
	  for that key (as produced by `fld->val`)."
	  []
	  (into {}
	        (for [[fld typ] fillrecord]
	          [fld (fld->val fld typ)])))
	;;Using the preceding functions, we dump 4.395605x10^6 records to
	;;the file at the path below.
	(def testfile "c:/path/to/bigfile.txt")
	;;Dumping elided for brevity... the end result is a ~2GB tab-delimited
	;;text file with headers in the first line.
	;;NOTE(review): the `...` body is a placeholder for the elided
	;;implementation, not working code.
	(defn records->file [path] ...)


	;;Testfile has these characteristics:
	;;4.3x10^6 newline-separated lines of
	;;490 utf-8 Chars /line
	;;40 tab-delimited fields (although it doesn't matter for this).
	;;2GB in size.

	;;==Simple Test - Count the lines of the file==

	;;Fold a counter over the collection, printing the running count
	;;whenever it is a multiple of 100000 (including 0) for status.
	(defn count-lines
	  "Count the items of `r` by reducing over it with an incrementing
	  counter. Before each item is counted, the running count is printed if
	  it is a multiple of 100000 (so 0 is printed ahead of the first item).
	  Returns the total number of items; an empty `r` yields 0 and prints
	  nothing."
	  [r]
	  (letfn [(step [seen _line]
	            (when (zero? (rem seen 100000))
	              (println seen))
	            (unchecked-inc seen))]
	    (reduce step 0 r)))
	;;Time how long it takes us to traverse and count the file.
	;;Overhead from printing is infinitesimal, so this should
	;;simply walk the file (seq).
	;;NOTE: this name shadows clojure.core/test inside this namespace; it is
	;;kept as-is so the existing calls keep working.
	(defn test
	  "Time a full traversal of the file at `path`, counting its lines with
	  `count-lines`.

	  Options (keyword args):
	    :n - when supplied, passed to iota/seq as its second argument (the
	         buffer size, per the notes in this file); when omitted,
	         iota/seq is called with its default.

	  Prints the elapsed time (via `time`) and returns the line count."
	  [path & {:keys [n]}]
	  ;; Branch on whether :n was supplied. The previous version used
	  ;; (iota/seq path n) itself as the `if` condition, so the sized seq was
	  ;; built and thrown away and the default-buffer seq was always walked.
	  (time (count-lines (if n
	                       (iota/seq path n)
	                       (iota/seq path)))))

	;;==specs==
	;;Machine (laptop) is underspecced:
	;;Intel Core2 T7200 @ 2.00 GHz
	;;2GB RAM (this hurts :( )
	;;Windows 7 64-bit, on a 64-bit JVM
	;;NOT running -server in JVM opts (clients can't have a JDK currently).

	;;==results==
	;;With the default arguments this chokes entirely... it doesn't even
	;;print the initial 0 within minutes.
	(test testfile)
	;;I saw the default buffer size in FileSeq is pretty big,
	;;so try again with a 4KB (4096) value for :n:
	(test testfile :n (* 4 1024))
	;;Cooks along nicely, right up until about 3x10^6, then
	;;starts chugging. Looks like GC goes through the roof, although
	;;we never grow the heap....(never go above 490mb or so, despite
	;;using -Xmx1g)

	;;run completes in 291443 ms

	;;visualvm shows most of the time is spent grinding in MMap/get
	;;(understandable from glancing at the source).

	;;GC behavior is really weird to me though; I can't help but think it'd
	;;be fine if iota would take advantage of the available heap. Maybe
	;;this has something to do with FileChannel? We get a jagged garbage
	;;collection trail, hovering under 500MB without ever growing the heap.


	;;==Alternatives==
	;;--FastUtil
	;;In contrast, using it.unimi.dsi.fastutil.io.FastBufferedReader
	;;from FastUtil library, wrapped in a clojure compatible reducer,
	;;I churn through the same file pretty easily.
	;;It's a little slower out the gates (in terms of visible printing/traversal)
	;;BUT it's consistent, and completes in 63145 ms.
	;;Notable differences: FastBufferedReader is unsynchronized (dunno if this
	;;affects the aforementioned observations). I'd have to line-by-line the
	;;source to see any other differences. FastBufferedReader is not using
	;;file channels by default.

	;;We get a nice saw-tooth GC collection profile, peaking at max usage of
	;;~583mb. Heap is grown to 715mb.

	;;--MMap
	;;Also, I implemented an mmap version based off Eric Rochester's example at:
	;;http://www.ericrochester.com/pages/code/parallel-io-with-mmap/
	;;This version is a bit slower than the FastBufferedReader, but it works.
	;;It uses nio/mmap and a (* 10 1024 1024) buffer. Job completes in 84472ms.
	;;Expands the heap during processing beyond 500mb to 779mb.
	;;We get a higher-frequency gc wave, with peak usage at 668mb.

	;;Both the mmap and fastutil implementations are actually returning strings
	;;as part of their .readLine implementation, so some of the gc is probably
	;;due to string creation.

	;;==Summary==
	;;Both alternative implementations finish the job much faster than iota/seq.
	;;Both actually take advantage of the available heap (although they don't
	;;come near exhausting it). They are also coercing lines to strings in the
	;;process.

	;;Why is iota/seq sucking wind on this?