Skip to content

Instantly share code, notes, and snippets.

@joinr
Created June 20, 2019 06:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joinr/2b4ada4b73ab72873ac660b4ee9da091 to your computer and use it in GitHub Desktop.
Save joinr/2b4ada4b73ab72873ac660b4ee9da091 to your computer and use it in GitHub Desktop.
messing with clojure to see if any more performance can squeeze out for perfect.reader.fast
(ns perfect.reader.faster
(:require
[perfect.reader.generics :as generics]
[clojure.spec.alpha :as s]
[net.danielcompton.defn-spec-alpha :as ds])
(:import
(org.dhatim.fastexcel.reader ReadableWorkbook
Sheet
Row
Cell
CellType
CellAddress)))
(defn ireducer
  "Wrap a java.util.Iterator in an object that can be reduced directly
  (via clojure.core.protocols/CollReduce), iterated (java.lang.Iterable)
  or turned into a seq (clojure.lang.Seqable).

  The wrapper shares the single underlying iterator, so it is one-shot:
  whichever of reduce / iterator / seq consumes elements first uses them up.

  Reducing without an init value follows the contract of clojure.core/reduce:
  an empty iterator yields (f), otherwise the first element seeds the
  accumulator."
  [^java.util.Iterator iter]
  (reify
    java.lang.Iterable
    (iterator [this] iter)

    clojure.core.protocols/CollReduce
    (coll-reduce [this f]
      (if (.hasNext iter)
        ;; seed the reduction with the first element.
        (clojure.core.protocols/coll-reduce this f (.next iter))
        ;; empty input: reduce is specified to return (f), not nil.
        (f)))
    (coll-reduce [this f init]
      (loop [acc init]
        (cond
          ;; honor early termination requested through `reduced`.
          (reduced? acc) @acc
          (.hasNext iter) (recur (f acc (.next iter)))
          :else acc)))

    clojure.lang.Seqable
    (seq [this]
      (iterator-seq iter))))
;;note: numeric cells are coerced to double, not float.
;;floats create spooky comparisons: a float is not = to the double
;;that prints the same, which causes problems (e.g. with set values)
;;that then propagate downstream. A float also carries far fewer
;;significant digits, so larger numbers would be silently rounded.
(defn cell-value
  "Return the value of a cell using the accessor that matches its type.

  With one argument the type is read from the cell via .getType; the
  two-argument arity lets a caller supply a CellType it already has.

  Returns:
    EMPTY    -> nil
    STRING   -> the string value
    NUMBER   -> the numeric value coerced to a double
    BOOLEAN  -> the boolean value
    FORMULA  -> {:formula <formula text>}
    ERROR    -> {:error <raw cell value>}
    otherwise :unsupported"
  ([^Cell cell] (cell-value cell (.getType cell)))
  ([^Cell cell cell-type]
   (condp = cell-type
     CellType/EMPTY   nil
     CellType/STRING  (.asString cell)
     ;; double keeps the value comparable (=) with ordinary Clojure
     ;; floating point literals and avoids float's precision loss.
     CellType/NUMBER  (double (.asNumber cell))
     CellType/BOOLEAN (.asBoolean cell)
     CellType/FORMULA {:formula (.getFormula cell)}
     CellType/ERROR   {:error (.getValue cell)}
     :unsupported)))
(defn cell-type
  "Map a fastexcel CellType constant to its keyword tag.
  Anything that is not one of the known constants (including nil)
  maps to :unsupported."
  [cell]
  ;; NOTE: despite the parameter name, the argument is a CellType
  ;; constant (callers pass the result of .getType), not a Cell.
  (cond
    (identical? cell CellType/EMPTY)   :blank
    (identical? cell CellType/STRING)  :str
    (identical? cell CellType/NUMBER)  :numeric
    (identical? cell CellType/BOOLEAN) :bool
    (identical? cell CellType/FORMULA) :formula
    (identical? cell CellType/ERROR)   :error
    :else                              :unsupported))
(defn sheets
  "Return a lazy seq over the sheets of workbook `wb`, obtained from the
  iterator of the value returned by .getSheets."
  [^ReadableWorkbook wb]
  (let [sheet-iter (.iterator (.getSheets wb))]
    (iterator-seq sheet-iter)))
;;Look up a single sheet of workbook `wb`.
;;`sheetid` must satisfy ::generics/sheet-identity: a number is passed
;;to .getSheet, a string is passed to .findSheet. The result of the
;;lookup is unwrapped with .get, so a missing sheet surfaces as the
;;exception thrown by that call rather than as nil.
;;Returns nil when `sheetid` is neither a number nor a string.
(ds/defn sheet
  [^ReadableWorkbook wb
   sheetid :- ::generics/sheet-identity]
  (s/assert ::generics/sheet-identity sheetid)
  (cond
    ;; dispatch on the identifier argument (not on this function's own
    ;; var, which is neither a number nor a string).
    (number? sheetid) (.. wb (getSheet sheetid) get)
    (string? sheetid) (.. wb (findSheet sheetid) get)))
(defn rows
  "Open a row stream on `sheet` and wrap its iterator with `ireducer`,
  giving a one-shot reducible / seqable view of the sheet's rows."
  [^Sheet sheet]
  (let [row-iter (.iterator (.openStream sheet))]
    (ireducer row-iter)))
(defn cells
  "Wrap the cell iterator of `row` with `ireducer`, giving a one-shot
  reducible / seqable view of the row's cells."
  [^Row row]
  (let [cell-iter (.iterator row)]
    (ireducer cell-iter)))
;;records are a little ugly, but they
;;are faster to construct even than
;;array maps. Out of curiosity,
;;how much faster are we with records?
;;xlcell: one parsed cell, built by ->cell.
;;  type   - keyword produced by cell-type
;;  row-id - row taken from the cell's address
;;  col-id - column taken from the cell's address
;;  value  - result of cell-value for the cell
(defrecord xlcell [type row-id col-id value])
;;xlrow: one parsed row, built by ->row.
;;  n-cells - cell count reported by the source row
;;  row-id  - the source row number decremented by one
;;  cells   - vector of xlcell records for the row
(defrecord xlrow [n-cells row-id cells])
;;counterintuitively, this variant is slower.
;;maybe the lexical binds slow us down.
;;The form is discarded by the reader (#_); it is kept only for
;;comparison with the active ->cell definition. It binds the cell's
;;address and type keyword once instead of recomputing them.
#_(defn ->cell [^Cell cell]
(when cell
(let [^CellAddress addr (.getAddress cell)
ctype (cell-type (.getType cell))]
(xlcell. ctype
(.getRow addr)
(.getColumn addr)
(cell-value cell ctype)))))
;;faster path ironically.
(defn ->cell
  "Convert a fastexcel Cell into an xlcell record holding its type
  keyword, address row, address column and value.
  Returns nil when `cell` is nil (or otherwise falsey)."
  [^Cell cell]
  (if cell
    ;; the address is deliberately fetched once per field rather than
    ;; bound in a let; the let-based variant measured slower.
    (new xlcell
         (cell-type (.getType cell))
         (.getRow (.getAddress cell))
         (.getColumn (.getAddress cell))
         (cell-value cell))
    nil))
;;inlining the transducer operations eliminates
;;some function call overhead, since we're doing this
;;the most, there are decent savings here.
;;I tried several variants (even using arraylists and mutation
;;without coercion to vectors), but this ends up being fastest
;;the hinted direct method calls also eliminate some overhead.
(defn ->row
  "Convert a fastexcel Row into an xlrow record.

  Fields:
    n-cells - the cell count reported by the row (.getCellCount); this
              is the source count, not the number of cells kept
    row-id  - the row number reported by the row, minus one
    cells   - vector of xlcell records, skipping missing (nil) cells
              and cells whose type is :blank"
  [^Row row]
  (xlrow. (.getCellCount row)
          (unchecked-dec (.getRowNum row))
          (->> (cells row)
               ;; hand-rolled map+filter into a transient vector; the
               ;; hinted .conj avoids reflection in this hot loop.
               (reduce (fn [^clojure.lang.ITransientCollection acc c]
                         (let [v (->cell c)]
                           ;; compare the record's :type, not the
                           ;; record itself, against :blank.
                           (if (and v (not (identical? :blank (:type v))))
                             (.conj acc v)
                             acc)))
                       (transient []))
               persistent!)))
;;Started looking at parallel processing options. I think, due to the
;;streaming nature, we're not getting much out of this. We "may" get
;;something out of a parallel core.async workflow though, with something
;;akin to a work-stealing queue with multiple workers. That could
;;work in the streaming context.
;;Experimental parallel variants of row processing. Everything inside
;;this (comment ...) form is ignored when the namespace is loaded; the
;;forms are meant to be evaluated by hand.
(comment
(require 'clojure.core.reducers)
;;credit
;;https://labs.uswitch.com/transducers-from-the-ground-up-the-practice/
;;Parallel transduce: pours coll into a vector, then folds it with
;;clojure.core.reducers/fold using a partition size of 512, (xform rf)
;;as the reducing function and combinef to merge partial results.
(defn ptransduce [xform rf combinef coll]
(clojure.core.reducers/fold
512
combinef
(xform rf)
(into [] coll)))
;;Reducing step: appends x to the mutable ArrayList acc and returns acc.
(defn add! [^java.util.ArrayList acc x]
(doto acc (.add x)))
;;Converts the rows of sheet with ->row, drops rows whose :cells are
;;empty, and collects the survivors into ArrayLists via ptransduce.
;;The combining fn creates a fresh ArrayList (0-arity) and merges two
;;partial results by adding every element of r to l (2-arity).
(defn process-rows [sheet]
(ptransduce (comp
(map ->row)
(filter #(not-empty (:cells %))))
add!
(fn ([] (java.util.ArrayList. 32))
([l r]
(reduce add! l r)))
(rows sheet)))
;;too much coordination overhead.
;;(discarded pmap-based variant, kept for reference.)
#_(defn process-rows [sheet]
(->> (rows sheet)
(pmap ->row )
(reduce (fn [^clojure.lang.PersistentVector$TransientVector acc x]
(if (not-empty (:cells x))
(.conj acc x)
acc))
(transient [])
)
(persistent!)))
;;Same shape as the top-level ->sheet, but :rows comes from the
;;experimental process-rows above.
(defn ->sheet [^Sheet sheet]
{:sheet-name (.getName sheet)
:sheet-id (.getIndex sheet)
:rows (process-rows sheet)})
)
;;legacy
(defn ->sheet
  "Convert a fastexcel Sheet into a map with keys
    :sheet-name - the sheet's name
    :sheet-id   - the sheet's index
    :rows       - vector of xlrow records, omitting rows that have
                  no cells"
  [^Sheet sheet]
  (let [keep-populated (comp (map ->row)
                             (remove (fn [r] (empty? (:cells r)))))]
    {:sheet-name (.getName sheet)
     :sheet-id   (.getIndex sheet)
     :rows       (into [] keep-populated (rows sheet))}))
(defn ->book
  "Convert a workbook into a map.

  (->book wb)        parses every sheet and returns
                     {:sheet-count n, :sheets [sheet-map ...]}.
  (->book wb sheetn) parses only the sheet identified by `sheetn`
                     (see `sheet`) and returns
                     {:sheet-count n, :sheets sheet-map}
                     where n is still the total number of sheets in
                     the workbook and :sheets is that single sheet map."
  ([^ReadableWorkbook wb]
   (let [parsed (into [] (map ->sheet) (sheets wb))]
     {:sheet-count (count parsed)
      :sheets      parsed}))
  ([^ReadableWorkbook wb sheetn]
   ;; count the sheet handles only; parsing every sheet just to count
   ;; them would read the whole workbook before reading the one
   ;; requested sheet again.
   {:sheet-count (count (sheets wb))
    :sheets      (->sheet (sheet wb sheetn))}))
(defn read-workbook
  "Read the workbook at `path` and return it as data (see `->book`).

  `path` is anything clojure.java.io/input-stream accepts.
  `opt`, when supplied, is passed to `->book` as the identifier of the
  single sheet to read.

  Both the input stream and the workbook are closed before returning;
  this is safe because `->book` builds its result eagerly."
  ([path]
   (with-open [ef (clojure.java.io/input-stream path)
               wb (ReadableWorkbook. ef)]
     (->book wb)))
  ([path opt]
   (with-open [ef (clojure.java.io/input-stream path)
               wb (ReadableWorkbook. ef)]
     (->book wb opt))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment