Created
June 20, 2019 06:17
-
-
Save joinr/2b4ada4b73ab72873ac660b4ee9da091 to your computer and use it in GitHub Desktop.
Messing with Clojure to see if any more performance can be squeezed out of perfect.reader.fast
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns perfect.reader.faster | |
(:require | |
[perfect.reader.generics :as generics] | |
[clojure.spec.alpha :as s] | |
[net.danielcompton.defn-spec-alpha :as ds]) | |
(:import | |
(org.dhatim.fastexcel.reader ReadableWorkbook | |
Sheet | |
Row | |
Cell | |
CellType | |
CellAddress))) | |
(defn ireducer
  "Wraps a java.util.Iterator in an object that is Iterable, reducible
  (clojure.core.protocols/CollReduce) and Seqable.

  The wrapper hands out the SAME underlying iterator every time, so it is
  single-pass: once it has been reduced, iterated, or seq'd, the elements
  are consumed.

  Reduction semantics follow `reduce`:
   - without init and no elements, returns (f);
   - without init, the first element seeds the accumulator;
   - a `reduced` accumulator stops the loop early and is unwrapped."
  [^java.util.Iterator iter]
  (reify
    java.lang.Iterable
    (iterator [this] iter)
    clojure.core.protocols/CollReduce
    (coll-reduce [this f]
      (if (.hasNext iter)
        ;; seed with the first element, then fall through to the init arity.
        (clojure.core.protocols/coll-reduce this f (.next iter))
        ;; empty input: the reduce contract is to call f with no arguments.
        (f)))
    (coll-reduce [this f init]
      (loop [acc init]
        (cond (reduced? acc)  @acc
              (.hasNext iter) (recur (f acc (.next iter)))
              :else           acc)))
    clojure.lang.Seqable
    (seq [this]
      (iterator-seq iter))))
;;note: numeric cells are coerced to double, not float.
;;floats create spooky comparisons: a float is not = to the
;;double that prints the same, which causes problems e.g. with
;;set values, and the mismatch propagates downstream.
(defn cell-value
  "Return the value of a cell using the accessor that matches its type.

  Arities:
    [cell]            looks the type up via (.getType cell).
    [cell cell-type]  uses the supplied raw CellType enum value (not the
                      keyword produced by `cell-type`).

  Returns:
    EMPTY   -> nil
    STRING  -> the string content
    NUMBER  -> a double
    BOOLEAN -> the boolean content
    FORMULA -> {:formula <formula text>}
    ERROR   -> {:error <raw value>}
    anything else -> :unsupported"
  ([^Cell cell] (cell-value cell (.getType cell)))
  ([^Cell cell cell-type]
   (condp = cell-type
     CellType/EMPTY   nil
     CellType/STRING  (.asString cell)
     ;; double keeps full double precision and compares sanely with
     ;; ordinary Clojure floating-point literals.
     CellType/NUMBER  (double (.asNumber cell))
     CellType/BOOLEAN (.asBoolean cell)
     CellType/FORMULA {:formula (.getFormula cell)}
     CellType/ERROR   {:error (.getValue cell)}
     :unsupported)))
(defn cell-type
  "Translate a fastexcel CellType enum constant into a keyword.
  Despite the parameter name, `cell` is the CellType value itself
  (callers pass `(.getType some-cell)`), not a Cell.
  Anything that is not one of the known constants maps to :unsupported."
  [cell]
  ;; enum constants are singletons, so identity comparison matches
  ;; exactly the same values as `=` does.
  (cond
    (identical? cell CellType/EMPTY)   :blank
    (identical? cell CellType/STRING)  :str
    (identical? cell CellType/NUMBER)  :numeric
    (identical? cell CellType/BOOLEAN) :bool
    (identical? cell CellType/FORMULA) :formula
    (identical? cell CellType/ERROR)   :error
    :else                              :unsupported))
(defn sheets
  "Return a seq over the sheets of workbook `wb`, built from the
  iterator of the workbook's sheet stream."
  [^ReadableWorkbook wb]
  (-> wb
      .getSheets
      .iterator
      iterator-seq))
;;Look up a single sheet in `wb`.
;;`sheetid` is either a number (passed to .getSheet) or a string
;;(passed to .findSheet). Both lookups are unwrapped with .get,
;;so a missing sheet surfaces as an exception from that call.
;;Returns nil when `sheetid` is neither a number nor a string.
(ds/defn sheet
  [^ReadableWorkbook wb
   sheetid :- ::generics/sheet-identity]
  (s/assert ::generics/sheet-identity sheetid)
  ;; dispatch on the identifier argument; testing `sheet` here would
  ;; test this function itself and never match either branch.
  (cond
    (number? sheetid) (.. wb (getSheet sheetid) get)
    (string? sheetid) (.. wb (findSheet sheetid) get)))
(defn rows
  "Open a row stream on `sheet` and expose its iterator as a
  single-pass reducible/seqable via `ireducer`."
  [^Sheet sheet]
  (-> sheet
      .openStream
      .iterator
      ireducer))
(defn cells
  "Expose the cells of `row` as a single-pass reducible/seqable by
  wrapping the row's iterator with `ireducer`."
  [^Row row]
  (-> row
      .iterator
      ireducer))
;;records are a little ugly, but they | |
;;are faster to construct even than | |
;;array maps. Out of curiosity, | |
;;how much faster are we with records? | |
(defrecord xlcell
  [type     ;; keyword produced by cell-type, e.g. :str or :numeric
   row-id   ;; row component of the cell's address
   col-id   ;; column component of the cell's address
   value])  ;; result of cell-value for the cell
(defrecord xlrow
  [n-cells  ;; cell count reported by the underlying Row
   row-id   ;; the Row's row number, decremented by one
   cells])  ;; vector of xlcell records kept for this row
;;counterintuitively, this variant is slower.
;;maybe the lexical binds slow us down.
;;note: cell-value dispatches on the raw CellType, so the raw type
;;(not the keyword returned by cell-type) has to be handed to it;
;;passing the keyword would make every value come back :unsupported.
#_(defn ->cell [^Cell cell]
    (when cell
      (let [^CellAddress addr (.getAddress cell)
            raw-type          (.getType cell)]
        (xlcell. (cell-type raw-type)
                 (.getRow addr)
                 (.getColumn addr)
                 (cell-value cell raw-type)))))
;;ironically the faster path, even though the address and the
;;type are each fetched twice.
(defn ->cell
  "Convert a fastexcel Cell into an xlcell record holding its type
  keyword, row index, column index and value. Returns nil when
  `cell` is nil."
  [^Cell cell]
  (if cell
    (xlcell. (-> cell .getType cell-type)
             (-> cell .getAddress .getRow)
             (-> cell .getAddress .getColumn)
             (cell-value cell))
    nil))
;;inlining the transducer operations eliminates | |
;;some function call overhead, since we're doing this | |
;;the most, there are decent savings here. | |
;;I tried several variants (even using arraylists and mutation | |
;;without coercion to vectors), but this ends up being fastest | |
;;the hinted direct method calls also eliminate some overhead. | |
(defn ->row
  "Convert a fastexcel Row into an xlrow record.

  Fields:
    n-cells  the cell count reported by the Row (blank cells included)
    row-id   the Row's row number minus one
    cells    vector of xlcell records, skipping nil cells and cells
             whose type is :blank"
  [^Row row]
  (xlrow. (.getCellCount row)
          (unchecked-dec (.getRowNum row))
          (->> (cells row)
               (reduce (fn [^clojure.lang.PersistentVector$TransientVector acc c]
                         (let [xc (->cell c)]
                           ;; xc is an xlcell record, so blankness has to be
                           ;; read from its :type field; comparing the record
                           ;; itself against :blank can never match.
                           (if (and xc (not (identical? :blank (:type xc))))
                             (.conj acc xc)
                             acc)))
                       (transient []))
               persistent!)))
;;Started looking at parallel processing options. I think, due to the
;;streaming nature, we're not getting much out of this. We "may" get
;;something out of a parallel core.async workflow though, with something
;;akin to a work-stealing queue with multiple workers. That could
;;work in the streaming context.
(comment | |
(require 'clojure.core.reducers) | |
;;credit | |
;;https://labs.uswitch.com/transducers-from-the-ground-up-the-practice/ | |
(defn ptransduce
  "Parallel flavour of transduce built on clojure.core.reducers/fold.
  `xform` is applied to the reducing function `rf`, `combinef` merges
  partial results (and supplies the seed when called with no args),
  and `coll` is first poured into a vector so it can be folded.
  The partition size is fixed at 512."
  [xform rf combinef coll]
  (let [reducef (xform rf)
        items   (into [] coll)]
    (clojure.core.reducers/fold 512 combinef reducef items)))
(defn add!
  "Append `x` to the mutable ArrayList `acc` and return `acc` itself,
  so it can serve as a reducing step."
  [^java.util.ArrayList acc x]
  (.add acc x)
  acc)
(defn process-rows
  "Experimental parallel row pipeline: converts every row of `sheet`
  with ->row, keeps rows that have at least one cell, and collects
  them into a java.util.ArrayList via ptransduce."
  [sheet]
  (let [xf          (comp (map ->row)
                          (filter (fn [r] (not-empty (:cells r)))))
        ;; zero-arity seeds a fresh list, two-arity pours the right
        ;; partial result into the left one.
        merge-lists (fn
                      ([] (java.util.ArrayList. 32))
                      ([l r] (reduce add! l r)))]
    (ptransduce xf add! merge-lists (rows sheet))))
;;pmap variant: too much coordination overhead.
#_(defn process-rows [sheet]
    (let [keep-row (fn [^clojure.lang.PersistentVector$TransientVector acc x]
                     (if (not-empty (:cells x))
                       (.conj acc x)
                       acc))]
      (persistent!
       (reduce keep-row
               (transient [])
               (pmap ->row (rows sheet))))))
(defn ->sheet
  "Experimental variant: describe `sheet` as a map of its name, its
  index, and the rows produced by process-rows."
  [^Sheet sheet]
  (let [sheet-name (.getName sheet)
        sheet-id   (.getIndex sheet)
        row-data   (process-rows sheet)]
    {:sheet-name sheet-name
     :sheet-id   sheet-id
     :rows       row-data}))
) | |
;;legacy | |
(defn ->sheet
  "Describe `sheet` as a map:
    :sheet-name  the sheet's name
    :sheet-id    the sheet's index
    :rows        vector of xlrow records, omitting rows whose :cells
                 are empty"
  [^Sheet sheet]
  (let [sheet-name (.getName sheet)
        sheet-id   (.getIndex sheet)
        non-empty  (comp (map ->row)
                         (filter (fn [r] (seq (:cells r)))))
        row-data   (into [] non-empty (rows sheet))]
    {:sheet-name sheet-name
     :sheet-id   sheet-id
     :rows       row-data}))
(defn ->book
  "Convert a ReadableWorkbook into plain Clojure data.

  Arities:
    [wb]         parses every sheet.
                 Returns {:sheet-count n :sheets [sheet-map ...]}.
    [wb sheetn]  parses only the sheet identified by `sheetn` (resolved
                 through `sheet`).
                 Returns {:sheet-count n :sheets sheet-map}; note that
                 :sheets is a single map here, not a vector."
  ([^ReadableWorkbook wb]
   (let [s (into [] (map ->sheet) (sheets wb))]
     {:sheet-count (count s)
      :sheets      s}))
  ([^ReadableWorkbook wb sheetn]
   ;; only the number of sheets is needed here, so count them without
   ;; parsing each one; just the requested sheet is converted.
   {:sheet-count (count (sheets wb))
    :sheets      (->sheet (sheet wb sheetn))}))
(defn read-workbook
  "Read the workbook at `path` and return it as Clojure data via ->book.

  `path` is anything clojure.java.io/input-stream accepts.
  `opt`, when given, is forwarded to ->book as the sheet identifier so
  only that sheet is parsed.

  Both the input stream and the ReadableWorkbook are closed before
  returning; ->book builds its result eagerly, so nothing in the
  returned data depends on the open workbook."
  ([path]
   (with-open [ef (clojure.java.io/input-stream path)
               wb (ReadableWorkbook. ef)]
     (->book wb)))
  ([path opt]
   (with-open [ef (clojure.java.io/input-stream path)
               wb (ReadableWorkbook. ef)]
     (->book wb opt))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment