Skip to content

Instantly share code, notes, and snippets.

@harold

harold/core.clj

Created Jul 20, 2020
Embed
What would you like to do?
(ns parallel-sum.core
  (:require [clojure.pprint :as pprint]
            [tech.ml.dataset :as ds]
            [tech.v2.datatype :as dt]
            [tech.v2.datatype.functional :as dfn]))
(defn- produce-data-csv!
  "Generates one million random rows and writes them to ./data.csv.

  Each row is a map with two string keys:
    \"letter\" - 65 plus a random integer in [0, 25), i.e. a code in 65..89
    \"value\"  - a random integer in [0, 1000)

  Returns whatever `ds/write-csv!` returns."
  []
  (let [random-row (fn []
                     {"letter" (+ 65 (rand-int 25))
                      "value" (rand-int 1000)})]
    ;; Build the rows lazily, convert them to a dataset, then dump to disk.
    (-> (repeatedly 1000000 random-row)
        (ds/->dataset)
        (ds/write-csv! "./data.csv"))))
(defn- profile-sums!
  "Loads ./data.csv and prints the per-letter sum of the \"value\" column,
  computed two different ways, timing each with `time`:

    Naive   - a plain `reduce` over the dataset's row maps, accumulating
              sums into a hash map keyed by letter.
    Dataset - `ds/group-by-column` on \"letter\", then a `dfn/reduce-+`
              over each group's \"value\" column.

  Both results are sorted by letter and pretty-printed so they can be
  compared by eye. Takes no arguments; called for its printed output."
  []
  (let [dataset (ds/->dataset "./data.csv")
        rows (ds/mapseq-reader dataset)
        ;; `fnil` lets the first occurrence of a letter start from 0
        ;; instead of trying to add to nil.
        safe-adder (fnil + 0)]
    (println "Naive:")
    (time
     (pprint/pprint
      (->> (reduce (fn [sums {:strs [letter value]}]
                     (update sums letter safe-adder value))
                   {}
                   rows)
           (sort-by first))))
    (println "Dataset:")
    (time
     (pprint/pprint
      (->> (ds/group-by-column "letter" dataset)
           (map (fn [[letter letter-ds]]
                  ;; The column is widened to :int64 before summing --
                  ;; presumably so the reduction is not performed in the
                  ;; narrower type the CSV parser inferred; TODO confirm.
                  (let [value-col (dt/set-datatype (letter-ds "value") :int64)]
                    [letter (dfn/reduce-+ value-col)])))
           (sort-by first))))))
;; Leiningen project definition for the parallel-sum experiment.
(defproject parallel-sum "0.1.0-SNAPSHOT"
;; The description and URL below are still the placeholder values (note the FIXME markers).
:description "FIXME: write description"
:url "http://example.com/FIXME"
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
:url "https://www.eclipse.org/legal/epl-2.0/"}
;; Only two dependencies are declared: Clojure itself and tech.ml.dataset.
:dependencies [[org.clojure/clojure "1.10.1"]
[techascent/tech.ml.dataset "3.07"]]
;; Start the REPL in the namespace that holds the profiling functions.
:repl-options {:init-ns parallel-sum.core})
parallel-sum.core> (produce-data-csv!)
nil
parallel-sum.core> (ds/->dataset "./data.csv")
./data.csv [1000000 2]:
| letter | value |
|--------|-------|
| 88 | 266 |
| 75 | 444 |
| 65 | 982 |
| 76 | 494 |
| 89 | 492 |
| 71 | 957 |
| 79 | 427 |
| 79 | 975 |
| 69 | 371 |
| 83 | 121 |
| 77 | 282 |
| 69 | 514 |
| 82 | 732 |
| 82 | 714 |
| 69 | 673 |
| 74 | 62 |
| 68 | 964 |
| 72 | 456 |
| 74 | 138 |
| 77 | 932 |
| 74 | 737 |
| 73 | 11 |
| 76 | 91 |
| 89 | 497 |
| 79 | 581 |
parallel-sum.core> (profile-sums!)
Naive:
([65 20103937]
[66 19949511]
[67 19887138]
[68 19687570]
[69 20099229]
[70 19826730]
[71 20090122]
[72 20137840]
[73 19921709]
[74 19915853]
[75 19777532]
[76 19958228]
[77 19925723]
[78 20073555]
[79 20020942]
[80 20126577]
[81 19983156]
[82 20227374]
[83 19928603]
[84 20018642]
[85 19825865]
[86 20142185]
[87 20110942]
[88 19800280]
[89 19804021])
"Elapsed time: 881.913549 msecs"
Dataset:
([65 20103937]
[66 19949511]
[67 19887138]
[68 19687570]
[69 20099229]
[70 19826730]
[71 20090122]
[72 20137840]
[73 19921709]
[74 19915853]
[75 19777532]
[76 19958228]
[77 19925723]
[78 20073555]
[79 20020942]
[80 20126577]
[81 19983156]
[82 20227374]
[83 19928603]
[84 20018642]
[85 19825865]
[86 20142185]
[87 20110942]
[88 19800280]
[89 19804021])
"Elapsed time: 48.433772 msecs"
nil
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment