Skip to content

Instantly share code, notes, and snippets.

@genmeblog
Created March 6, 2020 11:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save genmeblog/9da71e2750f95cbc724baefbbfc813dd to your computer and use it in GitHub Desktop.
Save genmeblog/9da71e2750f95cbc724baefbbfc813dd to your computer and use it in GitHub Desktop.
Tablesaw join test
(ns tablesaw-test
  (:require [clojure.java.io]
            [clojure.string])
  (:import [tech.tablesaw.io.csv CsvReadOptions CsvReader]))
;; Single CsvReader instance, created once at load time and used by
;; `load-csv-data` for every read.
(def ^CsvReader csv-reader
  (new CsvReader))
(defn load-csv-data
  "Reads the CSV file named by `file` through the shared `csv-reader` and
  returns the result of its `.read` call.

  The optional second argument is a map of read options:
    :separator   - value handed to the options builder's `.separator`
                   (defaults to the comma character)
    :line-ending - value handed to `.lineEnding` (defaults to a newline string)
    :header?     - value handed to `.header` (defaults to true)

  Calling with only `file` uses all defaults."
  ([file]
   (load-csv-data file nil))
  ([^String file {:keys [separator line-ending header?]
                  :or   {separator \, line-ending "\n" header? true}}]
   (let [builder (CsvReadOptions/builder file)]
     ;; Configure the builder in place, then build the options and read.
     (doto builder
       (.separator separator)
       (.lineEnding line-ending)
       (.header header?))
     (.read csv-reader (.build builder)))))
(def lhs-fields
  "Keys of a left-hand-side row, in the order they are emitted as columns."
  [:size
   :day
   :operatorid
   :notes
   :more-notes
   :even-more-notes
   :how-can-there-be-more])
(defn customers
  "Returns a lazy sequence of `n` synthetic customer maps, indexed 0..n-1.

  With no arguments, `n` defaults to 100000 (the original fixed size).
  Passing a smaller `n` generates a reduced data set for quick experiments.

  Each map has :address, :gender, :address-id, :country-code, :first-name,
  :last-name, :city, :zip-code, :email and :huge-field. :gender and :city are
  random; :zip-code is the city string repeated five times. Note that the
  :huge-field string literal spans several source lines and therefore
  contains embedded newline characters."
  ([] (customers 100000))
  ([n]
   (for [i (range n)]
     (let [city (str (rand-int 10))]
       {:address      (str "Address" i)
        :gender       (rand-nth ["m" "f" "n"])
        :address-id   i
        :country-code "99"
        :first-name   (str "customer_" i "first")
        :last-name    (str "customer_" i "last")
        :city         city
        :zip-code     (clojure.string/join (repeat 5 city))
        :email        (str "customer_" i "@the-net")
        :huge-field   (str "this is a huge field containing a lot of dumb info for
bloat which will make the file so much larger for our poor machine how
unkind of us to do so in this day and age" i)}))))
(def rhs-fields
  "Keys of a right-hand-side row, in the order they are emitted as columns."
  [:operatorid :address :gender :address-id :country-code
   :first-name :last-name :city :zip-code :email])
(defn random-lhs
  "Returns a lazy sequence of `n` random left-hand-side rows.

  With no arguments, `n` defaults to 200000 (the original fixed size).
  Each row is a map with a random :size (\"s\", \"m\" or \"l\"), a random
  :day (stringified integer below 100000), a random :operatorid of the form
  \"op<k>op\" with k below 10000, and four constant filler text fields."
  ([] (random-lhs 200000))
  ([n]
   ;; The rows do not depend on their index, so `repeatedly` replaces the
   ;; original index loop whose binding was never used.
   (repeatedly n
               (fn []
                 {:size (rand-nth ["s" "m" "l"])
                  :day (str (rand-int 100000))
                  :operatorid (str "op" (rand-int 10000) "op")
                  :notes "THis is some bloated information we'll add in"
                  :more-notes "to make the table larger"
                  :even-more-notes "Also this will make things big as well"
                  :how-can-there-be-more "Yet another text field will add overhead jabroni"}))))
(defn random-rhs
  "Returns a lazy sequence of `n` random right-hand-side rows.

  With no arguments, `n` defaults to 500000 (the original fixed size).
  The full customer collection from `(customers)` is realized into a vector
  once per call; each row is a randomly chosen customer map with a random
  :operatorid of the form \"op<k>op\" (k below 10000) associated onto it."
  ([] (random-rhs 500000))
  ([n]
   (let [cs (vec (customers))]
     ;; Rows do not depend on their index, so `repeatedly` replaces the
     ;; original index loop whose binding was never used.
     (repeatedly n
                 (fn []
                   (assoc (rand-nth cs)
                          :operatorid (str "op" (rand-int 10000) "op")))))))
;; Write lhs.csv: one header line with the names of `lhs-fields`, then one
;; comma-joined line per row of `(random-lhs)`. Values are written verbatim,
;; with no CSV quoting or escaping.
;; The writer is type-hinted so the per-row `.write` calls are resolved at
;; compile time instead of by reflection.
(with-open [^java.io.Writer w (clojure.java.io/writer "lhs.csv")]
  (let [row-values (apply juxt lhs-fields)]
    (.write w (str (clojure.string/join "," (map name lhs-fields)) "\n"))
    (doseq [row (random-lhs)]
      (.write w (str (clojure.string/join "," (row-values row)) "\n")))))
;; Write rhs.csv: one header line with the names of `rhs-fields`, then one
;; comma-joined line per row of `(random-rhs)`. Only the keys listed in
;; `rhs-fields` are emitted. Values are written verbatim, with no CSV quoting
;; or escaping.
;; The writer is type-hinted so the per-row `.write` calls are resolved at
;; compile time instead of by reflection.
(with-open [^java.io.Writer w (clojure.java.io/writer "rhs.csv")]
  (let [row-values (apply juxt rhs-fields)]
    (.write w (str (clojure.string/join "," (map name rhs-fields)) "\n"))
    (doseq [row (random-rhs)]
      (.write w (str (clojure.string/join "," (row-values row)) "\n")))))
;; Load lhs.csv into the var `lhs`; `time` prints the elapsed time.
;; Recorded timing: 435 ms
(def lhs
  (time (load-csv-data "lhs.csv")))
;; Load rhs.csv into the var `rhs` in the same way.
;; Recorded timing: 469 ms
(def rhs
  (time (load-csv-data "rhs.csv")))
;; The "operatorid" column must be converted to a string column before the
;; join (the reader originally creates it as a text column). Each table var
;; is rebound to the result of replacing that column.
(def lhs
  (let [as-string (.asStringColumn (.column lhs "operatorid"))]
    (.replaceColumn lhs "operatorid" as-string)))
(def rhs
  (let [as-string (.asStringColumn (.column rhs "operatorid"))]
    (.replaceColumn rhs "operatorid" as-string)))
;; Inner-join `lhs` with `rhs` on the "operatorid" column and keep the result
;; in the var `result`; `time` prints the elapsed time.
;; Recorded timing: 40 s
(def result
  (time
    (let [join-cols (into-array String ["operatorid"])
          joiner    (.joinOn lhs join-cols)]
      (.inner joiner rhs "operatorid"))))
;; Number of rows produced by the join.
(.rowCount result)
;; => 9999083
;; Rebind the three table vars to nil so they no longer reference the loaded
;; and joined tables (presumably so the memory can be reclaimed in a
;; long-running REPL session -- confirm intent).
(def lhs nil)
(def rhs nil)
(def result nil)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment