Skip to content

Instantly share code, notes, and snippets.

@torgeir
Last active December 7, 2017 20:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save torgeir/98c83a6a7a9fdce55b31ce5f0eff555e to your computer and use it in GitHub Desktop.
Save torgeir/98c83a6a7a9fdce55b31ce5f0eff555e to your computer and use it in GitHub Desktop.
Script to diff two csv files and output the resulting diff to a csv file.
#!/usr/bin/env boot
;; https://github.com/boot-clj/boot#install
(set-env!
:dependencies '[[org.clojure/clojure "1.9.0-RC2"]
[org.clojure/data.csv "0.1.4"]])
(require '[clojure.data.csv :as csv]
         '[clojure.java.io :as io]
         '[clojure.string])
(defn lazy-csv-seq
  "Parse the csv `file` and return a lazy seq of its rows.

  `file` is anything `io/reader` accepts and `separator` is the character
  that delimits cells. Rows are passed through exactly as `csv/read-csv`
  produces them.

  The underlying reader is closed when the seq has been consumed to its end,
  or when reading the next row throws. A seq that is abandoned half-way keeps
  its reader open."
  [file separator]
  (let [reader (io/reader file)
        step   (fn step [rows]
                 (lazy-seq
                  (if-let [s (try
                               ;; Realizing the next row is where a parse
                               ;; error surfaces.
                               (seq rows)
                               (catch Throwable t
                                 ;; Do not leak the file handle on bad input.
                                 (.close reader)
                                 (throw t)))]
                    (cons (first s) (step (rest s)))
                    ;; End of input: release the reader and end the seq.
                    (do (.close reader) nil))))]
    (step (csv/read-csv reader :separator separator))))
(defn lazy-read-csv
  "Parse the csv `file` into a map with two keys:

  :headers - the cells of the first line, converted to keywords
  :rows    - a lazy seq with one map per remaining line, keyed by header

  Accepts an optional `:separator` character; a semicolon is used when none
  is given."
  [file & {:keys [separator]}]
  (let [cells   (lazy-csv-seq file (or separator \;))
        headers (map keyword (first cells))
        ->row   (fn [line] (zipmap headers line))]
    {:headers headers
     :rows    (map ->row (rest cells))}))
(defn to-double
  "Parse `v` to a double. `v` is stringified first and every comma is replaced
  by a dot, so norwegian style decimals such as 1,5 are understood.

  Returns nil when the resulting text cannot be parsed as a double."
  [v]
  (try
    (Double/parseDouble (clojure.string/replace (str v) "," "."))
    ;; Unparsable input is an expected case here, not an error.
    (catch Exception _ nil)))
(defn try-parse-double
  "Return `s` parsed as a double when `to-double` can parse it, otherwise
  return `s` unchanged."
  [s]
  (if-some [parsed (to-double s)]
    parsed
    s))
(defn join-cells
  "Join `cells` into one display string, separating them with a pipe."
  [& cells]
  (clojure.string/join "|" cells))

(defmulti diff-cells
  "Diff csv cells. Handles double and string type cells. Dispatches based on
  cell contents: :double when every cell is a double, :string when every cell
  is a string and :not-same-type otherwise."
  (fn [& cells]
    (cond
      (every? double? cells) :double
      (every? string? cells) :string
      :else :not-same-type)))

;; Doubles: diff one cell's value with another's by subtraction.
(defmethod diff-cells :double [& cells]
  (apply - cells))

;; Strings: yield the value itself when all cells are equal, otherwise a
;; DIFF(...) marker listing the differing values.
(defmethod diff-cells :string [& cells]
  (if (apply = cells)
    (first cells)
    ;; `join-cells` is variadic, so the cells must be spread with `apply`;
    ;; passing the seq as a single argument would print the seq itself.
    (str "DIFF(" (apply join-cells cells) ")")))

;; Mixed types: yield a NOT-SAME-TYPE(...) marker listing the values.
(defmethod diff-cells :not-same-type [& cells]
  (str "NOT-SAME-TYPE(" (apply join-cells cells) ")"))
(defn map-values
  "Return a map with the keys of `m` where every value v is replaced by
  (f v)."
  [m f]
  (into {}
        (map (fn [[k v]] [k (f v)]))
        m))
(defn merge-rows
  "Merge a pair of rows into one row of diffs.

  Takes a single sequential argument holding `row-a` and `row-b`. Every cell
  of both rows is first run through `try-parse-double`; cells found under the
  same key in both rows are then combined with `diff-cells`. A key present in
  only one row keeps its (possibly parsed) value."
  [[row-a row-b]]
  (let [parse-row (fn [row] (map-values row try-parse-double))]
    (merge-with diff-cells
                (parse-row row-a)
                (parse-row row-b))))
(defn diff-maps
  "Diff seqs of row maps and return a seq of output rows.

  The first returned row holds the names of `headers`. Each following row
  holds the diffed cells of one group of input rows, looked up in `headers`
  order. `maps` are the row seqs to compare: rows are grouped by position and
  grouping stops at the end of the shortest seq. Each group is diffed with
  `merge-rows`."
  [headers & maps]
  (let [header-row (map name headers)
        row-groups (apply map vector maps)
        ;; Headers are invoked as functions on the diffed row to pick cells.
        ->cells    (fn [diffed] (map #(% diffed) headers))]
    (cons header-row
          (map (comp ->cells merge-rows) row-groups))))
(defn row-str
  "Convert each cell of `row` to a csv field wrapped in double quotes and
  return the fields as a vector of strings.

  Double quotes inside a cell are doubled, so a cell that contains a quote
  character does not end its field early."
  [row]
  (mapv (fn [cell]
          (str \" (clojure.string/replace (str cell) "\"" "\"\"") \"))
        row))
(defn rows-to-csv
  "Render `rows` as one csv string. The cells of a row are formatted with
  `row-str` and separated by semicolons; rows are separated by newlines,
  without a trailing newline."
  [rows]
  (let [->line (fn [row] (clojure.string/join ";" (row-str row)))]
    (clojure.string/join "\n" (map ->line rows))))
(defn diff-csvs
  "Compute the difference between `csvs` and return it as a csv string.

  Every csv is a map with a :headers seq and a :rows seq. The headers of the
  first csv decide the columns of the result.

  Throws a RuntimeException when the header lists are not equal, which
  includes the case where one list is only a prefix of another."
  [& csvs]
  (let [headers (mapv :headers csvs)]
    ;; Compare the header lists as wholes. Zipping them column by column
    ;; stops at the shortest list and would overlook extra trailing columns.
    (if (apply = headers)
      (rows-to-csv (apply diff-maps (first headers) (map :rows csvs)))
      (throw (RuntimeException.
              (apply format "Headers does not match: %s vs %s" (map vec headers)))))))
(defn write-diff-csvs
  "Diff the csv files `csv-a` and `csv-b` and write the resulting csv text to
  `out-csv`. Both inputs are read with `lazy-read-csv` without an explicit
  separator."
  [csv-a csv-b out-csv]
  (let [a (lazy-read-csv csv-a)
        b (lazy-read-csv csv-b)]
    (spit out-csv (diff-csvs a b))))
(defn path-of
  "Return the absolute path of `filename` as a string."
  [filename]
  (.getAbsolutePath (io/file filename)))
(defn die
  "Print `msg` to stderr and exit the process with status 1."
  [msg]
  ;; Rebind *out* so that println writes to stderr.
  (binding [*out* *err*]
    (println msg))
  (System/exit 1))
(defn -main
  "Script entry point. Expects exactly three arguments: the two csv files to
  compare and the file to write the diff to. Prints a usage message and exits
  through `die` on any other argument count."
  [& args]
  (when (not= 3 (count args))
    (die "usage: diff-csvs <file.csv> <file-to-diff.csv> <output-result.csv>"))
  ;; The files are handled by absolute path; the message echoes the arguments
  ;; as they were given.
  (let [paths (map path-of args)]
    (apply write-diff-csvs paths))
  (let [summary (apply format "Wrote diff of %s and %s to %s." args)]
    (println summary)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment