Last active
December 7, 2017 20:46
-
-
Save torgeir/98c83a6a7a9fdce55b31ce5f0eff555e to your computer and use it in GitHub Desktop.
Script to diff two csv files and output the resulting diff to a csv file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env boot | |
;; https://github.com/boot-clj/boot#install | |
(set-env! | |
:dependencies '[[org.clojure/clojure "1.9.0-RC2"] | |
[org.clojure/data.csv "0.1.4"]]) | |
(require '[clojure.data.csv :as csv]
         '[clojure.java.io :as io]
         'clojure.string)
(defn lazy-csv-seq
  "Parse the csv `file` using `separator` and return a lazy seq of csv lines.

  The underlying reader is closed once the seq has been consumed to its end,
  or when reading a line throws. A caller that abandons the seq early leaves
  the reader open."
  [file separator]
  (let [reader  (io/reader file)
        csv-seq (csv/read-csv reader :separator separator)
        lazy    (fn lazy [wrapped]
                  (lazy-seq
                   (try
                     (if-let [s (seq wrapped)]
                       (cons (first s) (lazy (rest s)))
                       ;; `.close` returns nil, which also terminates the seq.
                       (.close reader))
                     (catch Throwable t
                       ;; Do not leak the reader when a line fails to parse.
                       (.close reader)
                       (throw t)))))]
    (lazy csv-seq)))
(defn lazy-read-csv
  "Parse the csv `file` into a map with two keys:

  :headers  the cells of the first csv line, each converted to a keyword
  :rows     a lazy seq with one map per remaining line, keyed by the headers

  Takes an optional `:separator` character; `;` is used when none is given."
  [file & {:keys [separator]}]
  (let [cells   (lazy-csv-seq file (or separator \;))
        headers (map keyword (first cells))]
    {:headers headers
     :rows    (map #(zipmap headers %) (rest cells))}))
(defn to-double
  "Parse `v` to a double, handling Norwegian decimals (a comma is accepted as
  the decimal separator). Returns nil if the value cannot be parsed."
  [v]
  (try
    (-> (str v)
        (clojure.string/replace "," ".")
        Double/parseDouble)
    ;; Any parse failure means 'not a number' to the caller.
    (catch Exception _ nil)))
(defn try-parse-double
  "Return `s` parsed as a double when `to-double` can parse it, otherwise
  return `s` itself unchanged."
  [s]
  (or (to-double s) s))
(defn join-cells
  "Join the given cells into a single display string, separated by `|`."
  [& cells]
  (clojure.string/join "|" cells))
(defmulti diff-cells
  "Diff csv cells. The dispatch value is derived from the cell contents:
  :double when every cell is a double, :string when every cell is a string,
  and :not-same-type otherwise."
  (fn [& cells]
    ;; condp evaluates (every? <pred> cells) for each predicate in turn.
    (condp every? cells
      double? :double
      string? :string
      :not-same-type)))
;; Diff double cells by subtraction: the first cell minus each following cell.
;; (defmethod has no docstring slot, so this is documented with a comment.)
(defmethod diff-cells :double [& cells]
  (apply - cells))
;; Diff string cells: when all cells are equal the value itself is returned,
;; otherwise a "DIFF(a|b)" marker listing the differing values.
;; (defmethod has no docstring slot, so this is documented with a comment.)
(defmethod diff-cells :string [& cells]
  (if (apply = cells)
    (first cells)
    ;; join-cells is variadic, so the cells must be spread with apply;
    ;; passing the seq directly would print the seq itself.
    (str "DIFF(" (apply join-cells cells) ")")))
;; Diff cells of unequal types, producing a "NOT-SAME-TYPE(a|b)" marker that
;; lists the differing values.
;; (defmethod has no docstring slot, so this is documented with a comment.)
(defmethod diff-cells :not-same-type [& cells]
  ;; join-cells is variadic, so the cells must be spread with apply.
  (str "NOT-SAME-TYPE(" (apply join-cells cells) ")"))
(defn map-values
  "Apply `f` to each value of the map `m`, keeping its keys. Returns a new map."
  [m f]
  (into {} (for [[k v] m]
             [k (f v)])))
(defn merge-rows
  "Merge the cells of two rows, given as a pair `[row-a row-b]`, by diffing
  each cell with its respective cell in the other row. Every cell is first
  passed through `try-parse-double` so numeric cells are diffed as doubles.

  A key present in only one of the rows keeps its (possibly parsed) value."
  [[row-a row-b]]
  (merge-with diff-cells
              (map-values row-a try-parse-double)
              (map-values row-b try-parse-double)))
(defn diff-maps
  "Compute the diff of several seqs of row maps and return it as a seq of
  rows, each a seq of cells ordered by `headers`. The first row holds the
  header names as strings.

  Rows are paired up by position. Pairing stops at the end of the shortest
  input, so surplus rows in a longer input do not appear in the result."
  [headers & maps]
  (let [combined-rows (apply map vector maps)
        diff-rows     (map merge-rows combined-rows)]
    (concat (list (map name headers))
            (map (fn [row]
                   ;; Each header is a keyword, used here to look up its cell.
                   (map #(% row) headers))
                 diff-rows))))
(defn row-str
  "Convert each cell in `row` to a double-quoted csv field. Returns a vector
  of strings. Double quotes inside a cell are escaped by doubling them, so
  the resulting field stays well-formed csv."
  [row]
  (mapv (fn [cell]
          (str \" (clojure.string/replace (str cell) "\"" "\"\"") \"))
        row))
(defn rows-to-csv
  "Convert `rows` to a single csv string: the cells of each row are quoted
  with `row-str` and joined with `;`, and the rows are joined with newlines."
  [rows]
  (->> rows
       (mapv row-str)
       (mapv (partial clojure.string/join ";"))
       (clojure.string/join "\n")))
(defn diff-csvs
  "Compute the difference between csvs, each a map with `:headers` and
  `:rows`, and return it as a csv string.

  Throws a RuntimeException if the headers of the csvs do not match."
  [& csvs]
  (let [headers (mapv :headers csvs)]
    ;; Compare the header seqs as wholes. A column-by-column comparison via
    ;; `map vector` stops at the shortest header list and would accept a csv
    ;; that has extra trailing columns.
    (if (apply = headers)
      (rows-to-csv (apply diff-maps (first headers) (map :rows csvs)))
      (throw (RuntimeException.
              (apply format "Headers does not match: %s vs %s" (map vec headers)))))))
(defn write-diff-csvs
  "Read the csv files `csv-a` and `csv-b` with the default separator and write
  the difference between them to `out-csv`."
  [csv-a csv-b out-csv]
  (spit out-csv (diff-csvs (lazy-read-csv csv-a)
                           (lazy-read-csv csv-b))))
(defn path-of
  "Get the absolute path of `filename` as a string."
  [filename]
  (-> filename io/file .getAbsolutePath))
(defn die
  "Print `msg` to stderr and exit the process with status 1."
  [msg]
  ;; Rebind *out* so println writes to stderr.
  (binding [*out* *err*]
    (println msg)
    (System/exit 1)))
(defn -main
  "Script entry point. Expects exactly three arguments: the two csv files to
  diff and the output file. Prints a usage message and exits otherwise."
  [& args]
  (when (not= 3 (count args))
    (die "usage: diff-csvs <file.csv> <file-to-diff.csv> <output-result.csv>"))
  ;; Resolve every argument to an absolute path before handing them on.
  (let [paths (map path-of args)]
    (apply write-diff-csvs paths))
  (println (apply format "Wrote diff of %s and %s to %s." args)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment