Skip to content

Instantly share code, notes, and snippets.

@mccraigmccraig
Created September 27, 2017 10:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save mccraigmccraig/5207b5069bac7188fbb6dfce2d38c490 to your computer and use it in GitHub Desktop.
Save mccraigmccraig/5207b5069bac7188fbb6dfce2d38c490 to your computer and use it in GitHub Desktop.
(ns er-model.connectors.tsv
(:refer-clojure :exclude [line-seq])
(:require [clojure.java.io :as io]
[clojure.pprint :as pp]
[clojure.string :as str]
[clojure-csv.core :as csv]
[clojure.tools.logging :as log]))
;; Line-terminator characters recognised by the line splitter in this namespace.
(def ^:private lf \newline) ; U+000A, line feed
(def ^:private cr \return)  ; U+000D, carriage return
(defn- char-seq
  "Lazy sequence of the characters read, one at a time, from the reader `ios`.
  The reader is closed as soon as `.read` reports end of input (a negative
  value). If anything is thrown while reading, the reader is closed and the
  throwable is re-thrown."
  [^java.io.Reader ios]
  (lazy-seq
   (try
     (let [code (.read ios)]
       (if (neg? code)
         ;; end of input: release the reader and terminate the sequence
         (do (.close ios)
             nil)
         (cons (char code) (char-seq ios))))
     (catch Throwable t
       (.close ios)
       (throw t)))))
(defn- partial-line-seq
  "State machine behind `line-seq`. Returns a lazy sequence of
  [line partial-line rem] triples, where `line` is a completed line (or nil
  while one is still being accumulated), `partial-line` is the vector of
  characters collected so far and `rem` is the remaining character seq.

  <CR><LF> and <LF> are treated as line-ends, but not <CR>[^<LF>], unless
  cr-eol is true, since Excel exports embedded newlines as <CR>."
  [line partial-line rem cr-eol]
  (letfn [(finish-line [remaining]
            ;; the accumulated characters become the next completed line
            (partial-line-seq (str/join partial-line) [] remaining cr-eol))]
    (lazy-seq
     (cons [line partial-line rem]
           (if line
             ;; a full line was just emitted: start accumulating afresh,
             ;; without looking at `rem` yet
             (partial-line-seq nil [] rem cr-eol)
             (let [ch (first rem)]
               (cond
                 ;; CRLF
                 (and (= ch cr) (= (second rem) lf))
                 (finish-line (drop 2 rem))

                 ;; LF
                 (= ch lf)
                 (finish-line (rest rem))

                 ;; lone CR, only when enabled
                 (and cr-eol (= ch cr))
                 (finish-line (rest rem))

                 ;; any other character extends the current line
                 (not-empty rem)
                 (partial-line-seq nil
                                   (conj (or partial-line []) ch)
                                   (rest rem)
                                   cr-eol)

                 ;; input exhausted with characters pending: final line
                 (not-empty partial-line)
                 (finish-line nil)

                 ;; EOF
                 :else
                 nil)))))))
(defn- line-seq
  "Lazy sequence of the lines contained in a seq of characters.
  <CR><LF> and <LF> end a line; a lone <CR> does not (Excel exports embedded
  newlines as <CR>) unless the optional map argument has a truthy :cr-eol."
  [char-seq & [{:keys [cr-eol]}]]
  ;; every state of the splitter is a [line partial-line rem] triple; only
  ;; the states carrying a completed line are of interest
  (for [[line] (partial-line-seq nil nil char-seq cr-eol)
        :when line]
    line))
(defn- open-lineseq
  "Open a file and return a lazy sequence of its lines.
  f        - the file / filename, opened with `io/reader`
  encoding - the optional encoding, which defaults to UTF-8
  cr-eol   - when truthy, a lone CR also terminates a line"
  [f & {:keys [encoding cr-eol]}]
  (let [rdr   (io/reader f :encoding encoding)
        chars (char-seq rdr)]
    (line-seq chars {:cr-eol cr-eol})))
(defn header-key
  "Make a record key from a column-header string:
  trim, lower-case, remove comments inside parentheses, remove a leading and
  a trailing double quote, turn runs of whitespace into `-`, drop every
  character that is not alphanumeric, `_` or `-`, and turn the result into a
  keyword. Returns nil when `s` is nil.

  Each parenthesised comment is removed on its own, so text sitting between
  two separate comments survives: \"Price (USD) net (approx)\" gives
  :price-net."
  [s]
  (some-> s
          str/trim
          str/lower-case
          ;; [^)]* rather than a greedy .* : a single match must not run from
          ;; the first opening parenthesis to the last closing one
          (str/replace #"\([^)]*\)" " ")
          (str/replace #"^\"" "")
          (str/replace #"\"$" "")
          (str/replace #"^\s+" "")
          (str/replace #"\s+$" "")
          (str/replace #"\s+" "-")
          (str/replace #"[^\s\p{Alnum}_-]" "")
          keyword))
(defn row->header-keys
  "Split a raw header line on tab characters and convert every column title
  with `header-key`. Always returns a vector; a nil `row` gives an empty one."
  [row]
  (if (some? row)
    (mapv header-key (str/split row #"\t"))
    []))
(defn- excel-escape
  "Opening a file containing values like \"01-01-1999\" in Excel will turn
  them into numbers, which will break on import. To prevent this, a string
  consisting only of digits and the characters `+`, `,`, `-` and `.` gets an
  underscore prepended. Any other value is returned unchanged."
  [v]
  (cond
    (not (string? v))          v
    ;; inside the character class `+-.` is a range; it spans exactly + , - .
    (re-find #"^[\d+-.,]+$" v) (str "_" v)
    :else                      v))
(defn- undo-excel-escape
  "Inverse of `excel-escape`: a string that is an underscore followed only by
  digits and the characters `+`, `,`, `-` and `.` loses its leading
  underscore. Any other value is returned unchanged."
  [v]
  (if-not (and (string? v)
               (re-find #"^_[\d+-.,]+$" v))
    v
    (subs v 1)))
(defn parse-rows
  "Parse rows, where each row is a sequence of column values, into a lazy
  sequence of record maps. Will happily work with the output of clojure-csv.

  Options:
  - col-keys : record keys, one per column. When absent, the first row is
    converted with `header-key` and treated as the header instead of data.
  - col-parsers : map of {key parser-fn}; columns without an entry are kept
    as they are.
  - escaped-for-excel? : when truthy every value goes through
    `undo-excel-escape` first.
  - skip-rows : accepted in the options map but not applied by this function.

  Columns whose key is nil are left out of the record, and a parsed value
  equal to \"\" is stored as nil."
  ([rows] (parse-rows rows {}))
  ([rows {:keys [col-keys skip-rows col-parsers escaped-for-excel?]}]
   (let [header      (or col-keys (mapv header-key (first rows)))
         body        (cond->> (if col-keys rows (rest rows))
                       escaped-for-excel? (map (fn [row] (map undo-excel-escape row))))
         add-column  (fn [record [k raw]]
                       (let [parse (or (get col-parsers k) identity)
                             value (parse raw)]
                         (cond-> record
                           k (assoc k (when-not (= "" value) value)))))
         row->record (fn [row]
                       (reduce add-column {} (map vector header row)))]
     (map row->record body))))
(defn- parse-tsv
  "Turn a sequence of TSV lines into records via `parse-rows`.
  Every line is split on tab characters, each cell is trimmed and an empty
  cell becomes nil. The first `skip-rows` lines (default 0) are dropped
  before the rows, together with `opts`, are handed to `parse-rows`.

  The split uses a negative limit so that trailing empty cells are kept.
  Without it a line ending in empty columns would yield a shorter row, and
  the resulting record would lack those keys altogether instead of mapping
  them to nil."
  ([seq-of-lines {:keys [skip-rows] :as opts}]
   (let [split-line (fn [l]
                      (->> (str/split l #"\t" -1)
                           (map str/trim)
                           (map not-empty)))
         rows       (->> seq-of-lines
                         (map split-line)
                         (drop (or skip-rows 0)))]
     (parse-rows rows opts))))
(defn parse-tsv-string
  "Parse TSV text held in the string `s`: the string is split on newline
  characters and the resulting lines are parsed with `parse-tsv` using `opts`."
  [s opts]
  (-> s
      (str/split #"\n")
      (parse-tsv opts)))
(defn parse-tsv-file
  "Parse a tsv file into a lazy sequence of record hashes.
  - f : the tsv file
  - encoding : optional file encoding, defaults to UTF-8
  - skip-rows : number of header rows to skip
  - col-keys : optional seq of column record keys. defaults
               to using first row of data for keys
  - col-parsers : optional map of {col parser-fn} to parse
                  column data
  - cr-eol : treat CR as an EOL character
  - escaped-for-excel? : remove a prepended underscore char
                         from number-like values."
  ([f] (parse-tsv-file f {}))
  ([f {:keys [encoding col-keys skip-rows col-parsers cr-eol] :as opts}]
   ;; only encoding and cr-eol are consumed here; the whole option map is
   ;; passed on to parse-tsv
   (-> (open-lineseq f :encoding encoding :cr-eol cr-eol)
       (parse-tsv opts))))
(defn parse-tsv-col-keys
  "Return the vector of record keys derived from the header row of the tsv
  file `f` (see `row->header-keys`).
  - encoding : optional file encoding, defaults to UTF-8
  - skip-rows : number of rows to skip before the header row, default 0
  - cr-eol : treat CR as an EOL character

  The reader is opened and closed here: only the header line is consumed,
  and `char-seq` closes its reader only at end of input or on error, so
  going through `open-lineseq` would leave the file open."
  ([f] (parse-tsv-col-keys f {}))
  ([f {:keys [encoding col-keys skip-rows col-parsers cr-eol] :as opts}]
   (with-open [rdr (io/reader f :encoding encoding)]
     ;; row->header-keys builds a vector, so nothing lazy escapes with-open
     (->> (line-seq (char-seq rdr) {:cr-eol cr-eol})
          (drop (or skip-rows 0))
          first
          row->header-keys))))
(defn- tsv-line
  "Render `record` as the pieces of one TSV line. For each key in `col-keys`
  the record value is looked up and converted with `str` (so a missing or
  nil value becomes the empty string); when `opts` has a truthy
  :escaped-for-excel? it is also passed through `excel-escape`. Returns a
  lazy seq of the cell strings with a tab string between them and no line
  terminator."
  [col-keys record opts]
  (let [escape (if (:escaped-for-excel? opts) excel-escape identity)
        cell   (fn [k] (escape (str (get record k))))]
    (interpose "\t" (map cell col-keys))))
(defn- tsv-lines
  "Flat lazy seq of the strings that make up the TSV body for `records`:
  the pieces of each record's `tsv-line`, with a newline string between
  consecutive records and none after the last one."
  [col-keys records opts]
  (let [rendered (map #(tsv-line col-keys % opts) records)]
    (apply concat (interpose ["\n"] rendered))))
(defn tsv-record-writer
  "Return a function which writes one record as a TSV line, followed by a
  newline, to the writer `out` on each invocation.
  - col-keys : the record keys to write, in column order
  - header : when truthy the tab-separated column names are written to `out`
    immediately, i.e. when this function is called and therefore before the
    first record
  The whole option map is also passed to `tsv-line`.

  Nothing is printed to stdout; only `out` is written to."
  [out {:keys [col-keys header] :as opts}]
  (when header
    (doseq [c (interpose "\t" (map name col-keys))]
      (.write out c))
    (.write out "\n"))
  (fn [record]
    (doseq [c (tsv-line col-keys record opts)]
      (.write out c))
    (.write out "\n")))
(defn write-tsv-writer
  "Write `records` as TSV to the supplied writer `out`.
  - col-keys : the record keys to write, in column order; defaults to the
    keys of the first record
  - header : when truthy a line with the column names is written first
  The whole option map is also passed to `tsv-lines`. Records are separated
  by a newline; no newline is written after the last record.

  Nothing is printed to stdout; only `out` is written to."
  ([out records] (write-tsv-writer out records {}))
  ([out records {:keys [encoding col-keys header] :as opts}]
   (let [cols  (or col-keys (keys (first records)))
         lines (tsv-lines cols records opts)]
     (when header
       (doseq [c (interpose "\t" (map name cols))]
         (.write out c))
       (.write out "\n"))
     (doseq [l lines]
       (.write out l)))))
(defn write-tsv-file
  "Write `records` as TSV to the file `f` using `write-tsv-writer`.
  The file is opened with the :encoding option, or UTF-8 when none is
  given, and the writer is closed when writing is done. The whole option map
  is passed on to `write-tsv-writer`."
  ([f records] (write-tsv-file f records {}))
  ([f records {:keys [encoding] :as opts}]
   (let [charset (or encoding "UTF-8")]
     (with-open [out (io/writer f :encoding charset)]
       (write-tsv-writer out records opts)))))
(defn- normalize-eol-to-temp-file
  "Copy the lines of `f` into a newly created temp file and return that
  java.io.File. The lines are read with :cr-eol true, so a lone CR ends a
  line as well, and each one is written followed by the writer's `.newLine`.
  The temp file is not deleted here."
  [f encoding]
  ;; the source is opened before the temp file is created, as before
  (let [lines    (open-lineseq f :encoding encoding :cr-eol true)
        tmp-file (java.io.File/createTempFile "csv-import" ".tmp")
        copy-to  (fn [w]
                   (run! (fn [line]
                           (.write w line)
                           (.newLine w))
                         lines))]
    (with-open [w (io/writer tmp-file :encoding encoding)]
      (copy-to w))
    tmp-file))
(defn parse-csv-file
  "Like parse-tsv-file, but for csv.
  - f : the csv file
  - encoding : optional file encoding
  - skip-rows : number of rows to drop before parsing, default 0
  - col-keys : optional seq of column record keys; defaults to using the
    first remaining row for keys
  - col-parsers : optional map of {col parser-fn}
  - cr-eol : when truthy the file is first copied to a temp file with
    normalized line ends (see `normalize-eol-to-temp-file`) and that copy is
    parsed; deleting the copy is attempted before this function returns
  - `:escaped-for-excel?` (boolean) - remove a prepended underscore char
    from number-like values."
  ([f] (parse-csv-file f {}))
  ([f {:keys [encoding col-keys skip-rows col-parsers cr-eol escaped-for-excel?] :as opts}]
   (let [tmp-file (when cr-eol
                    (normalize-eol-to-temp-file f encoding))]
     (try
       (let [rows (-> (if cr-eol tmp-file f)
                      (io/reader :encoding encoding)
                      csv/parse-csv)]
         ;; parse-rows does not apply skip-rows itself, so the rows are
         ;; dropped here, mirroring what parse-tsv does for tsv input
         (parse-rows (drop (or skip-rows 0) rows) opts))
       (finally
         (when tmp-file
           (log/debug "tmp file deleted?:" (.delete tmp-file))))))))
(defn write-csv-file
  "Write `records` as CSV to the file `f`.
  - encoding : file encoding, UTF-8 when not given
  - col-keys : the record keys to write, in column order; defaults to the
    keys of the first record
  - header : write a comma-separated line of column names first; defaults
    to true
  - escaped-for-excel? : pass every value through `excel-escape`
  Each record is rendered on its own with clojure-csv's `write-csv`."
  ([f records] (write-csv-file f records {}))
  ([f records {:keys [encoding col-keys header escaped-for-excel?] :or {header true}}]
   (let [cols      (or col-keys (keys (first records)))
         escape    (if escaped-for-excel? excel-escape identity)
         cell      (fn [record k]
                     (let [v (escape (get record k))]
                       ;; clojure-csv borks on nil columns!
                       (if (some? v) v "")))
         row->line (fn [record]
                     (csv/write-csv [(map #(cell record %) cols)]
                                    :force-quote false))
         lines     (map row->line records)
         charset   (or encoding "UTF-8")]
     (with-open [out (io/writer f :encoding charset)]
       (when header
         (doseq [c (interpose "," (map name cols))]
           (.write out c))
         (.write out "\n"))
       (doseq [l lines]
         (.write out l))))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment