-
-
Save mccraigmccraig/5207b5069bac7188fbb6dfce2d38c490 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns er-model.connectors.tsv | |
(:refer-clojure :exclude [line-seq]) | |
(:require [clojure.java.io :as io] | |
[clojure.pprint :as pp] | |
[clojure.string :as str] | |
[clojure-csv.core :as csv] | |
[clojure.tools.logging :as log])) | |
;; Line-terminator characters recognised by the line splitter below.
(def ^:private lf \newline)   ; U+000A, line feed
(def ^:private cr \return)    ; U+000D, carriage return
(defn- char-seq
  "Lazy sequence of the characters read one at a time from the Reader `ios`.

  The reader is closed once `.read` signals end-of-stream (a negative
  value). If anything is thrown while producing the next element, the reader
  is closed and the Throwable is re-thrown."
  [^java.io.Reader ios]
  (lazy-seq
    (try
      (let [code (.read ios)]
        (if (neg? code)
          ;; end of stream: release the reader and terminate the sequence
          (do
            (.close ios)
            nil)
          (cons (char code) (char-seq ios))))
      (catch Throwable t
        (.close ios)
        (throw t)))))
(defn- partial-line-seq
  "Lazy state machine that splits a seq of characters into lines.

  Every element of the result is a triple `[line partial-line rem]`:
    line         - a completed line (string), or nil while still accumulating
    partial-line - vector of characters gathered for the current line
    rem          - the characters not yet consumed

  <CR><LF> and <LF> always end a line. A lone <CR> ends a line only when
  `cr-eol` is truthy, since Excel exports embedded newlines as <CR>.
  A final line without a terminator is still emitted."
  [line partial-line rem cr-eol]
  (lazy-seq
    (let [finish-line (fn [remaining]
                        (partial-line-seq (str/join partial-line) [] remaining cr-eol))
          tail (if line
                 ;; a full line has just been emitted: start accumulating again
                 ;; (rem is deliberately not inspected here, to stay lazy)
                 (partial-line-seq nil [] rem cr-eol)
                 (let [c (first rem)]
                   (cond
                     ;; CRLF
                     (and (= c cr) (= (second rem) lf))
                     (finish-line (drop 2 rem))

                     ;; LF
                     (= c lf)
                     (finish-line (rest rem))

                     ;; bare CR, only when enabled
                     (and cr-eol (= c cr))
                     (finish-line (rest rem))

                     ;; ordinary character: append it to the current line
                     (not-empty rem)
                     (partial-line-seq nil
                                       (conj (or partial-line []) c)
                                       (rest rem)
                                       cr-eol)

                     ;; input exhausted with an unterminated last line
                     (not-empty partial-line)
                     (finish-line nil)

                     ;; EOF
                     :else
                     nil)))]
      (cons [line partial-line rem] tail))))
(defn- line-seq
  "Lazy seq of line strings built from a seq of characters.

  <CR><LF> and <LF> terminate a line. A lone <CR> terminates a line only
  when the optional trailing map has a truthy `:cr-eol`, since Excel exports
  embedded newlines as <CR>."
  [char-seq & [{:keys [cr-eol]}]]
  ;; each state triple carries the completed line (or nil) in first position
  (keep first (partial-line-seq nil nil char-seq cr-eol)))
(defn- open-lineseq
  "Open a file and return a lazy sequence of its lines.

  f         - the file / filename (anything `io/reader` accepts)
  :encoding - optional encoding, which defaults to UTF-8
  :cr-eol   - when truthy, a lone <CR> also ends a line"
  [f & {:keys [encoding cr-eol]}]
  (let [rdr   (io/reader f :encoding encoding)
        chars (char-seq rdr)]
    (line-seq chars {:cr-eol cr-eol})))
(defn header-key
  "Turn a column-header string into a record key (a keyword).

  Steps: trim, lower-case, replace each parenthesised comment with a space,
  strip one leading and one trailing double quote, trim again, collapse
  whitespace runs to a single `-`, drop every remaining character that is
  not alphanumeric, `_` or `-`, and finally convert to a keyword.

  Returns nil when `s` is nil.

  Each `( ... )` group is removed on its own, so header text sitting between
  two comments survives: \"Price (USD) per unit (kg)\" => :price-per-unit."
  [s]
  (some-> s
          str/trim
          str/lower-case
          ;; `[^)]*` stops at the first closing paren; a greedy `.*` would
          ;; swallow everything up to the LAST `)` in the header
          (str/replace #"\([^)]*\)" " ")
          (str/replace #"^\"" "")
          (str/replace #"\"$" "")
          (str/replace #"^\s+" "")
          (str/replace #"\s+$" "")
          (str/replace #"\s+" "-")
          (str/replace #"[^\s\p{Alnum}_-]" "")
          keyword))
(defn row->header-keys
  "Split a tab-separated header line into a vector of record keys, one
  `header-key` per column. A nil `row` yields an empty vector."
  [row]
  (let [columns (some-> row (str/split #"\t"))]
    (mapv header-key columns)))
(defn- excel-escape
  "Opening a file containing values like \"01-01-1999\" in Excel
  will turn them into numbers, which will break on import.
  We prepend the underscore char to prevent this.

  Only strings made up entirely of digits and the characters `+ - . ,`
  are prefixed; any other value is returned unchanged."
  [v]
  (cond->> v
    (and (string? v) (re-find #"^[\d+-.,]+$" v)) (str "_")))
(defn- undo-excel-escape
  "Inverse of `excel-escape`: drop the leading underscore from a string that
  consists of `_` followed only by digits and the characters `+ - . ,`.
  Any other value is returned unchanged."
  [v]
  (cond-> v
    (and (string? v) (re-find #"^_[\d+-.,]+$" v)) (subs 1)))
(defn parse-rows
  "Turn rows (each a sequential collection of column values) into a lazy seq
  of record maps. Will happily work with the output of clojure-csv :)

  Options:
    :col-keys           - keys to use for the columns; when absent the first
                          row is consumed as the header and converted with
                          `header-key`
    :col-parsers        - map of {key parser-fn} applied to the raw column value
    :escaped-for-excel? - undo `excel-escape` on every value before parsing

  Columns whose key is nil are left out of the record, and a parsed value
  equal to \"\" is stored as nil."
  ([rows] (parse-rows rows {}))
  ([rows {:keys [col-keys skip-rows col-parsers escaped-for-excel?]}]
   (let [header   (or col-keys (mapv header-key (first rows)))
         body     (cond->> (if col-keys rows (rest rows))
                    escaped-for-excel? (map (fn [row] (map undo-excel-escape row))))
         add-col  (fn [record [k raw]]
                    (let [parser (or (get col-parsers k) identity)
                          parsed (parser raw)]
                      (cond-> record
                        k (assoc k (when-not (= "" parsed) parsed)))))
         ->record (fn [row]
                    ;; pairing stops at the shorter of header / row
                    (reduce add-col {} (map vector header row)))]
     (map ->record body))))
(defn- parse-tsv
  "Parse a seq of tab-separated lines into records via `parse-rows`.

  Each line is split on tab; every cell is trimmed and empty cells become
  nil. The first `skip-rows` lines (default 0) are dropped before the rows
  are handed to `parse-rows` together with `opts`."
  [seq-of-lines {:keys [col-keys skip-rows col-parsers escaped-for-excel?] :as opts}]
  (let [clean-cell (comp not-empty str/trim)
        line->row  (fn [l] (map clean-cell (str/split l #"\t")))
        rows       (drop (or skip-rows 0) (map line->row seq-of-lines))]
    (parse-rows rows opts)))
(defn parse-tsv-string
  "Parse TSV text held in the string `s`. The text is split into lines on
  the LF character and the lines are parsed with `opts`, the same option map
  that `parse-tsv-file` passes to the shared TSV parser."
  [s opts]
  (-> s
      (str/split #"\n")
      (parse-tsv opts)))
(defn parse-tsv-file
  "Parse a TSV file into a lazy sequence of record maps.

  - f                  : the tsv file
  - encoding           : optional file encoding, defaults to UTF-8
  - skip-rows          : number of leading rows to skip
  - col-keys           : optional seq of column record keys; defaults to
                         using the first (non-skipped) row for keys
  - col-parsers        : optional map of {col parser-fn} to parse column data
  - cr-eol             : treat a lone CR as an EOL character
  - escaped-for-excel? : remove a prepended underscore char from
                         number-like values"
  ([f] (parse-tsv-file f {}))
  ([f {:keys [encoding col-keys skip-rows col-parsers cr-eol] :as opts}]
   (-> (open-lineseq f :encoding encoding :cr-eol cr-eol)
       (parse-tsv opts))))
(defn parse-tsv-col-keys
  "Return the vector of column keys found in the header line of a TSV file.

  - f         : the tsv file
  - encoding  : optional file encoding, defaults to UTF-8
  - skip-rows : number of leading lines to skip before the header line, so
                the keys match what `parse-tsv-file` derives for the same
                options (default 0)
  - cr-eol    : treat a lone CR as an EOL character

  Only the header line is needed, so the reader is closed before returning
  instead of being left open until the rest of the file is consumed."
  ([f] (parse-tsv-col-keys f {}))
  ([f {:keys [encoding col-keys skip-rows col-parsers cr-eol] :as opts}]
   ;; char-seq closes the reader only at EOF; since we stop after one line we
   ;; must close it ourselves. The result is a fully realised vector, so
   ;; nothing lazy escapes the with-open scope.
   (with-open [rdr (io/reader f :encoding encoding)]
     (->> (line-seq (char-seq rdr) {:cr-eol cr-eol})
          (drop (or skip-rows 0))
          first
          row->header-keys))))
(defn- tsv-line
  "Build the output tokens for one record: the value of every key in
  `col-keys`, converted with `str` (so nil becomes \"\"), separated by tab
  strings. When `opts` has a truthy `:escaped-for-excel?`, each value is
  additionally passed through `excel-escape`."
  [col-keys record opts]
  (let [escape (if (:escaped-for-excel? opts) excel-escape identity)
        cell   (fn [k] (escape (str (get record k))))]
    (interpose "\t" (map cell col-keys))))
(defn- tsv-lines
  "Flat lazy seq of output tokens for all `records`: the `tsv-line` tokens of
  each record, with a \"\\n\" token between consecutive records (none after
  the last one)."
  [col-keys records opts]
  (let [record->tokens (fn [r] (tsv-line col-keys r opts))]
    (mapcat identity
            (interpose ["\n"] (map record->tokens records)))))
(defn tsv-record-writer
  "Return a function which writes one record as a TSV line (terminated by
  \"\\n\") to the writer `out` on each invocation.

  Options:
    :col-keys           - keys selecting and ordering the columns
    :header             - when truthy, the column names are written to `out`
                          immediately, before the function is returned
    :escaped-for-excel? - prefix number-like values with an underscore

  Diagnostics go to the logger at debug level rather than being printed to
  `*out*`, so they can no longer end up inside the TSV when `out` is stdout."
  [out {:keys [col-keys header] :as opts}]
  (when header
    (log/debug "header" col-keys)
    (doseq [c (interpose "\t" (map name col-keys))]
      (.write out c))
    (.write out "\n"))
  (fn [record]
    (let [cols (tsv-line col-keys record opts)]
      (log/debug "LINE:" cols)
      (doseq [c cols]
        (.write out c))
      (.write out "\n"))))
(defn write-tsv-writer
  "Write `records` as TSV to the supplied writer `out`.

  Options:
    :col-keys           - keys selecting and ordering the columns; defaults
                          to the keys of the first record
    :header             - when truthy, write a line of column names first
    :escaped-for-excel? - prefix number-like values with an underscore

  Records are separated by \"\\n\"; no newline is written after the last
  record. Diagnostics go to the logger at debug level rather than being
  printed to `*out*`, so they cannot pollute the output when `out` is stdout."
  ([out records] (write-tsv-writer out records {}))
  ([out records {:keys [encoding col-keys header] :as opts}]
   (let [cols  (or col-keys (keys (first records)))
         lines (tsv-lines cols records opts)]
     (when header
       (log/debug "header" cols)
       (doseq [c (interpose "\t" (map name cols))]
         (.write out c))
       (.write out "\n"))
     (doseq [l lines]
       (.write out l)))))
(defn write-tsv-file
  "Write `records` as TSV to the file `f`, which is opened with
  `:encoding` (UTF-8 when not given) and closed afterwards. All of `opts` is
  forwarded to `write-tsv-writer`."
  ([f records] (write-tsv-file f records {}))
  ([f records {:keys [encoding] :as opts}]
   (let [charset (or encoding "UTF-8")]
     (with-open [out (io/writer f :encoding charset)]
       (write-tsv-writer out records opts)))))
(defn- normalize-eol-to-temp-file
  "Copy `f` line by line into a fresh temp file and return that file.

  The input is read with `:cr-eol true`, so CRLF, LF and lone CR all count
  as line ends; every line is written back followed by the writer's
  `.newLine` separator. The same `encoding` is used for reading and writing."
  [f encoding]
  (let [normalized (open-lineseq f :encoding encoding :cr-eol true)
        target     (java.io.File/createTempFile "csv-import" ".tmp")]
    (with-open [w (io/writer target :encoding encoding)]
      (run! (fn [line]
              (doto w
                (.write line)
                (.newLine)))
            normalized))
    target))
(defn parse-csv-file
  "Like parse-tsv-file, but for csv.

  - f                  : the csv file
  - encoding           : optional file encoding, defaults to UTF-8
  - skip-rows          : number of leading rows to drop before the header /
                         data rows are interpreted (default 0)
  - col-keys           : optional seq of column record keys; defaults to
                         using the first (non-skipped) row for keys
  - col-parsers        : optional map of {col parser-fn} to parse column data
  - cr-eol             : when truthy, the file is first copied to a temp file
                         with normalised line ends, so a lone CR counts as EOL
  - escaped-for-excel? : (boolean) remove a prepended underscore char from
                         number-like values"
  ([f] (parse-csv-file f {}))
  ([f {:keys [encoding col-keys skip-rows col-parsers cr-eol escaped-for-excel?] :as opts}]
   (let [tmp-file (when cr-eol
                    (normalize-eol-to-temp-file f encoding))]
     (try
       (as-> (if cr-eol tmp-file f) $
         (io/reader $ :encoding encoding)
         (csv/parse-csv $)
         ;; honour :skip-rows the same way the TSV parser does: drop the
         ;; leading rows before parse-rows picks the header row
         (drop (or skip-rows 0) $)
         (parse-rows $ opts))
       (finally
         ;; NOTE(review): this runs as soon as the result seq has been
         ;; built, not after it has been consumed — confirm that deleting
         ;; the temp file this early is acceptable on the target platform.
         (when tmp-file
           (log/debug "tmp file deleted?:" (.delete tmp-file))))))))
(defn write-csv-file
  "Write `records` as CSV to the file `f`.

  Options:
    :encoding           - file encoding, UTF-8 when not given
    :col-keys           - keys selecting and ordering the columns; defaults
                          to the keys of the first record
    :header             - write a comma-separated line of column names
                          first (defaults to true)
    :escaped-for-excel? - prefix number-like string values with an underscore

  nil column values are written as empty strings."
  ([f records] (write-csv-file f records {}))
  ([f records {:keys [encoding col-keys header escaped-for-excel?] :or {header true}}]
   (let [cols      (or col-keys (keys (first records)))
         escape    (if escaped-for-excel? excel-escape identity)
         ->cell    (fn [record k]
                     (let [v (escape (get record k))]
                       ;; clojure-csv borks on nil columns!
                       (if (some? v) v "")))
         ->cells   (fn [record] (map #(->cell record %) cols))
         ->csv     (fn [cells] (csv/write-csv [cells] :force-quote false))
         lines     (map (comp ->csv ->cells) records)
         charset   (or encoding "UTF-8")]
     (with-open [out (io/writer f :encoding charset)]
       (when header
         (doseq [c (interpose "," (map name cols))]
           (.write out c))
         (.write out "\n"))
       (doseq [l lines]
         (.write out l))))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment