;; /simple-csv-parser.clj

;; simple-csv-parser.clj
(ns file.tools.csv.parser
  (:require [clojure.string :as s]))

(defn find-csv-fields
  "Splits one CSV record into a vector holding one string per field.

  Arguments:
    field-delimiter  string that encloses a quoted field (for example a
                     double quote). It is taken literally, never as a regex.
    field-separator  string that separates two fields (for example ;).
                     It is taken literally, never as a regex.
    csv-record       text of one whole record. A quoted field may contain
                     separators and line feeds.

  A field that starts and ends with the delimiter is returned without its
  enclosing delimiters and with every doubled delimiter replaced by a single
  one. Any other field is returned verbatim. Empty fields, trailing ones
  included, are returned as empty strings. A quoted field that is never
  closed is returned verbatim as the last field instead of being dropped."
  [field-delimiter field-separator csv-record]
  (let [;; Pattern/quote keeps regex metacharacters such as | or . literal.
        literal-pattern   (fn [text] (re-pattern (java.util.regex.Pattern/quote text)))
        separator-pattern (literal-pattern field-separator)
        delimiter-pattern (literal-pattern field-delimiter)
        delimiter-size    (count field-delimiter)
        escaped-delimiter (str field-delimiter field-delimiter)
        quoted-start?     (fn [^String text] (.startsWith text ^String field-delimiter))
        quoted-end?       (fn [^String text] (.endsWith text ^String field-delimiter))
        ;; The size guard stops a lone delimiter from counting as both the
        ;; opening and the closing one.
        enclosed?         (fn [text]
                            (and (>= (count text) (* 2 delimiter-size))
                                 (quoted-start? text)
                                 (quoted-end? text)))
        ;; An unquoted field is complete as soon as it is read. A quoted field
        ;; is complete once it is closed and its delimiters pair up: an odd
        ;; count means the last delimiter seen is an opening or escaped one.
        complete?         (fn [text]
                            (or (not (quoted-start? text))
                                (and (enclosed? text)
                                     (even? (count (re-seq delimiter-pattern text))))))
        clean-field       (fn [text]
                            (if (enclosed? text)
                              (s/replace (subs text delimiter-size (- (count text) delimiter-size))
                                         escaped-delimiter
                                         field-delimiter)
                              text))]
    ;; The limit of -1 keeps trailing empty fields, which split drops by default.
    (loop [parts   (s/split csv-record separator-pattern -1)
           fields  []
           pending nil]
      (if (empty? parts)
        (if (nil? pending) fields (conj fields pending))
        (let [part      (first parts)
              ;; A separator found inside a quoted field belongs to the field.
              candidate (if (nil? pending) part (str pending field-separator part))]
          (if (complete? candidate)
            (recur (rest parts) (conj fields (clean-field candidate)) nil)
            (recur (rest parts) fields candidate)))))))

(defn new-open-status
  "Computes the next open-field flag from the current flag and a cursor.
  The cursor is a pair of consecutive characters of the scanned string.
  A flag that is set is cleared on the pair (\" ;), a flag that is not set
  is raised on the pair (; \"), and any other pair returns the flag as is."
  [status cursor]
  (if status
    (if (= cursor [\" \;]) false status)
    (if (= cursor [\; \"]) true status)))

(defn complete-record?
  "Tells whether the text ends outside of any quoted field, i.e. whether it
  forms a complete record.

  With one argument, line is read as the start of a fresh record.
  With two arguments, a truthy initial-status means that a quoted field is
  already open when line starts (line continues a previous line), a falsy
  one means that no field is open.

  Every double quote toggles the open state, so an escaped pair of quotes
  inside a field leaves the state untouched. Always returns true or false."
  ([line]
    (complete-record? false line))

  ([initial-status line]
    (let [quote-count  (count (filter #(= \" %) line))
          open-at-end? (if (odd? quote-count)
                         (not initial-status)
                         (boolean initial-status))]
      (not open-at-end?))))

(defn next-record
  "Feeds one physical line into the record being assembled and returns a map:
  :complete => true when :record holds a whole record (every quoted field
               closed), false when more lines are needed
  :record   => the text gathered so far for the current record

  With one argument, the line starts a new record.
  With two arguments, previous is the map returned for the preceding line.
  When it is empty or complete, the line starts a new record. Otherwise the
  line is appended to the pending record after a line feed, and completeness
  is judged on the whole accumulated text, so a continuation line is never
  evaluated out of context."
  ([line] {:complete (complete-record? line) :record line})

  ([line previous]
    (if (or (empty? previous) (:complete previous))
      (next-record line)
      (next-record (str (:record previous) "\n" line)))))


; French standard CSV flavour of find-csv-fields
(defn find-std-csv-fields
  "Returns the fields of csv-record, calling find-csv-fields with a double
  quote as field delimiter and a semicolon as field separator."
  [csv-record]
  (find-csv-fields "\"" ";" csv-record))

; TESTS

; Splits sample records into fields; every result is printed with | between the fields
(doseq [sample ["Mr;Christian;Sperandio"
                "Mr;\"Christian\";Sperandio"
                "Mr;\"Un commentaire ; et un autre\";Sperandio"]]
  (println (s/join "|" (find-csv-fields "\"" ";" sample))))

(doseq [sample ["Mr;\"Un commentaire ; et un autre;;;;et encore un autre\";Sperandio"
                "12345;l'hotel \"le pelican\";45678"
                "12345;l'hotel \"le pelican\" et les romains;45678"
                "12345;\"l'hotel \"\"le pelican\"\" et les romains\";45678"]]
  (println (s/join "|" (find-std-csv-fields sample))))

; A wider record, printed once and then timed
(let [wide-record "\"LE ANGELOT\"\"\";;7 main street;zipcode;Gotham;800125456;;;10;Realtor;Another field;3,1525E+13;6820B;Got it"]
  (println (s/join "|" (find-std-csv-fields wide-record)))
  (time (find-std-csv-fields wide-record)))

; A quoted field that contains line feeds
(println (s/join "|" (find-std-csv-fields "12345;\"l'hotel
 \"le pelican\" et
  les romains\";45678")))


; Assembling records from physical lines
(println "-----------------------------------------------------------------------------------------------")
(def buffer
  ["Mr;\"Un commentaire ; et un autre\";Sperandio"
   "12345;\"l'hotel"
   " \"le pelican\" et"
   "les romains\";45678"])

(println (map (fn [line] (str (complete-record? line) "-->" line)) buffer))

; Walks the lines while carrying the latest next-record result, and keeps
; the :record of every result flagged as complete
(def complete-records
  (first
    (reduce (fn [[records previous] line]
              (let [line-status (next-record line previous)]
                [(if (:complete line-status)
                   (conj records (:record line-status))
                   records)
                 line-status]))
            [[] {}]
            buffer)))

(println (s/join "|" complete-records))
	(ns file.tools.csv.parser
	(:require [clojure.string :as s]))

	(defn find-csv-fields
	  "Splits one CSV record into a vector holding one string per field.

	  Arguments:
	    field-delimiter  string that encloses a quoted field (for example a
	                     double quote). It is taken literally, never as a regex.
	    field-separator  string that separates two fields (for example ;).
	                     It is taken literally, never as a regex.
	    csv-record       text of one whole record. A quoted field may contain
	                     separators and line feeds.

	  A field that starts and ends with the delimiter is returned without its
	  enclosing delimiters and with every doubled delimiter replaced by a single
	  one. Any other field is returned verbatim. Empty fields, trailing ones
	  included, are returned as empty strings. A quoted field that is never
	  closed is returned verbatim as the last field instead of being dropped."
	  [field-delimiter field-separator csv-record]
	  (let [;; Pattern/quote keeps regex metacharacters such as | or . literal.
	        literal-pattern   (fn [text] (re-pattern (java.util.regex.Pattern/quote text)))
	        separator-pattern (literal-pattern field-separator)
	        delimiter-pattern (literal-pattern field-delimiter)
	        delimiter-size    (count field-delimiter)
	        escaped-delimiter (str field-delimiter field-delimiter)
	        quoted-start?     (fn [^String text] (.startsWith text ^String field-delimiter))
	        quoted-end?       (fn [^String text] (.endsWith text ^String field-delimiter))
	        ;; The size guard stops a lone delimiter from counting as both the
	        ;; opening and the closing one.
	        enclosed?         (fn [text]
	                            (and (>= (count text) (* 2 delimiter-size))
	                                 (quoted-start? text)
	                                 (quoted-end? text)))
	        ;; An unquoted field is complete as soon as it is read. A quoted field
	        ;; is complete once it is closed and its delimiters pair up: an odd
	        ;; count means the last delimiter seen is an opening or escaped one.
	        complete?         (fn [text]
	                            (or (not (quoted-start? text))
	                                (and (enclosed? text)
	                                     (even? (count (re-seq delimiter-pattern text))))))
	        clean-field       (fn [text]
	                            (if (enclosed? text)
	                              (s/replace (subs text delimiter-size (- (count text) delimiter-size))
	                                         escaped-delimiter
	                                         field-delimiter)
	                              text))]
	    ;; The limit of -1 keeps trailing empty fields, which split drops by default.
	    (loop [parts   (s/split csv-record separator-pattern -1)
	           fields  []
	           pending nil]
	      (if (empty? parts)
	        (if (nil? pending) fields (conj fields pending))
	        (let [part      (first parts)
	              ;; A separator found inside a quoted field belongs to the field.
	              candidate (if (nil? pending) part (str pending field-separator part))]
	          (if (complete? candidate)
	            (recur (rest parts) (conj fields (clean-field candidate)) nil)
	            (recur (rest parts) fields candidate)))))))

	(defn new-open-status
	  "Computes the next open-field flag from the current flag and a cursor.
	  The cursor is a pair of consecutive characters of the scanned string.
	  A flag that is set is cleared on the pair (\" ;), a flag that is not set
	  is raised on the pair (; \"), and any other pair returns the flag as is."
	  [status cursor]
	  (if status
	    (if (= cursor [\" \;]) false status)
	    (if (= cursor [\; \"]) true status)))

	(defn complete-record?
	  "Tells whether the text ends outside of any quoted field, i.e. whether it
	  forms a complete record.

	  With one argument, line is read as the start of a fresh record.
	  With two arguments, a truthy initial-status means that a quoted field is
	  already open when line starts (line continues a previous line), a falsy
	  one means that no field is open.

	  Every double quote toggles the open state, so an escaped pair of quotes
	  inside a field leaves the state untouched. Always returns true or false."
	  ([line]
	    (complete-record? false line))

	  ([initial-status line]
	    (let [quote-count  (count (filter #(= \" %) line))
	          open-at-end? (if (odd? quote-count)
	                         (not initial-status)
	                         (boolean initial-status))]
	      (not open-at-end?))))

	(defn next-record
	  "Feeds one physical line into the record being assembled and returns a map:
	  :complete => true when :record holds a whole record (every quoted field
	               closed), false when more lines are needed
	  :record   => the text gathered so far for the current record

	  With one argument, the line starts a new record.
	  With two arguments, previous is the map returned for the preceding line.
	  When it is empty or complete, the line starts a new record. Otherwise the
	  line is appended to the pending record after a line feed, and completeness
	  is judged on the whole accumulated text, so a continuation line is never
	  evaluated out of context."
	  ([line] {:complete (complete-record? line) :record line})

	  ([line previous]
	    (if (or (empty? previous) (:complete previous))
	      (next-record line)
	      (next-record (str (:record previous) "\n" line)))))


	; French standard CSV flavour of find-csv-fields
	(defn find-std-csv-fields
	  "Returns the fields of csv-record, calling find-csv-fields with a double
	  quote as field delimiter and a semicolon as field separator."
	  [csv-record]
	  (find-csv-fields "\"" ";" csv-record))

	; TESTS

	; Splits sample records into fields; every result is printed with | between the fields.
	; The join separator is the plain string "|": a backslash before | is not a
	; valid escape in a Clojure string literal and stops the reader.
	(println (s/join "|" (find-csv-fields "\"" ";" "Mr;Christian;Sperandio")))
	(println (s/join "|" (find-csv-fields "\"" ";" "Mr;\"Christian\";Sperandio")))
	(println (s/join "|" (find-csv-fields "\"" ";" "Mr;\"Un commentaire ; et un autre\";Sperandio")))
	(println (s/join "|" (find-std-csv-fields "Mr;\"Un commentaire ; et un autre;;;;et encore un autre\";Sperandio")))
	(println (s/join "|" (find-std-csv-fields "12345;l'hotel \"le pelican\";45678")))
	(println (s/join "|" (find-std-csv-fields "12345;l'hotel \"le pelican\" et les romains;45678")))
	(println (s/join "|" (find-std-csv-fields "12345;\"l'hotel \"\"le pelican\"\" et les romains\";45678")))

	; A wider record, printed once and then timed
	(println (s/join "|" (find-std-csv-fields "\"LE ANGELOT\"\"\";;7 main street;zipcode;Gotham;800125456;;;10;Realtor;Another field;3,1525E+13;6820B;Got it")))
	(time (find-std-csv-fields "\"LE ANGELOT\"\"\";;7 main street;zipcode;Gotham;800125456;;;10;Realtor;Another field;3,1525E+13;6820B;Got it"))

	; A quoted field that contains line feeds
	(println (s/join "|" (find-std-csv-fields "12345;\"l'hotel
	\"le pelican\" et
	les romains\";45678")))


	; Assembling records from physical lines
	(println "-----------------------------------------------------------------------------------------------")
	(def buffer ["Mr;\"Un commentaire ; et un autre\";Sperandio"
	             "12345;\"l'hotel"
	             " \"le pelican\" et" , "les romains\";45678" ])

	(println (map #(str (complete-record? %) "-->" %) buffer))

	; Walks the lines while carrying the latest next-record result, and keeps
	; the :record of every result flagged as complete
	(def complete-records (loop [previous {} lines buffer records []]
	                        (if (empty? lines)
	                          records
	                          (let [line-status (next-record (first lines) previous)]
	                            (if (:complete line-status)
	                              (recur line-status (rest lines) (conj records (:record line-status)))
	                              (recur line-status (rest lines) records))))))

	(println (s/join "|" complete-records))