Skip to content

Instantly share code, notes, and snippets.

Created July 26, 2012 20:08
Show Gist options
  • Save anonymous/3184210 to your computer and use it in GitHub Desktop.
Save anonymous/3184210 to your computer and use it in GitHub Desktop.
An example of a simple CSV parser (handles many cases)
(ns file.tools.csv.parser
(:require [clojure.string :as s]))
(defn find-csv-fields
  "Splits one CSV record into a vector of field values.

  field-delimiter - string enclosing quoted fields (for example a double quote).
  field-separator - string placed between fields (for example a semicolon).
                    It is matched literally; a java.util.regex.Pattern is also
                    accepted and used as given.
  csv-record      - text of one complete record. Quoted fields may contain
                    separators and line feeds.

  Quoted fields are returned without their enclosing delimiters and with each
  doubled delimiter reduced to a single one. Unquoted fields are returned
  untouched, even when they contain delimiters. Empty fields, including
  trailing ones, are kept. If a quoted field is never closed, the remaining
  text is returned raw as the last field instead of being dropped.

  Known limitation: inside a quoted field, a piece that ends with an escaped
  (doubled) delimiter right before a separator is taken as the end of the field."
  [field-delimiter field-separator csv-record]
  (let [;; Quote the separator so characters such as | or . are not read as regex syntax.
        separator-pattern (if (instance? java.util.regex.Pattern field-separator)
                            field-separator
                            (re-pattern (java.util.regex.Pattern/quote field-separator)))
        delimiter-size (count field-delimiter)
        ;; Plain string tests replace the former regexes, one of which
        ;; ((.*\n?)*) could backtrack exponentially on long parts.
        opens? (fn [^String v] (.startsWith v ^String field-delimiter))
        closes? (fn [^String v] (.endsWith v ^String field-delimiter))
        ;; The size check keeps a lone delimiter from counting as both the
        ;; opening and the closing one.
        quoted? (fn [v]
                  (and (>= (count v) (* 2 delimiter-size))
                       (opens? v)
                       (closes? v)))
        clean-field (fn [v]
                      (if (quoted? v)
                        (s/replace (subs v delimiter-size (- (count v) delimiter-size))
                                   (str field-delimiter field-delimiter)
                                   field-delimiter)
                        v))]
    ;; The -1 limit keeps trailing empty strings, so "a;b;;" yields four fields.
    (loop [parts (seq (s/split csv-record separator-pattern -1))
           fields []
           pending []]
      (if (nil? parts)
        (if (empty? pending)
          fields
          ;; Unterminated quoted field: keep its raw text rather than lose data.
          (conj fields (s/join field-separator pending)))
        (let [part (first parts)
              remaining (next parts)]
          (cond
            ;; Inside a quoted field that contained separators: glue the pieces
            ;; back together until one of them ends with the delimiter.
            (seq pending)
            (if (closes? part)
              (recur remaining
                     (conj fields (clean-field (s/join field-separator (conj pending part))))
                     [])
              (recur remaining fields (conj pending part)))

            ;; Complete quoted field within a single piece.
            (quoted? part)
            (recur remaining (conj fields (clean-field part)) [])

            ;; Opening delimiter without a closing one: start accumulating.
            (opens? part)
            (recur remaining fields [part])

            ;; Empty or unquoted field (it may contain or end with delimiters).
            :else
            (recur remaining (conj fields part) [])))))))
(defn new-open-status
  "Returns the open-field status after reading one cursor.

  status - truthy while the scan is inside an open quoted field.
  cursor - a pair of consecutive characters taken from the line.

  A double quote followed by a semicolon closes an open field; a semicolon
  followed by a double quote opens a field when none is open. Any other
  cursor leaves the status unchanged."
  [status cursor]
  (cond
    (and status (= cursor [\" \;])) false
    (and (not status) (= cursor [\; \"])) true
    :else status))

(defn complete-record?
  "Tells whether a line is a complete record, or finishes a record started on
  earlier lines.

  With one argument the line is taken as the start of a record: a quoted field
  is open from the beginning only when the line starts with a double quote.
  With two arguments, initial-status says whether a quoted field is already
  open when the line begins (truthy for a continuation line).

  The line is complete when no quoted field is left open at its end, or when
  the open field is closed by a double quote that is the last character."
  ([line]
    ;; A record start is inside a quoted field only if its first character is
    ;; the quote. The previous inverted test reported plain unquoted records
    ;; as incomplete and unterminated quoted ones as complete.
    (complete-record? (= (first line) \") line))
  ([initial-status line]
    (if (reduce new-open-status initial-status (partition 2 1 line))
      ;; Still open after the last pair: only a trailing quote can close it.
      (= (last line) \")
      true)))

(defn next-record
  "Builds the record state for a line. The returned map contains:
  :complete => boolean, true when the record is complete (line feeds inside
               quoted fields included)
  :record   => the text of the record accumulated so far

  With one argument the line starts a new record. With two arguments the
  previous result is used: when it is empty or complete the line starts a new
  record, otherwise the line is appended to the previous record after a line
  feed."
  ([line] {:complete (complete-record? line) :record line})
  ([line previous]
    (if (or (empty? previous) (:complete previous))
      (next-record line)
      ;; The previous record is incomplete, so this line begins inside an open
      ;; quoted field: pass that status explicitly instead of guessing it from
      ;; the first character of the line.
      {:complete (complete-record? true line)
       :record (str (:record previous) "\n" line)})))
; Partial application of find-csv-fields preset with "\"" as the field delimiter
; and ";" as the field separator (what the author calls French standard CSV).
; Only the CSV record remains to be supplied by the caller.
(def find-std-csv-fields (partial find-csv-fields "\"" ";"))
; DEMO
; Splits sample CSV records into fields and prints each result on one line,
; with the fields joined by "|".
(let [show-fields (fn [fields] (println (s/join "|" fields)))
      long-record "\"LE ANGELOT\"\"\";;7 main street;zipcode;Gotham;800125456;;;10;Realtor;Another field;3,1525E+13;6820B;Got it"]
  ; Delimiter and separator given explicitly.
  (show-fields (find-csv-fields "\"" ";" "Mr;Christian;Sperandio"))
  (show-fields (find-csv-fields "\"" ";" "Mr;\"Christian\";Sperandio"))
  (show-fields (find-csv-fields "\"" ";" "Mr;\"Un commentaire ; et un autre\";Sperandio"))
  ; Same parser through the preconfigured find-std-csv-fields.
  (show-fields (find-std-csv-fields "Mr;\"Un commentaire ; et un autre;;;;et encore un autre\";Sperandio"))
  (show-fields (find-std-csv-fields "12345;l'hotel \"le pelican\";45678"))
  (show-fields (find-std-csv-fields "12345;l'hotel \"le pelican\" et les romains;45678"))
  (show-fields (find-std-csv-fields "12345;\"l'hotel \"\"le pelican\"\" et les romains\";45678"))
  (show-fields (find-std-csv-fields long-record))
  ; Prints the elapsed time of one parse of the long record.
  (time (find-std-csv-fields long-record))
  ; Quoted field containing line feeds.
  (show-fields (find-std-csv-fields "12345;\"l'hotel
\"le pelican\" et
les romains\";45678")))
; Record assembly demo: physical lines are merged until each logical CSV
; record is complete.
(println "-----------------------------------------------------------------------------------------------")
; Four physical lines; the last three belong to a single record whose quoted
; field contains line feeds.
(def buffer
  ["Mr;\"Un commentaire ; et un autre\";Sperandio"
   "12345;\"l'hotel"
   " \"le pelican\" et"
   "les romains\";45678"])
; Completeness of every buffer line, each one evaluated on its own.
(println (map (fn [line] (str (complete-record? line) "-->" line)) buffer))
; Folds the lines through next-record, collecting the text of every record
; as soon as it is reported complete.
(def complete-records
  (:records
    (reduce (fn [{:keys [previous records]} line]
              (let [line-status (next-record line previous)]
                {:previous line-status
                 :records (if (:complete line-status)
                            (conj records (:record line-status))
                            records)}))
            {:previous {} :records []}
            buffer)))
(println (s/join "|" complete-records))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment