Created
July 26, 2012 20:08
-
-
Save anonymous/3184210 to your computer and use it in GitHub Desktop.
An example of a simple CSV parser (handles many cases)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns file.tools.csv.parser | |
(:require [clojure.string :as s])) | |
(defn find-csv-fields
  "Returns a vector holding the fields of one CSV record.

  field-delimiter is the one-character string that encloses quoted fields
  (for example a double quote), field-separator is the string that separates
  fields (for example a semicolon) and csv-record is the text of one record,
  which may contain line feeds inside quoted fields.

  A field that opens with the delimiter may contain the separator. It ends at
  the first piece that finishes with the delimiter while the delimiters seen
  so far are balanced (so a doubled, i.e. escaped, delimiter right before a
  separator does not close it). Its enclosing delimiters are removed and each
  doubled delimiter becomes a single one. Every other field is returned
  verbatim, including empty fields at the end of the record. A quoted field
  that is never closed is returned as raw text in the last position instead
  of being dropped."
  [field-delimiter field-separator csv-record]
  (let [delimiter-char (first field-delimiter)
        escaped-delimiter (str field-delimiter field-delimiter)
        ;; The separator is data, not a regex: quote it so "|" or "." work.
        separator-pattern (re-pattern (java.util.regex.Pattern/quote field-separator))
        opens-quoted? (fn [^String text] (.startsWith text field-delimiter))
        ;; Closed means: ends with the delimiter AND holds an even number of
        ;; delimiters. A lone delimiter (count 1) is therefore still open,
        ;; which also guarantees clean-field always gets at least 2 chars.
        closed-quoted? (fn [^String text]
                         (and (.endsWith text field-delimiter)
                              (even? (count (filter #(= delimiter-char %) text)))))
        ;; Strip the enclosing delimiters, then un-escape doubled delimiters
        ;; pair by pair (literal string replace, no regex).
        clean-field (fn [text]
                      (s/replace (subs text 1 (dec (count text)))
                                 escaped-delimiter
                                 field-delimiter))]
    ;; Limit -1 keeps trailing empty strings, i.e. trailing empty fields.
    (loop [parts (seq (s/split csv-record separator-pattern -1))
           fields []
           pending nil] ; raw text of the quoted field being assembled, or nil
      (if-let [[part & remaining] parts]
        (let [candidate (if pending
                          (str pending field-separator part)
                          part)]
          (cond
            ;; Outside a quoted field, a piece that does not open with the
            ;; delimiter is a plain field and is kept untouched.
            (and (nil? pending) (not (opens-quoted? part)))
            (recur remaining (conj fields part) nil)

            (closed-quoted? candidate)
            (recur remaining (conj fields (clean-field candidate)) nil)

            ;; The separator was inside the quoted field: keep accumulating.
            :else
            (recur remaining fields candidate)))
        (if pending
          (conj fields pending)
          fields)))))
(defn new-open-status
  "Computes the next open-field status from the current one and a cursor.
  The cursor is a pair of neighbouring characters taken from the line.
  An open field is closed by a double quote followed by a semicolon; a closed
  one is opened by a semicolon followed by a double quote. Any other cursor
  leaves the status unchanged."
  [status cursor]
  (if status
    ;; Currently inside a quoted field: only quote-then-separator closes it.
    (if (= cursor [\" \;]) false status)
    ;; Currently outside: only separator-then-quote opens one.
    (if (= cursor [\; \"]) true status)))
(defn complete-record?
  "Tells whether a line is a complete record, or finishes a record that was
  started on earlier lines.

  With one argument the line is taken as the beginning of a record: a quoted
  field is open from the start only when the line begins with a double quote.
  With two arguments, initial-status says whether a quoted field is already
  open when the line starts (pass true for a line that continues an
  unfinished record).

  The status is updated with new-open-status over every pair of neighbouring
  characters of the line. The record is complete when no quoted field is open
  at the end of the line, or when the line's last character is the double
  quote that closes the open field."
  ([line]
   ;; Open from the start only if the very first character is a quote.
   (complete-record? (= (first line) \") line))
  ([initial-status line]
   (let [still-open (reduce new-open-status initial-status (partition 2 1 line))]
     (if still-open
       ;; No separator follows the last field, so its closing quote can only
       ;; be recognised as the last character of the line.
       (= (last line) \")
       true))))

(defn next-record
  "Builds the record state for a line. Returns a map with:
    :complete => true when the record is complete (line feeds that belong to
                 quoted fields included)
    :record   => the text of the record so far

  With one argument the line starts a new record. With two arguments, the
  previous result is extended when it is not complete: the line is appended
  after a line feed and checked as the continuation of an open quoted field.
  An empty or complete previous result makes the line start a new record."
  ([line]
   {:complete (complete-record? line) :record line})
  ([line previous]
   (if (or (empty? previous) (:complete previous))
     (next-record line)
     ;; An incomplete previous record always means a quoted field is still
     ;; open, so the continuation line is scanned with the status set to open.
     {:complete (complete-record? true line)
      :record (str (:record previous) "\n" line)})))
;; find-csv-fields pre-configured for French-style CSV:
;; double quote as field delimiter, semicolon as field separator.
(def find-std-csv-fields
  (partial find-csv-fields "\"" ";"))
;; DEMO
;; Splitting records into fields; each result is printed with "|" between fields.
(doseq [sample ["Mr;Christian;Sperandio"
                "Mr;\"Christian\";Sperandio"
                "Mr;\"Un commentaire ; et un autre\";Sperandio"]]
  (println (s/join "|" (find-csv-fields "\"" ";" sample))))
(doseq [sample ["Mr;\"Un commentaire ; et un autre;;;;et encore un autre\";Sperandio"
                "12345;l'hotel \"le pelican\";45678"
                "12345;l'hotel \"le pelican\" et les romains;45678"
                "12345;\"l'hotel \"\"le pelican\"\" et les romains\";45678"
                "\"LE ANGELOT\"\"\";;7 main street;zipcode;Gotham;800125456;;;10;Realtor;Another field;3,1525E+13;6820B;Got it"]]
  (println (s/join "|" (find-std-csv-fields sample))))
;; Timing of a single parse; the parsed value itself is discarded.
(time (find-std-csv-fields "\"LE ANGELOT\"\"\";;7 main street;zipcode;Gotham;800125456;;;10;Realtor;Another field;3,1525E+13;6820B;Got it"))
;; A quoted field that spans several lines (the literal holds real line feeds).
(println (s/join "|" (find-std-csv-fields "12345;\"l'hotel
\"le pelican\" et
les romains\";45678")))
;; Second demo: rebuilding whole records from physical lines.
(println "-----------------------------------------------------------------------------------------------")
(def buffer
  ["Mr;\"Un commentaire ; et un autre\";Sperandio"
   "12345;\"l'hotel"
   " \"le pelican\" et"
   "les romains\";45678"])
;; Completeness of every line when looked at on its own.
(println (for [line buffer]
           (str (complete-record? line) "-->" line)))
;; Thread the record state through the lines and keep the text of each state
;; that is flagged complete, in order.
(def complete-records
  (->> buffer
       (reductions (fn [previous line] (next-record line previous)) {})
       rest
       (filter :complete)
       (mapv :record)))
(println (s/join "|" complete-records))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment