ato/csvish-regex.clj

## csvish-regex.clj
    (map #(map second (re-seq #"((?:\"[^\"]*\"|[^ ]+)+)(?: |$)" (second %))) (re-seq #"((?:(?:\"[^\"]*\")+|[^\"\n]+)+)(?:\n|$)" input-string))

Working from the inside out:

    (re-seq #"((?:(?:\"[^\"]*\")+|[^\"\n]+)+)(?:\n|$)" input-string)

Breaks down into:

    (
      (?:
         (?:\"[^\"]*\")+ ;; matches "\"foo\nbar\"" (\n allowed between quotes)
         |               ;;   OR
         [^\"\n]+        ;; matches "anything like this" (no \n)
       )+             ;; one or more of the above
    )  ;; capturing-group (returns the contents from re-seq)
    (?:\n|$) ;; matches either a newline or the end of the string

So applying that to the sample input:

    (re-seq #"((?:(?:\"[^\"]*\")+|[^\"\n]+)+)(?:\n|$)" input-string)
    => (["<x> <y> \"Hello\"\n" "<x> <y> \"Hello\""] ["<xx> <yy> \"hello\nworld.\"\n" "<xx> <yy> \"hello\nworld.\""] ["<a> <b> <z>\n" "<a> <b> <z>"])

Note that we want the second value in each pair.  The first is the whole match and includes the \n.
Thus wrap it in a (map second ...)

    (map second (re-seq #"((?:(?:\"[^\"]*\")+|[^\"\n]+)+)(?:\n|$)" input-string))
    => ("<x> <y> \"Hello\"" "<xx> <yy> \"hello\nworld.\"" "<a> <b> <z>")

Now we need to break each of those up by spaces, as long as those spaces aren't quoted.
So it's the same regex again but this time splitting on spaces:

    (re-seq #"((?:(?:\"[^\"]*\"|[^ ]+)+))(?: |$)" "<x> <y> \"Hello\"")

Breaking that down:

    (
      (?:
         \"[^\"]*\"  ;; matches "\"a\n c\"" (quoted spaces and \n)
         |           ;;   OR
         [^ ]+       ;; matches "anything.without.a.space"
       )+       ;; one or more times
    )       ;; capturing group, return this part
    (?: |$) ;; matches space or end of string

Trying that out:

    (re-seq #"((?:(?:\"[^\"]*\"|[^ ]+)+))(?: |$)" "<x> <y> \"Hello\"")
    => (["<x> " "<x>"] ["<y> " "<y>"] ["\"Hello\"" "\"Hello\""])

Again we want the second element in each pair so:

    (map second (re-seq #"((?:(?:\"[^\"]*\"|[^ ]+)+))(?: |$)" "<x> <y> \"Hello\""))
    => ("<x>" "<y>" "\"Hello\"")

Perfect.  Now just putting the two together by mapping the second regex over the
results of the first and we get the full thing.

    (def input-string "<x> <y> \"Hello\"\n<xx> <yy> \"hello\nworld.\"\n<a> <b> <z>\n")
    (map #(map second (re-seq #"((?:\"[^\"]*\"|[^ ]+)+)(?: |$)" (second %))) (re-seq #"((?:(?:\"[^\"]*\")+|[^\"\n]+)+)(?:\n|$)" input-string))
    => (("<x>" "<y>" "\"Hello\"") ("<xx>" "<yy>" "\"hello\nworld.\"") ("<a>" "<b>" "<z>"))

Now admittedly that's completely unreadable.  In a real project I'd break it up into little functions.
At the very least:

    (defn split-quoted-lines
      "Splits document into logical lines and returns them as a lazy seq of
      strings without the terminating newline.  A newline inside a
      double-quoted section does not end a line.  Blank lines yield no entry."
      [document]
      ;; Each re-seq match is [whole-match group-1]; group 1 is the line
      ;; without its terminator, hence (map second ...).
      ;;
      ;; The possessive quantifiers (*+ and ++) and the single outer loop give
      ;; the matcher exactly one way to consume each run.  With plain + loops
      ;; nested inside the outer + it could retry every partition of a run
      ;; whenever a candidate line failed (e.g. at an unterminated quote),
      ;; which can take exponential time.  Successful matches are unchanged.
      (map second
           (re-seq #"((?:\"[^\"]*+\"|[^\"\n]++)+)(?:\n|$)" document)))

    (defn split-quoted-words
      "Splits line on single spaces and returns the words as a lazy seq of
      strings.  Spaces (and newlines) inside a double-quoted section do not
      split, even when the quoted section is glued to other text.  The quote
      characters are kept in the returned words."
      [line]
      ;; A word is one or more of, tried in this order:
      ;;   \"[^\"]*\"  a complete quoted section
      ;;   [^ \"]+     a run of characters that are neither space nor quote
      ;;   \"          a lone quote with no partner, kept as a literal
      ;; The bare-word run must exclude the quote character: otherwise it
      ;; swallows an opening quote in the middle of a word and the quoted
      ;; section is then split at its first space.  The lone-quote fallback
      ;; keeps unbalanced input splitting on every space.
      (map second
           (re-seq #"((?:\"[^\"]*\"|[^ \"]+|\")+)(?: |$)" line)))

    ;; Turns a whole document into a seq of rows: split-quoted-lines cuts the
    ;; document into lines, then split-quoted-words is mapped over each line.
    (defn parse [document]
      (->> document
           split-quoted-lines
           (map split-quoted-words)))

I might even write the regexes broken down into commented pieces:

    ;; Compiled pattern for one logical line: group 1 captures the line without
    ;; its terminator.  The possessive quantifiers (*+ and ++) and the single
    ;; outer loop leave the matcher only one way to consume each run; with plain
    ;; + loops nested inside the outer + a line that fails to match (e.g. at an
    ;; unterminated quote) could be retried in exponentially many ways.
    (def quoted-line-pattern
      (re-pattern
       (str
        "("
          "(?:"
             "\"[^\"]*+\""  ;; matches "\"foo\nbar\"" (\n allowed between quotes)
             "|"            ;;   OR
             "[^\"\n]++"    ;; matches "anything like this" (no quote, no \n)
           ")+"          ;; one or more of the above
        ")"        ;; capturing-group (returns the contents from re-seq)
        "(?:\n|$)" ;; matches either a newline or the end of the string
        )))

    ;; Returns the lines of `document` found by quoted-line-pattern.  Each
    ;; re-seq match is [whole-match group-1]; `second` keeps only the captured
    ;; group.
    (defn split-quoted-lines [document]
      (->> document
           (re-seq quoted-line-pattern)
           (map second)))

This would probably be a lot more readable, though longer, as a hand-written character-by-character parser.
I just use regexes a lot (especially via sed and grep on the command line) and I find them pretty quick
to write. Alas, they are terrifying to read for anything but the simplest problem. ;-)
	(map #(map second (re-seq #"((?:\"[^\"]*\"|[^ ]+)+)(?: |$)" (second %))) (re-seq #"((?:(?:\"[^\"]*\")+|[^\"\n]+)+)(?:\n|$)" input-string))

	Working from the inside out:

	(re-seq #"((?:(?:\"[^\"]*\")+|[^\"\n]+)+)(?:\n|$)" input-string)

	Breaks down into:

	(
	(?:
	(?:\"[^\"]*\")+ ;; matches "\"foo\nbar\"" (\n allowed between quotes)
	| ;; OR
	[^\"\n]+ ;; matches "anything like this" (no \n)
	)+ ;; one or more of the above
	) ;; capturing-group (returns the contents from re-seq)
	(?:\n|$) ;; matches either a newline or the end of the string

	So applying that to the sample input:

	(re-seq #"((?:(?:\"[^\"]*\")+|[^\"\n]+)+)(?:\n|$)" input-string)
	=> (["<x> <y> \"Hello\"\n" "<x> <y> \"Hello\""] ["<xx> <yy> \"hello\nworld.\"\n" "<xx> <yy> \"hello\nworld.\""] ["<a> <b> <z>\n" "<a> <b> <z>"])

	Note that we want the second value in each pair. The first is the whole match and includes the \n.
	Thus wrap it in a (map second ...)

	(map second (re-seq #"((?:(?:\"[^\"]*\")+|[^\"\n]+)+)(?:\n|$)" input-string))
	=> ("<x> <y> \"Hello\"" "<xx> <yy> \"hello\nworld.\"" "<a> <b> <z>")

	Now we need to break each of those up by spaces, as long as those spaces aren't quoted.
	So it's the same regex again but this time splitting on spaces:

	(re-seq #"((?:(?:\"[^\"]*\"|[^ ]+)+))(?: |$)" "<x> <y> \"Hello\"")

	Breaking that down:

	(
	(?:
	\"[^\"]*\" ;; matches "\"a\n c\"" (quoted spaces and \n)
	| ;; OR
	[^ ]+ ;; matches "anything.without.a.space"
	)+ ;; one or more times
	) ;; capturing group, return this part
	(?: |$) ;; matches space or end of string

	Trying that out:

	(re-seq #"((?:(?:\"[^\"]*\"|[^ ]+)+))(?: |$)" "<x> <y> \"Hello\"")
	=> (["<x> " "<x>"] ["<y> " "<y>"] ["\"Hello\"" "\"Hello\""])

	Again we want the second element in each pair so:

	(map second (re-seq #"((?:(?:\"[^\"]*\"|[^ ]+)+))(?: |$)" "<x> <y> \"Hello\""))
	=> ("<x>" "<y>" "\"Hello\"")

	Perfect. Now just putting the two together by mapping the second regex over the
	results of the first and we get the full thing.

	(def input-string "<x> <y> \"Hello\"\n<xx> <yy> \"hello\nworld.\"\n<a> <b> <z>\n")
	(map #(map second (re-seq #"((?:\"[^\"]*\"|[^ ]+)+)(?: |$)" (second %))) (re-seq #"((?:(?:\"[^\"]*\")+|[^\"\n]+)+)(?:\n|$)" input-string))
	=> (("<x>" "<y>" "\"Hello\"") ("<xx>" "<yy>" "\"hello\nworld.\"") ("<a>" "<b>" "<z>"))

	Now admittedly that's completely unreadable. In a real project I'd break it up into little functions.
	At the very least:

	(defn split-quoted-lines
	  "Splits document into logical lines and returns them as a lazy seq of
	  strings without the terminating newline.  A newline inside a
	  double-quoted section does not end a line.  Blank lines yield no entry."
	  [document]
	  ;; The | alternations must be unescaped: inside a regex literal \| is a
	  ;; literal pipe character, which made the pattern require pipes in the
	  ;; input instead of choosing between alternatives.
	  ;;
	  ;; Each re-seq match is [whole-match group-1]; group 1 is the line
	  ;; without its terminator, hence (map second ...).
	  ;;
	  ;; The possessive quantifiers (*+ and ++) and the single outer loop give
	  ;; the matcher exactly one way to consume each run, so a line that fails
	  ;; (e.g. at an unterminated quote) is rejected without retrying every
	  ;; partition of the run.
	  (map second
	       (re-seq #"((?:\"[^\"]*+\"|[^\"\n]++)+)(?:\n|$)" document)))

	(defn split-quoted-words
	  "Splits line on single spaces and returns the words as a lazy seq of
	  strings.  Spaces (and newlines) inside a double-quoted section do not
	  split, even when the quoted section is glued to other text.  The quote
	  characters are kept in the returned words."
	  [line]
	  ;; The | alternations must be unescaped: inside a regex literal \| is a
	  ;; literal pipe character, not a choice between alternatives.
	  ;;
	  ;; A word is one or more of, tried in this order:
	  ;;   \"[^\"]*\"  a complete quoted section
	  ;;   [^ \"]+     a run of characters that are neither space nor quote
	  ;;   \"          a lone quote with no partner, kept as a literal
	  ;; The bare-word run excludes the quote character so that it cannot
	  ;; swallow an opening quote in the middle of a word.
	  (map second
	       (re-seq #"((?:\"[^\"]*\"|[^ \"]+|\")+)(?: |$)" line)))

	;; Turns a whole document into a seq of rows: split-quoted-lines cuts the
	;; document into lines, then split-quoted-words is mapped over each line.
	(defn parse [document]
	  (->> document
	       split-quoted-lines
	       (map split-quoted-words)))

	I might even write the regexes broken down into commented pieces:

	;; Compiled pattern for one logical line: group 1 captures the line without
	;; its terminator.  The alternation bars are written as plain "|": "\|" is
	;; not a supported escape in a Clojure string literal, and an escaped pipe
	;; in a regex would be a literal character anyway.  The possessive
	;; quantifiers (*+ and ++) and the single outer loop leave the matcher only
	;; one way to consume each run, so a failing line is rejected quickly.
	(def quoted-line-pattern
	  (re-pattern
	   (str
	    "("
	      "(?:"
	         "\"[^\"]*+\""  ;; matches "\"foo\nbar\"" (\n allowed between quotes)
	         "|"            ;;   OR
	         "[^\"\n]++"    ;; matches "anything like this" (no quote, no \n)
	       ")+"          ;; one or more of the above
	    ")"        ;; capturing-group (returns the contents from re-seq)
	    "(?:\n|$)" ;; matches either a newline or the end of the string
	    )))

	;; Returns the lines of `document` found by quoted-line-pattern.  Each
	;; re-seq match is [whole-match group-1]; `second` keeps only the captured
	;; group.
	(defn split-quoted-lines [document]
	  (->> document
	       (re-seq quoted-line-pattern)
	       (map second)))

	This would probably be a lot more readable, though longer, as a hand-written character-by-character parser.
	I just use regexes a lot (especially via sed and grep on the command line) and I find them pretty quick
	to write. Alas, they are terrifying to read for anything but the simplest problem. ;-)