Last active
October 21, 2021 23:18
-
-
Save carymrobbins/b8e7340ce63a22e8d33c to your computer and use it in GitHub Desktop.
Parse CSV strings with PostgreSQL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
create or replace function parse_csv(s text, raise_on_error bool default true) returns text[] | |
immutable strict language plpgsql as $$ | |
declare | |
result text[] = '{}'; | |
len int = char_length(s); | |
i int = 1; | |
pos int; | |
start_pos int; | |
c char; | |
begin | |
if len = 0 then | |
return result; | |
end if; | |
while i <= len + 1 loop | |
-- If this element starts with a quote, consume until the next unescaped quote. | |
if substr(s, i) like '"%' then | |
i := i + 1; | |
start_pos := i; | |
while true loop | |
-- Find the next quote, it could be an escaped quote or the end of this element. | |
pos := position('"' in substr(s, i)); | |
-- If we can't find another quote, the csv is malformed. | |
if pos = 0 then | |
if raise_on_error then | |
raise exception 'Unable to parse csv, expected string terminator: %', s; | |
else | |
return null; | |
end if; | |
end if; | |
i := i + pos; | |
c := substr(s, i, 1); | |
-- If next char is a " then we are just escaping a quote. | |
if c = '"' then | |
i := i + 1; | |
continue; | |
else | |
-- Otherwise, we'd expect a comma, terminating this field. | |
if c in (',', '') then | |
-- Append this element, unescaping any quotes. | |
result := array_append(result, | |
regexp_replace(substr(s, start_pos, i - start_pos - 1), '""', '"', 'g') | |
); | |
i := i + 1; | |
exit; | |
else | |
if raise_on_error then | |
raise exception 'Unable to parse csv, expected comma or EOF, got ''%'' instead: %', c, s; | |
else | |
return null; | |
end if; | |
end if; | |
end if; | |
end loop; | |
else | |
start_pos := i; | |
pos := position(',' in substr(s, start_pos)); | |
if pos = 0 then | |
result := array_append(result, substr(s, start_pos)); | |
exit; | |
else | |
i := start_pos + pos; | |
result := array_append(result, substr(s, start_pos, greatest(i - start_pos - 1, 0))); | |
end if; | |
end if; | |
end loop; | |
return result; | |
end | |
$$; | |
-- Poor man's test suite. | |
with cte(s, expected) as (values | |
('foo,bar,baz', array['foo','bar','baz']), | |
('"foo",bar,baz', array['foo','bar','baz']), | |
('foo,bar,', array['foo','bar','']), | |
('foo,"bar,baz",quux', array['foo','bar,baz','quux']), | |
('"bar,baz",,quux', array['bar,baz','','quux']), | |
('"","foo ""bar"" baz",quux,"""spam"""', array['', 'foo "bar" baz', 'quux', '"spam"']), | |
('"""foo"""', array['"foo"']), | |
('', array[]::text[]), | |
-- Malformed csv | |
('foo,"bar', null), | |
('"foo,bar,baz', null), | |
('"foo,bar","""baz quux""', null) | |
), parsed as ( | |
select s, parse_csv(s, false) as actual, expected from cte | |
), test_suite as ( | |
select s as test_case, actual, expected, actual is not distinct from expected as passed | |
from parsed | |
) | |
select test_case, | |
case when passed then 'pass' else 'FAIL' end as result, | |
case when passed then null else expected end as expected, | |
case when passed then null else actual end as actual | |
from test_suite; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://github.com/rin-nas/postgresql-patterns-library/blob/master/functions/csv_parse.sql