Skip to content

Instantly share code, notes, and snippets.

@trevordixon
Created August 15, 2012 19:28
Show Gist options
  • Save trevordixon/3362830 to your computer and use it in GitHub Desktop.
Save trevordixon/3362830 to your computer and use it in GitHub Desktop.
Javascript CSV Parser generated by PEG.js
// Initializer block: declares the delimiter that the predicates in `line`
// and `field` compare characters against.
{
var separator = ',';
}
// Entry rule: delegates to `comma`.
start
= comma
// The `& { ... }` predicate assigns ',' to `separator`; the assigned value is a
// non-empty string, so the predicate succeeds and `sv` parses the records.
comma
= & { return separator = ','; } sv:sv { return sv; }
// Same as `comma`, but with a tab character as the delimiter.
tab
= & { return separator = '\t'; } sv:sv { return sv; }
// One or more lines separated by runs of \n / \r. Newline runs before the
// first line and after the last one are skipped. Returns an array of lines.
sv
= [\n\r]* first:line rest:([\n\r]+ data:line { return data; })* [\n\r]* { rest.unshift(first); return rest; }
// A field followed by zero or more (delimiter, field) pairs; returns an array
// of field strings. The trailing predicate rejects a line whose only field is
// the empty string.
// NOTE(review): a lone quoted empty field ("") also gives first === '' with an
// empty rest, so this predicate rejects it too - confirm whether that is intended.
line
= first:field rest:(char:. & { return char == separator; } text:field { return text; })*
& { return !!first || rest.length; }
{ rest.unshift(first); return rest; }
// Either a double-quoted field built from `char`s, or a possibly empty run of
// characters that are neither \n, \r nor the current delimiter.
field
= '"' text:char* '"' { return text.join(''); }
/ text:(char:[^\n\r] & { return char != separator; } { return char; })*
{ return text.join(''); }
// Inside quotes: "" stands for one literal quote; any other character except
// '"' (newlines included, since only the quote is excluded) is taken as is.
char
= '"' '"' { return '"'; }
/ [^"]
csvParser = (function(){
/*
* Generated by PEG.js 0.7.0.
*
* http://pegjs.majda.cz/
*/
/*
 * Returns |s| as a double-quoted JavaScript string literal.
 *
 * s - the string to quote.
 *
 * Backslash, the double quote and the common control characters use their
 * short escapes (\\, \", \b, \t, \n, \f, \r); every other control or
 * non-ASCII character is written as \xHH or \uHHHH.
 */
function quote(s) {
  /*
   * ECMA-262, 5th ed., 7.8.4: All characters may appear literally in a
   * string literal except for the closing quote character, backslash,
   * carriage return, line separator, paragraph separator, and line feed.
   * Any character may appear in the form of an escape sequence.
   *
   * For portability, we also escape all control and non-ASCII
   * characters. Note that "\0" and "\v" escape sequences are not used
   * because JSHint does not like the first and IE the second.
   */

  /*
   * Hex-escapes a single character: \xHH for codes up to 0xFF, \uHHHH
   * otherwise. This is deliberately a local helper: at this scope the bare
   * name |escape| refers to the host's global escape(), which would emit
   * URL-style "%HH" / "%uHHHH" sequences instead of JavaScript escapes.
   */
  function escapeChar(ch) {
    var charCode = ch.charCodeAt(0);
    var isByte = charCode <= 0xFF;
    var width = isByte ? 2 : 4;
    var hex = charCode.toString(16).toUpperCase();
    while (hex.length < width) {
      hex = '0' + hex;
    }
    return '\\' + (isByte ? 'x' : 'u') + hex;
  }

  return '"' + s
    .replace(/\\/g, '\\\\') // backslash
    .replace(/"/g, '\\"') // closing quote character
    .replace(/\x08/g, '\\b') // backspace
    .replace(/\t/g, '\\t') // horizontal tab
    .replace(/\n/g, '\\n') // line feed
    .replace(/\f/g, '\\f') // form feed
    .replace(/\r/g, '\\r') // carriage return
    .replace(/[\x00-\x07\x0B\x0E-\x1F\x80-\uFFFF]/g, escapeChar)
    + '"';
}
var result = {
/*
* Parses the input with a generated parser. If the parsing is successful,
* returns a value explicitly or implicitly specified by the grammar from
* which the parser was generated (see |PEG.buildParser|). If the parsing is
* unsuccessful, throws |PEG.parser.SyntaxError| describing the error.
*
* input     - the string to parse.
* startRule - optional name of the rule to start from; must be a key of
*             |parseFunctions| below, defaults to "comma".
*/
parse: function(input, startRule) {
/* Maps every rule name accepted as |startRule| to its parsing function. */
var parseFunctions = {
"comma": parse_comma,
"tab": parse_tab,
"sv": parse_sv,
"line": parse_line,
"field": parse_field,
"char": parse_char
};
/* Reject unknown rule names; fall back to "comma" when none is given. */
if (startRule !== undefined) {
if (parseFunctions[startRule] === undefined) {
throw new Error("Invalid rule name: " + quote(startRule) + ".");
}
} else {
startRule = "comma";
}
/* Current offset into |input|; advanced and rewound by the rule functions. */
var pos = 0;
/* The rule functions call matchFailed() only while this is 0. */
var reportFailures = 0;
/*
* Furthest offset at which a failure was recorded and the descriptions of
* what was expected there (both maintained by matchFailed()).
*/
var rightmostFailuresPos = 0;
var rightmostFailuresExpected = [];
/*
 * Prepends |padding| to |input| once for every character that |input| is
 * shorter than |length|; returns |input| unchanged when it is long enough.
 */
function padLeft(input, padding, length) {
  var missing = length - input.length;
  var padded = input;
  while (missing > 0) {
    padded = padding + padded;
    missing--;
  }
  return padded;
}
/*
 * Returns the hex escape sequence for the first character of |ch|:
 * "\xHH" when its code is at most 0xFF, "\uHHHH" otherwise. The hex digits
 * are upper-case and zero-padded with padLeft().
 */
function escape(ch) {
  var code = ch.charCodeAt(0);
  var fitsInByte = code <= 0xFF;
  var digits = code.toString(16).toUpperCase();
  var prefix = fitsInByte ? 'x' : 'u';
  var width = fitsInByte ? 2 : 4;
  return '\\' + prefix + padLeft(digits, '0', width);
}
/*
 * Records that |failure| was expected at the current |pos|. Only failures at
 * the furthest position reached so far are kept: a failure further right
 * discards the earlier list, one further left is ignored.
 */
function matchFailed(failure) {
  var isBehindRightmost = pos < rightmostFailuresPos;
  if (!isBehindRightmost) {
    if (pos > rightmostFailuresPos) {
      rightmostFailuresExpected = [];
      rightmostFailuresPos = pos;
    }
    rightmostFailuresExpected.push(failure);
  }
}
/*
 * Rule "comma": selects ',' as the field delimiter and parses the input with
 * parse_sv(). Returns the array of lines, or null with |pos| restored when
 * parse_sv() fails.
 */
function parse_comma() {
  var start = pos;
  // Semantic predicate from the grammar: the assigned value is a non-empty
  // string, hence always truthy, so the predicate itself cannot fail.
  separator = ',';
  var lines = parse_sv();
  if (lines === null) {
    pos = start;
  }
  return lines;
}
/*
 * Rule "tab": selects the tab character as the field delimiter and parses
 * the input with parse_sv(). Returns the array of lines, or null with |pos|
 * restored when parse_sv() fails.
 */
function parse_tab() {
  var start = pos;
  // Semantic predicate from the grammar: the assigned value is a non-empty
  // string, hence always truthy, so the predicate itself cannot fail.
  separator = '\t';
  var lines = parse_sv();
  if (lines === null) {
    pos = start;
  }
  return lines;
}
/*
 * Rule "sv": [\n\r]* line ([\n\r]+ line)* [\n\r]*
 *
 * Returns an array with one entry per parsed line (each entry is whatever
 * parse_line() returned), or null with |pos| restored when not even the
 * first line can be parsed.
 */
function parse_sv() {
  var start = pos;

  // Consumes a single \n or \r. On a miss the failure is reported (subject
  // to |reportFailures|) and null is returned without moving |pos|.
  function matchNewline() {
    var ch = input.charAt(pos);
    if (/^[\n\r]/.test(ch)) {
      pos++;
      return ch;
    }
    if (reportFailures === 0) {
      matchFailed("[\\n\\r]");
    }
    return null;
  }

  // Consumes a run of newline characters and returns how many were taken.
  function skipNewlines() {
    var count = 0;
    while (matchNewline() !== null) {
      count++;
    }
    return count;
  }

  skipNewlines();
  var firstLine = parse_line();
  if (firstLine === null) {
    pos = start;
    return null;
  }

  var lines = [];
  for (;;) {
    var mark = pos;
    // A further line must be preceded by at least one newline character.
    if (skipNewlines() === 0) {
      break;
    }
    var nextLine = parse_line();
    if (nextLine === null) {
      // Give the newlines back; the trailing skip below takes them instead.
      pos = mark;
      break;
    }
    lines.push(nextLine);
  }

  skipNewlines();
  lines.unshift(firstLine);
  return lines;
}
/*
 * Rule "line": field (separator field)*
 *
 * Returns an array of field strings, or null with |pos| restored when the
 * line is empty, i.e. when no input at all was consumed.
 *
 * NOTE: the grammar's predicate for this rule is |!!first || rest.length|,
 * which cannot tell "nothing here" apart from a quoted empty field (""), so a
 * line consisting solely of "" used to be rejected. This implementation tests
 * whether any input was consumed instead. Every line the original predicate
 * accepted is still accepted with the same result; the only additional lines
 * accepted are those made of one field that consumed input but produced the
 * empty string.
 */
function parse_line() {
  var start = pos;

  var first = parse_field();
  if (first === null) {
    pos = start;
    return null;
  }

  var rest = [];
  for (;;) {
    var mark = pos;
    if (input.length <= pos) {
      // The grammar matches "." here, so end of input is a reportable miss.
      if (reportFailures === 0) {
        matchFailed("any character");
      }
      break;
    }
    var ch = input.charAt(pos);
    pos++;
    // Only the current delimiter may introduce another field; anything else
    // is handed back for the caller (typically a newline).
    if (ch !== separator) {
      pos = mark;
      break;
    }
    var next = parse_field();
    if (next === null) {
      pos = mark;
      break;
    }
    rest.push(next);
  }

  // A line exists only if it consumed something: a non-empty field, a
  // delimiter, or the two quotes of an empty quoted field.
  if (pos === start) {
    return null;
  }

  rest.unshift(first);
  return rest;
}
/*
 * Rule "field": '"' char* '"'  /  ([^\n\r] that is not the separator)*
 *
 * Tries a double-quoted field first; if that does not match, rewinds and
 * takes the (possibly empty) run of characters up to the next newline,
 * separator or end of input. Returns the field text as a string.
 */
function parse_field() {
  var start = pos;

  // Alternative 1: quoted field.
  if (input.charCodeAt(pos) === 34) {
    pos++;
    var quotedChars = [];
    for (var c = parse_char(); c !== null; c = parse_char()) {
      quotedChars.push(c);
    }
    if (input.charCodeAt(pos) === 34) {
      pos++;
      return quotedChars.join('');
    }
    // Missing closing quote: report it where it was expected, then rewind
    // so the text is re-read as an unquoted field.
    if (reportFailures === 0) {
      matchFailed("\"\\\"\"");
    }
    pos = start;
  } else if (reportFailures === 0) {
    matchFailed("\"\\\"\"");
  }

  // Alternative 2: unquoted field.
  var plainChars = [];
  for (;;) {
    var ch = input.charAt(pos);
    if (!/^[^\n\r]/.test(ch)) {
      if (reportFailures === 0) {
        matchFailed("[^\\n\\r]");
      }
      break;
    }
    // Stopping at the separator is a predicate failure in the grammar and
    // therefore not reported through matchFailed().
    if (ch === separator) {
      break;
    }
    plainChars.push(ch);
    pos++;
  }
  return plainChars.join('');
}
/*
 * Rule "char": '"' '"'  /  [^"]
 *
 * Matches one character of a quoted field: a doubled quote yields a single
 * '"', any other character except '"' yields itself. Returns null without
 * consuming anything when neither alternative matches.
 */
function parse_char() {
  var start = pos;

  // Alternative 1: two consecutive double quotes.
  if (input.charCodeAt(pos) === 34) {
    pos++;
    if (input.charCodeAt(pos) === 34) {
      pos++;
      return '"';
    }
    // The second quote is reported as missing at the position after the
    // first one, before rewinding.
    if (reportFailures === 0) {
      matchFailed("\"\\\"\"");
    }
    pos = start;
  } else if (reportFailures === 0) {
    matchFailed("\"\\\"\"");
  }

  // Alternative 2: any single character other than a double quote.
  var ch = input.charAt(pos);
  if (/^[^"]/.test(ch)) {
    pos++;
    return ch;
  }
  if (reportFailures === 0) {
    matchFailed("[^\"]");
  }
  return null;
}
/*
 * Sorts |expected| in place and returns a new array holding its entries
 * with consecutive duplicates removed.
 */
function cleanupExpected(expected) {
  expected.sort();
  var unique = [];
  for (var index = 0; index < expected.length; index++) {
    var candidate = expected[index];
    // Compare against the entry kept last (null while nothing is kept yet).
    var lastKept = unique.length > 0 ? unique[unique.length - 1] : null;
    if (candidate !== lastKept) {
      unique.push(candidate);
    }
  }
  return unique;
}
/*
 * Translates the error offset, i.e. the larger of |pos| and
 * |rightmostFailuresPos|, into a 1-based { line, column } pair by scanning
 * |input| from the beginning.
 *
 * The input is walked character by character rather than split on newlines
 * because IE's |String.split| was too broken to rely on.
 */
function computeErrorPosition() {
  var limit = Math.max(pos, rightmostFailuresPos);
  var position = { line: 1, column: 1 };
  var afterCR = false;
  for (var index = 0; index < limit; index++) {
    switch (input.charAt(index)) {
      case "\n":
        // The \n of a \r\n pair belongs to the line break already counted.
        if (!afterCR) {
          position.line++;
        }
        position.column = 1;
        afterCR = false;
        break;
      case "\r":
      case "\u2028":
      case "\u2029":
        position.line++;
        position.column = 1;
        afterCR = true;
        break;
      default:
        position.column++;
        afterCR = false;
    }
  }
  return position;
}
/*
* Field delimiter consulted by parse_line() and parse_field(); parse_comma()
* and parse_tab() overwrite it before parsing.
*/
var separator = ',';
/* Run the selected start rule; this local |result| holds the parsed value. */
var result = parseFunctions[startRule]();
/*
* The parser is now in one of the following three states:
*
* 1. The parser successfully parsed the whole input.
*
* - |result !== null|
* - |pos === input.length|
* - |rightmostFailuresExpected| may or may not contain something
*
* 2. The parser successfully parsed only a part of the input.
*
* - |result !== null|
* - |pos < input.length|
* - |rightmostFailuresExpected| may or may not contain something
*
* 3. The parser did not successfully parse any part of the input.
*
* - |result === null|
* - |pos === 0|
* - |rightmostFailuresExpected| contains at least one failure
*
* All code following this comment (including called functions) must
* handle these states.
*/
/* States 2 and 3 are both reported as a syntax error. */
if (result === null || pos !== input.length) {
/* Error offset: the furthest point reached, by success or by failure. */
var offset = Math.max(pos, rightmostFailuresPos);
/* |found| is null when the error is at the end of the input. */
var found = offset < input.length ? input.charAt(offset) : null;
var errorPosition = computeErrorPosition();
/*
* NOTE(review): the constructor is looked up on |this|, so parse() has to be
* invoked as a method of the parser object for this throw to work.
*/
throw new this.SyntaxError(
cleanupExpected(rightmostFailuresExpected),
found,
offset,
errorPosition.line,
errorPosition.column
);
}
return result;
},
/*
* Returns the parser source code.
*
* NOTE(review): nothing in the visible code assigns |_source|, so this returns
* undefined unless the property is set from outside - confirm.
*/
toSource: function() { return this._source; }
};
/*
 * Thrown when a parser encounters a syntax error.
 *
 * expected - array of descriptions of what would have been accepted at the
 *            error offset (empty when only the end of input was acceptable)
 * found    - the character found at the error offset, or null at end of input
 * offset   - 0-based offset of the error in the input
 * line     - 1-based line of the error
 * column   - 1-based column of the error
 *
 * All five values are stored on the instance, together with |name| and a
 * human-readable |message|.
 */
result.SyntaxError = function(expected, found, offset, line, column) {
  /* Builds the message 'Expected A, B or C but "x" found.' */
  function buildMessage(expected, found) {
    var expectedHumanized, foundHumanized;
    switch (expected.length) {
      case 0:
        expectedHumanized = "end of input";
        break;
      case 1:
        expectedHumanized = expected[0];
        break;
      default:
        expectedHumanized = expected.slice(0, expected.length - 1).join(", ")
          + " or "
          + expected[expected.length - 1];
    }
    foundHumanized = found ? quote(found) : "end of input";
    return "Expected " + expectedHumanized + " but " + foundHumanized + " found.";
  }
  this.name = "SyntaxError";
  this.expected = expected;
  this.found = found;
  this.message = buildMessage(expected, found);
  this.offset = offset;
  this.line = line;
  this.column = column;
};
/*
 * Inherit from Error through an intermediate object instead of reusing
 * |Error.prototype| itself. Sharing the prototype object would make every
 * Error instance pass |instanceof result.SyntaxError|, so callers could not
 * tell parse errors apart from other exceptions. The surrogate constructor
 * keeps this working on engines without |Object.create|.
 */
(function() {
  function Surrogate() { this.constructor = result.SyntaxError; }
  Surrogate.prototype = Error.prototype;
  result.SyntaxError.prototype = new Surrogate();
})();
return result;
})();
@martindrapeau
Copy link

martindrapeau commented Apr 3, 2018

Doesn't seem to properly handle backslash-escaped quotes, yet the error message reports that it expects a quote? Furthermore, shouldn't spaces following the comma be ignored when double quotes are present? Here are some test cases:

  1. csvParser.parse('"man \" df"')
    Returns an error: Expected "\"" or [\n\r] but " " found.

  2. csvParser.parse(' "man \" df"')
    Returns: " "man " df""
    Should it not ignore the space here?

  3. csvParser.parse('1, "man \" df"')
    Returns: "1", " "man " df""
    Should it not ignore the space here?

  4. csvParser.parse('1,"man "" df"')
    Returns: "1", "man " df"
    Two double quotes seem to be well supported.

@njamescouk
Copy link

""

fails, which seems strange?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment