Skip to content

Instantly share code, notes, and snippets.

@trevordixon
Created August 15, 2012 19:28
Show Gist options
  • Save trevordixon/3362830 to your computer and use it in GitHub Desktop.
Save trevordixon/3362830 to your computer and use it in GitHub Desktop.
Javascript CSV Parser generated by PEG.js
// Initializer block: state shared by all rule actions and predicates.
// `separator` is the field delimiter currently in effect; the `comma` and
// `tab` rules overwrite it before any field is parsed.
{
var separator = ',';
}
// Entry rule: input is treated as comma-separated by default.
start
= comma
// Comma-separated values. The `& { ... }` semantic predicate is used only for
// its side effect (selecting the separator); the assigned value ',' is truthy,
// so the predicate always succeeds.
comma
= & { return separator = ','; } sv:sv { return sv; }
// Tab-separated values; same technique as `comma`, with '\t' as the separator.
tab
= & { return separator = '\t'; } sv:sv { return sv; }
// A whole document: optional leading newline characters, one line, any number
// of further lines each preceded by one or more newline characters, then
// optional trailing newline characters. Yields an array of lines.
sv
= [\n\r]* first:line rest:([\n\r]+ data:line { return data; })* [\n\r]* { rest.unshift(first); return rest; }
// One record: a field followed by zero or more (separator, field) pairs.
// The separator is matched as "any character" plus a predicate comparing it
// to the current `separator`. Yields an array of field strings.
// The trailing predicate rejects a match whose first field is the empty string
// and which has no further fields, so an empty line is not a record.
// NOTE(review): a quoted empty field also yields '', so a line consisting only
// of `""` is rejected by this predicate as well.
line
= first:field rest:(char:. & { return char == separator; } text:field { return text; })*
& { return !!first || rest.length; }
{ rest.unshift(first); return rest; }
// One field: either a double-quoted run of `char` (which may contain the
// separator and newline characters), or an unquoted run of characters that
// are neither newline characters nor the current separator. Yields a string.
field
= '"' text:char* '"' { return text.join(''); }
/ text:(char:[^\n\r] & { return char != separator; } { return char; })*
{ return text.join(''); }
// One character inside a quoted field: a doubled quote stands for a single
// literal quote; otherwise any character except a double quote.
char
= '"' '"' { return '"'; }
/ [^"]
csvParser = (function(){
/*
* Generated by PEG.js 0.7.0.
*
* http://pegjs.majda.cz/
*/
/*
 * Returns |s| as a double-quoted JavaScript string literal, suitable for
 * embedding in error messages.
 *
 * @param {string} s  Text to quote.
 * @returns {string}  The quoted, escaped literal (including the quotes).
 */
function quote(s) {
  /*
   * ECMA-262, 5th ed., 7.8.4: All characters may appear literally in a
   * string literal except for the closing quote character, backslash,
   * carriage return, line separator, paragraph separator, and line feed.
   * Any character may appear in the form of an escape sequence.
   *
   * For portability, we also escape all control and non-ASCII
   * characters. Note that "\0" and "\v" escape sequences are not used
   * because JSHint does not like the first and IE the second.
   */

  /*
   * Formats one character as "\xHH" (code <= 0xFF) or "\uHHHH". This helper
   * is deliberately local: a bare |escape| reference in this scope resolves to
   * the deprecated global escape(), which emits "%HH" / "%uHHHH" instead.
   */
  function escapeChar(ch) {
    var code = ch.charCodeAt(0);
    var isByte = code <= 0xFF;
    var width = isByte ? 2 : 4;
    var hex = code.toString(16).toUpperCase();
    while (hex.length < width) {
      hex = '0' + hex;
    }
    return '\\' + (isByte ? 'x' : 'u') + hex;
  }

  return '"' + s
    .replace(/\\/g, '\\\\') // backslash
    .replace(/"/g, '\\"') // closing quote character
    .replace(/\x08/g, '\\b') // backspace
    .replace(/\t/g, '\\t') // horizontal tab
    .replace(/\n/g, '\\n') // line feed
    .replace(/\f/g, '\\f') // form feed
    .replace(/\r/g, '\\r') // carriage return
    .replace(/[\x00-\x07\x0B\x0E-\x1F\x80-\uFFFF]/g, escapeChar)
    + '"';
}
var result = {
/*
* Parses the input with a generated parser. If the parsing is successful,
* returns a value explicitly or implicitly specified by the grammar from
* which the parser was generated (see |PEG.buildParser|). If the parsing is
* unsuccessful, throws |PEG.parser.SyntaxError| describing the error.
*/
parse: function(input, startRule) {
/* Maps every rule name accepted as |startRule| to its parse function. */
var parseFunctions = {
  "comma": parse_comma,
  "tab": parse_tab,
  "sv": parse_sv,
  "line": parse_line,
  "field": parse_field,
  "char": parse_char
};
if (startRule !== undefined) {
  /*
   * Own-property check: a plain lookup would also find names inherited from
   * Object.prototype (e.g. "toString", "constructor") and then call them as
   * if they were grammar rules.
   */
  if (!Object.prototype.hasOwnProperty.call(parseFunctions, startRule)) {
    throw new Error("Invalid rule name: " + quote(String(startRule)) + ".");
  }
} else {
  /* No rule requested: parse as comma-separated values. */
  startRule = "comma";
}
/* Current offset into |input|; advanced and rewound by the parse functions. */
var pos = 0;
/* Failures are recorded by the parse functions only while this is 0. */
var reportFailures = 0;
/* Furthest offset at which a match failed, and what was expected there. */
var rightmostFailuresPos = 0;
var rightmostFailuresExpected = [];
/*
 * Prepends |padding| to |input| once for every character |input| is short of
 * |length|. Returns |input| unchanged when it is already long enough.
 */
function padLeft(input, padding, length) {
  var padded = input;
  var missing = length - input.length;
  while (missing > 0) {
    padded = padding + padded;
    missing--;
  }
  return padded;
}
/*
 * Formats the first character of |ch| as an escape sequence: "\xHH" when its
 * code is at most 0xFF, "\uHHHH" otherwise. Hex digits are upper-case and
 * zero-padded to the full width.
 */
function escape(ch) {
  var code = ch.charCodeAt(0);
  var fitsInByte = code <= 0xFF;
  var hexDigits = code.toString(16).toUpperCase();
  var prefix = fitsInByte ? 'x' : 'u';
  var width = fitsInByte ? 2 : 4;
  return '\\' + prefix + padLeft(hexDigits, '0', width);
}
/*
 * Records |failure| as something that was expected at the current position.
 * Only failures at the rightmost position reached so far are kept: a failure
 * to the left of it is ignored, and one further right discards the old list.
 */
function matchFailed(failure) {
  if (pos >= rightmostFailuresPos) {
    if (pos > rightmostFailuresPos) {
      rightmostFailuresPos = pos;
      rightmostFailuresExpected = [];
    }
    rightmostFailuresExpected.push(failure);
  }
}
/*
 * Rule |comma|: selects ',' as the field separator, then parses the input
 * with |parse_sv|. Returns the array of lines, or null (with |pos| restored)
 * when |parse_sv| fails.
 */
function parse_comma() {
  var startPos = pos;

  /* The grammar's predicate exists only for this assignment; ',' is truthy,
   * so the predicate itself can never fail. */
  separator = ',';

  var lines = parse_sv();
  if (lines === null) {
    pos = startPos;
    return null;
  }
  return lines;
}
/*
 * Rule |tab|: selects '\t' as the field separator, then parses the input
 * with |parse_sv|. Returns the array of lines, or null (with |pos| restored)
 * when |parse_sv| fails.
 */
function parse_tab() {
  var startPos = pos;

  /* The grammar's predicate exists only for this assignment; '\t' is truthy,
   * so the predicate itself can never fail. */
  separator = '\t';

  var lines = parse_sv();
  if (lines === null) {
    pos = startPos;
    return null;
  }
  return lines;
}
/*
 * Rule |sv|: optional newline characters, a line, any number of further lines
 * each preceded by one or more newline characters, then optional trailing
 * newline characters.
 *
 * Returns an array of lines (each an array of field strings), or null with
 * |pos| restored when the first line cannot be parsed.
 */
function parse_sv() {
  /* Consumes one "\n" or "\r"; records a failure and returns null otherwise. */
  function matchNewline() {
    var ch = input.charAt(pos);
    if (/^[\n\r]/.test(ch)) {
      pos++;
      return ch;
    }
    if (reportFailures === 0) {
      matchFailed("[\\n\\r]");
    }
    return null;
  }

  /* Consumes as many newline characters as possible; returns how many. */
  function skipNewlines() {
    var consumed = 0;
    while (matchNewline() !== null) {
      consumed++;
    }
    return consumed;
  }

  /*
   * Parses "one or more newline characters, then a line". Returns the line,
   * or null with |pos| rewound to where this attempt started.
   */
  function parseFollowingLine() {
    var attemptStart = pos;
    if (skipNewlines() === 0) {
      return null;
    }
    var parsedLine = parse_line();
    if (parsedLine === null) {
      pos = attemptStart;
    }
    return parsedLine;
  }

  var svStart = pos;

  skipNewlines();

  var firstLine = parse_line();
  if (firstLine === null) {
    pos = svStart;
    return null;
  }

  var lines = [firstLine];
  var nextLine = parseFollowingLine();
  while (nextLine !== null) {
    lines.push(nextLine);
    nextLine = parseFollowingLine();
  }

  skipNewlines();

  return lines;
}
/*
 * Rule |line|: a field followed by zero or more (separator, field) pairs.
 *
 * Returns an array of field strings, or null with |pos| unchanged when no
 * record is present at the current position.
 *
 * A record is accepted only if it consumed at least one character, so an
 * empty line is still not a record. This deliberately differs from the
 * grammar's predicate |!!first || rest.length|, which also rejected a line
 * consisting solely of a quoted empty field ("") because that field's value
 * is the empty string. Such a line is a valid one-field record.
 */
function parse_line() {
  var lineStart = pos;

  var firstField = parse_field();
  if (firstField === null) {
    pos = lineStart;
    return null;
  }

  var fields = [firstField];
  for (;;) {
    var pairStart = pos;

    /* The grammar matches "any character" here, which fails only at the end
     * of the input. */
    if (pos >= input.length) {
      if (reportFailures === 0) {
        matchFailed("any character");
      }
      break;
    }

    /* Only the current separator continues the record; anything else (such
     * as a newline character) is left unconsumed for the caller. */
    if (input.charAt(pos) !== separator) {
      break;
    }
    pos++;

    var nextField = parse_field();
    if (nextField === null) {
      pos = pairStart;
      break;
    }
    fields.push(nextField);
  }

  /* Nothing consumed: the first field matched zero characters and no
   * separator followed, i.e. this is an empty line rather than a record. */
  if (pos === lineStart) {
    return null;
  }

  return fields;
}
/*
 * Rule |field|: either a double-quoted run of |char|, or an unquoted run of
 * characters that are neither newline characters nor the current separator.
 *
 * Returns the field text as a string. The unquoted alternative may match
 * zero characters, in which case the result is the empty string.
 */
function parse_field() {
  var fieldStart = pos;
  var ch;

  /* Alternative 1: '"' char* '"' */
  if (input.charCodeAt(pos) === 34) {
    pos++;
    var quotedChars = [];
    for (ch = parse_char(); ch !== null; ch = parse_char()) {
      quotedChars.push(ch);
    }
    if (input.charCodeAt(pos) === 34) {
      pos++;
      return quotedChars.join('');
    }
    /* Missing closing quote: record the failure where it happened, then
     * rewind so the unquoted alternative starts from the field's beginning. */
    if (reportFailures === 0) {
      matchFailed("\"\\\"\"");
    }
    pos = fieldStart;
  } else if (reportFailures === 0) {
    matchFailed("\"\\\"\"");
  }

  /* Alternative 2: ([^\n\r] that is not the separator)* */
  var plainChars = [];
  for (;;) {
    ch = input.charAt(pos);
    if (!/^[^\n\r]/.test(ch)) {
      if (reportFailures === 0) {
        matchFailed("[^\\n\\r]");
      }
      break;
    }
    /* The separator ends the field and is left unconsumed; this is a
     * predicate failure, so no expectation is recorded for it. */
    if (ch === separator) {
      break;
    }
    plainChars.push(ch);
    pos++;
  }
  return plainChars.join('');
}
/*
 * Rule |char|: one character inside a quoted field. Two consecutive double
 * quotes yield a single '"'; otherwise any character except a double quote
 * is returned as is. Returns null, with |pos| unchanged, when neither
 * alternative matches.
 */
function parse_char() {
  var charStart = pos;

  /* Alternative 1: '"' '"' */
  if (input.charCodeAt(pos) === 34) {
    pos++;
    if (input.charCodeAt(pos) === 34) {
      pos++;
      return '"';
    }
    /* A lone quote: record the failure at the second position, then rewind. */
    if (reportFailures === 0) {
      matchFailed("\"\\\"\"");
    }
    pos = charStart;
  } else if (reportFailures === 0) {
    matchFailed("\"\\\"\"");
  }

  /* Alternative 2: [^"] */
  var ch = input.charAt(pos);
  if (/^[^"]/.test(ch)) {
    pos++;
    return ch;
  }
  if (reportFailures === 0) {
    matchFailed("[^\"]");
  }
  return null;
}
/*
 * Sorts |expected| in place and returns a new array holding its entries with
 * consecutive duplicates removed.
 */
function cleanupExpected(expected) {
  expected.sort();

  var distinct = [];
  var previous = null;
  var index = 0;
  while (index < expected.length) {
    var candidate = expected[index];
    index++;
    if (candidate === previous) {
      continue;
    }
    distinct.push(candidate);
    previous = candidate;
  }
  return distinct;
}
/*
 * Converts the error offset (the larger of |pos| and |rightmostFailuresPos|)
 * into a 1-based { line, column } pair by scanning |input| from the start.
 */
function computeErrorPosition() {
  /*
   * The first idea was to use |String.split| to break the input up to the
   * error position along newlines and derive the line and column from
   * there. However IE's |split| implementation is so broken that it was
   * enough to prevent it.
   */
  var line = 1;
  var column = 1;
  /* True right after "\r", "\u2028" or "\u2029", so that a following "\n"
   * is not counted as a second line break. */
  var seenCR = false;
  var errorOffset = Math.max(pos, rightmostFailuresPos);

  for (var i = 0; i < errorOffset; i++) {
    switch (input.charAt(i)) {
      case "\n":
        if (!seenCR) {
          line++;
        }
        column = 1;
        seenCR = false;
        break;
      case "\r":
      case "\u2028":
      case "\u2029":
        line++;
        column = 1;
        seenCR = true;
        break;
      default:
        column++;
        seenCR = false;
    }
  }

  return { line: line, column: column };
}
/*
 * State from the grammar's initializer: the field separator currently in
 * effect. |parse_comma| and |parse_tab| overwrite it before parsing fields.
 */
var separator = ',';
/* Run the selected start rule; it returns null when nothing could be parsed. */
var result = parseFunctions[startRule]();
/*
* The parser is now in one of the following three states:
*
* 1. The parser successfully parsed the whole input.
*
* - |result !== null|
* - |pos === input.length|
* - |rightmostFailuresExpected| may or may not contain something
*
* 2. The parser successfully parsed only a part of the input.
*
* - |result !== null|
* - |pos < input.length|
* - |rightmostFailuresExpected| may or may not contain something
*
* 3. The parser did not successfully parse any part of the input.
*
* - |result === null|
* - |pos === 0|
* - |rightmostFailuresExpected| contains at least one failure
*
* All code following this comment (including called functions) must
* handle these states.
*/
/* States 2 and 3 are both reported as a syntax error. */
if (result === null || pos !== input.length) {
/* Report at the furthest offset reached, which may lie beyond |pos|. */
var offset = Math.max(pos, rightmostFailuresPos);
/* |found| is null when the error is at the end of the input. */
var found = offset < input.length ? input.charAt(offset) : null;
var errorPosition = computeErrorPosition();
/*
 * NOTE(review): |this.SyntaxError| is looked up on the receiver, so this
 * relies on |parse| being invoked as a method of the parser object.
 */
throw new this.SyntaxError(
cleanupExpected(rightmostFailuresExpected),
found,
offset,
errorPosition.line,
errorPosition.column
);
}
return result;
},
/*
 * Returns the parser source code, read from |this._source|.
 * NOTE(review): nothing in the visible code assigns |_source|, so this
 * presumably returns undefined unless a caller sets it -- confirm.
 */
toSource: function() { return this._source; }
};
/*
 * Thrown when a parser encounters a syntax error.
 *
 * @param {string[]} expected  Descriptions of what could have matched at the
 *                             error position (empty when only the end of the
 *                             input was acceptable).
 * @param {?string} found      The character at the error position, or null
 *                             at the end of the input.
 * @param {number} offset      0-based offset of the error in the input.
 * @param {number} line        1-based line of the error.
 * @param {number} column      1-based column of the error.
 */
result.SyntaxError = function(expected, found, offset, line, column) {
  /* Builds "Expected A, B or C but "x" found." from the raw details. */
  function buildMessage(expected, found) {
    var expectedHumanized;
    var foundHumanized;

    switch (expected.length) {
      case 0:
        expectedHumanized = "end of input";
        break;
      case 1:
        expectedHumanized = expected[0];
        break;
      default:
        expectedHumanized = expected.slice(0, expected.length - 1).join(", ")
          + " or "
          + expected[expected.length - 1];
    }

    foundHumanized = found ? quote(found) : "end of input";

    return "Expected " + expectedHumanized + " but " + foundHumanized + " found.";
  }

  this.name = "SyntaxError";
  this.expected = expected;
  this.found = found;
  this.message = buildMessage(expected, found);
  this.offset = offset;
  this.line = line;
  this.column = column;
};
/*
 * Give SyntaxError its own prototype object that inherits from
 * Error.prototype. Assigning Error.prototype itself would make every Error
 * an |instanceof| this SyntaxError, so callers could not tell parse errors
 * apart from other errors. A surrogate constructor is used instead of
 * Object.create to stay within the ES3 feature set used by this file.
 */
result.SyntaxError.prototype = (function() {
  function Surrogate() {}
  Surrogate.prototype = Error.prototype;
  var proto = new Surrogate();
  proto.constructor = result.SyntaxError;
  return proto;
})();
return result;
})();
@njamescouk
Copy link

""

fails, which seems strange?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment