Skip to content

Instantly share code, notes, and snippets.

@Gopikrishna19
Created July 22, 2022 14:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Gopikrishna19/5104160c50827aa0366997b954f9f701 to your computer and use it in GitHub Desktop.
Save Gopikrishna19/5104160c50827aa0366997b954f9f701 to your computer and use it in GitHub Desktop.
CSV Parser - Finite state machine - JavaScript
/**
 * Names of the parser states. Each key maps to an identical string so the
 * values can be used directly as keys of the transition table.
 */
const states = {
  startField: 'startField',
  startRow: 'startRow',
  escaped: 'escaped',
  nonQuoted: 'nonQuoted',
  quoted: 'quoted',
};

/**
 * End-of-input sentinel fed to the machine after the last character.
 * Iterating a string yields one code point at a time, so a three-character
 * value can never be confused with real input.
 */
const EOF = 'eof';

/**
 * Parses CSV text into rows of string fields using a character-driven
 * finite state machine.
 *
 * Behavior:
 * - Fields are separated by `,`; rows end at `\n`, `\r` or `\r\n`.
 * - Line breaks seen at the start of a row (blank lines, the `\n` of a
 *   `\r\n` pair) are skipped and never produce a row.
 * - A field starting with `"` is quoted: it may contain commas and line
 *   breaks, and `""` inside it stands for one literal quote.
 * - A quote inside an unquoted field is kept as ordinary data.
 * - A trailing comma at the end of input yields a final empty field, the
 *   same as a trailing comma followed by a line break.
 *
 * @param {string} content - CSV text to parse.
 * @returns {string[][]} One array of field values per row.
 * @throws {Error} 'Invalid end of state' when a quoted field is still open at
 *   the end of input, or when a closing quote is followed by anything other
 *   than `"`, `,`, a line break or the end of input.
 */
function parseCsv(content) {
  const data = [];
  let fieldBuffer = [];

  const isLineBreak = (char) => char === '\n' || char === '\r';

  // Opens a new, empty row.
  const startRow = () => {
    data.push([]);
  };

  // Flushes the buffered characters into the current row as one field.
  const endField = () => {
    data[data.length - 1].push(fieldBuffer.join(''));
    fieldBuffer = [];
  };

  const machineError = () => new Error('Invalid end of state');

  // Transition table: each handler consumes one character (or EOF) and
  // returns the name of the next state; the return value for EOF is unused.
  const machine = {
    // Waiting for the first character of a row.
    [states.startRow](char) {
      if (char === EOF) {
        return undefined;
      }
      // Skip every kind of line break here, not just '\n'; otherwise a bare
      // '\r' (blank CR/CRLF line) would be stored as field data.
      if (isLineBreak(char)) {
        return states.startRow;
      }
      startRow();
      if (char === ',') {
        endField();
        return states.startField;
      }
      if (char === '"') {
        return states.quoted;
      }
      fieldBuffer.push(char);
      return states.nonQuoted;
    },

    // Just after a comma: waiting for the first character of a field.
    [states.startField](char) {
      if (char === EOF) {
        // Input ended right after a comma: emit the trailing empty field so
        // 'a,' parses the same way as 'a,\n'.
        endField();
        return undefined;
      }
      if (char === ',') {
        endField();
        return states.startField;
      }
      if (char === '"') {
        return states.quoted;
      }
      if (isLineBreak(char)) {
        endField();
        return states.startRow;
      }
      fieldBuffer.push(char);
      return states.nonQuoted;
    },

    // Just after a quote inside a quoted field: either the field is closing
    // or this is the first half of an escaped quote ("").
    [states.escaped](char) {
      if (char === EOF) {
        endField();
        return undefined;
      }
      if (char === '"') {
        fieldBuffer.push(char);
        return states.quoted;
      }
      if (char === ',') {
        endField();
        return states.startField;
      }
      if (isLineBreak(char)) {
        endField();
        return states.startRow;
      }
      throw machineError();
    },

    // Inside a quoted field: everything except a quote is data.
    [states.quoted](char) {
      if (char === EOF) {
        throw machineError();
      }
      if (char === '"') {
        return states.escaped;
      }
      fieldBuffer.push(char);
      return states.quoted;
    },

    // Inside an unquoted field.
    [states.nonQuoted](char) {
      if (char === EOF) {
        endField();
        return undefined;
      }
      if (char === ',') {
        endField();
        return states.startField;
      }
      if (isLineBreak(char)) {
        endField();
        return states.startRow;
      }
      fieldBuffer.push(char);
      return states.nonQuoted;
    },
  };

  let state = states.startRow;
  for (const char of content) {
    state = machine[state](char);
  }
  // Let the final state flush (or reject) whatever is still pending.
  machine[state](EOF);

  return data;
}

// Export through CommonJS when it is available; the guard keeps the file
// loadable in environments where `module` is not defined.
if (typeof module !== 'undefined' && module.exports) {
  module.exports.csvParser = parseCsv;
}
// JEST
const {csvParser} = require('./index');

describe('parser', () => {
  // Parses `input` and checks the result against the expected rows.
  const expectRows = (input, rows) => {
    expect(csvParser(input)).toEqual(rows);
  };

  // Runs expectRows over a list of [input, expectedRows] pairs, in order.
  const expectAll = (cases) => {
    for (const [input, rows] of cases) {
      expectRows(input, rows);
    }
  };

  it('should handle no content', () => {
    expectRows('', []);
  });

  it('should handle one word', () => {
    expectAll([
      ['hello', [['hello']]],
      ['"hello"', [['hello']]],
    ]);
  });

  it('should handle two words', () => {
    expectAll([
      ['hello,world', [['hello', 'world']]],
      ['hello,"world"', [['hello', 'world']]],
    ]);
  });

  it('should handle escaped words', () => {
    expectAll([
      ['hello,world""', [['hello', 'world""']]],
      ['hello,"""world"""', [['hello', '"world"']]],
    ]);
  });

  it('should handle empty words', () => {
    expectAll([
      ['hello,,world', [['hello', '', 'world']]],
      ['"hello",,world', [['hello', '', 'world']]],
      [',world', [['', 'world']]],
      [',\n', [['', '']]],
      [',\r', [['', '']]],
      [',\r\n', [['', '']]],
    ]);
  });

  // The same three two-row documents are checked once per line-break style.
  const lineBreakSuites = [
    ['should handle lf', '\n'],
    ['should handle cr', '\r'],
    ['should handle crlf', '\r\n'],
  ];
  for (const [title, eol] of lineBreakSuites) {
    it(title, () => {
      expectAll([
        [`hello${eol}"hello"`, [['hello'], ['hello']]],
        [`hello,world${eol}hello,world`, [['hello', 'world'], ['hello', 'world']]],
        [`hello,,world${eol}"hello",,world`, [['hello', '', 'world'], ['hello', '', 'world']]],
      ]);
    });
  }

  it('should handle all', () => {
    // Builds the sample document; the last row ends with a trailing comma
    // followed by one more line break.
    const buildSample = (lineBreak) => {
      const lines = [
        'hello,world,"this,is,good"',
        'this,has,numbers,1234',
        'this,has,special,characters,!@#$%^&*()\'[]{}./\\|-=_+<>?',
        'hello,world,"this,is,escaped""quotation"""',
        'this,,has,,spaces',
        'this has spaces',
        `and,"trailing,spaces",with,eol,${lineBreak}`,
      ];
      return lines.join(lineBreak);
    };

    const expected = [
      ['hello', 'world', 'this,is,good'],
      ['this', 'has', 'numbers', '1234'],
      ['this', 'has', 'special', 'characters', '!@#$%^&*()\'[]{}./\\|-=_+<>?'],
      ['hello', 'world', 'this,is,escaped"quotation"'],
      ['this', '', 'has', '', 'spaces'],
      ['this has spaces'],
      ['and', 'trailing,spaces', 'with', 'eol', ''],
    ];

    for (const lineBreak of ['\n', '\r', '\r\n']) {
      expectRows(buildSample(lineBreak), expected);
    }
  });
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment