Skip to content

Instantly share code, notes, and snippets.

@samkcarlile
Last active July 21, 2021 17:47
Show Gist options
  • Save samkcarlile/485f1bff7f4686c18cbd6cdeba61b416 to your computer and use it in GitHub Desktop.
Save samkcarlile/485f1bff7f4686c18cbd6cdeba61b416 to your computer and use it in GitHub Desktop.
Parsing a multi-line CSV
#!/usr/bin/env node
const fs = require('fs');
const path = require('path');
// Source CSV and destination JSON both sit one directory above this script.
const tweetCSVFile = path.resolve(__dirname, '../all_tweets_classified.csv');
const outFile = path.resolve(__dirname, '../all_tweets_classified.json');

// Converters applied to each column's raw string while building the records.
const columnConverters = {
  msg_id: String,
  user_id: Number,
  text: String,
};

// NOTE(review): these statements run at module load, before any `const`/`let`
// declared lower in the file has been initialised, so the parser functions
// called here must not read such bindings.
const parsedCsv = parseCSV(tweetCSVFile); // [header, rows]
const jsonData = csvToJson(parsedCsv, columnConverters);

// Pretty-print with a two-space indent and write the result next to the CSV.
fs.writeFileSync(outFile, JSON.stringify(jsonData, null, 2));
///////////////////////////////////////////////////////////
/**
 * Converts parsed CSV rows into an array of plain objects keyed by column name.
 *
 * @param {[string[], string[][]]} table - `[header, rows]` pair: the column
 *   names and one array of raw string values per record.
 * @param {Object<string, function(string): *>} [fieldProcess] - optional map
 *   from column name to a converter called with the raw value. When a column
 *   has no converter, or the converter returns `null`/`undefined`, the raw
 *   value is kept.
 * @returns {Object[]} one object per row, with keys taken from `header`.
 */
function csvToJson([header, rows], fieldProcess) {
  // Only the map's OWN properties count as converters. A plain property read
  // would also find inherited members such as `toString` or `constructor`, and
  // a CSV column with one of those names would have its values mangled.
  const ownConverter = (fieldName) =>
    fieldProcess != null &&
    Object.prototype.hasOwnProperty.call(fieldProcess, fieldName)
      ? fieldProcess[fieldName]
      : undefined;

  return rows.map((row) => {
    // Collect entries and build the object once instead of re-spreading the
    // accumulator for every column (which is quadratic in the column count).
    const entries = [];
    row.forEach((rawValue, i) => {
      const fieldName = header[i];
      const value = ownConverter(fieldName)?.(rawValue) ?? rawValue;
      entries.push([fieldName, value]);
    });
    return Object.fromEntries(entries);
  });
}
/**
 * Reads a CSV file and splits it into its header and data rows.
 *
 * The first line of the file is the header; everything after the first
 * newline is handed to `parseRow`, one record at a time.
 *
 * @param {string} filename - path of the CSV file, read synchronously as UTF-8.
 * @returns {[string[], string[][]]} `[fields, rows]`: the column names and one
 *   array of string values per record.
 * @throws {Error} whatever `fs.readFileSync` throws when the file is unreadable.
 */
function parseCSV(filename) {
  const raw = fs.readFileSync(filename, 'utf-8');
  // Drop a leading byte-order mark so it does not become part of the first
  // column name.
  const contents = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;

  // A file without any newline is header-only. `indexOf` returns -1 there,
  // which must not be used as a slice index.
  const newlineAt = contents.indexOf('\n');
  const headerEnd = newlineAt === -1 ? contents.length : newlineAt;
  const rawHeader = contents.slice(0, headerEnd);
  // Tolerate CRLF line endings on the header line.
  const headerLine = rawHeader.endsWith('\r') ? rawHeader.slice(0, -1) : rawHeader;
  const data = contents.slice(headerEnd + 1);

  const fields = parseHeader(headerLine);
  const numFields = fields.length;
  const rows = [];

  // With zero columns `parseRow` consumes nothing, so the loop below would
  // never terminate.
  if (numFields === 0) {
    return [fields, rows];
  }

  let pos = 0;
  while (pos < data.length) {
    // A record with two or more columns always contains a comma, so an empty
    // line cannot be one. Skip it rather than let it shift every later row.
    // (With a single column an empty line is a valid empty value and is kept.)
    if (numFields > 1 && data[pos] === '\n') {
      pos += 1;
      continue;
    }
    if (numFields > 1 && data[pos] === '\r' && data[pos + 1] === '\n') {
      pos += 2;
      continue;
    }

    const [row, newPos] = parseRow(numFields, [data, pos]);
    rows.push(row);
    pos = newPos;
  }
  return [fields, rows];
}
/**
 * Splits the header line of a CSV file into its column names.
 *
 * @param {string} data - the header line, without its terminating newline.
 * @returns {string[]} the column names in order; empty for an empty line.
 */
function parseHeader(data) {
  // Tolerate a header cut from a CRLF file: the stray "\r" would otherwise
  // end up inside the last column name.
  const line = data.endsWith('\r') ? data.slice(0, -1) : data;

  const names = [];
  let pos = 0;
  while (pos < line.length) {
    const [value, newPos] = parseValue([line, pos]);
    names.push(value);
    pos = newPos;
  }

  // parseValue returns the index just past the delimiter it consumed. Ending
  // exactly at the end of the line on a comma means the header has a trailing
  // comma, i.e. a final column with an empty name that the loop never visited.
  if (line.length > 0 && pos === line.length && line[pos - 1] === ',') {
    names.push('');
  }
  return names;
}
/**
 * Parses one record of up to `numFields` values starting at `pos`.
 *
 * Parsing stops at the end of the record: when the delimiter that closed a
 * field is not a comma (newline or end of input), the remaining columns are
 * filled with empty strings instead of being read from the next record.
 * Fields beyond `numFields` are not consumed; they are left at the returned
 * position.
 *
 * @param {number} numFields - number of columns every returned row has.
 * @param {[string, number]} cursor - `[data, pos]`: the CSV text and the index
 *   at which the record starts.
 * @returns {[string[], number]} `[row, newPos]`: the values and the index at
 *   which the next record starts.
 */
function parseRow(numFields, [data, pos]) {
  const row = [];
  let cursor = pos;

  while (row.length < numFields) {
    const [value, newPos] = parseValue([data, cursor]);
    row.push(value);
    cursor = newPos;

    // parseValue leaves the cursor just past the delimiter it consumed, so
    // data[cursor - 1] is that delimiter (undefined at end of input). Anything
    // other than a comma means this record is finished.
    if (data[cursor - 1] !== ',') {
      break;
    }
  }

  // Short record: pad so every row has the same number of columns.
  while (row.length < numFields) {
    row.push('');
  }
  return [row, cursor];
}
/**
 * Reads a single CSV field from `data`, starting at index `pos`.
 *
 * A field that starts with a double quote runs to its closing quote and may
 * contain commas, newlines and doubled quotes (`""`, read as one literal
 * quote). Any other field runs to the next comma, newline or end of input.
 *
 * @param {[string, number]} cursor - `[data, pos]`: the CSV text and the index
 *   of the first character of the field.
 * @returns {[string, number]} `[value, newPos]`, where `newPos` is the index
 *   just past the delimiter that ended the field (one past `data.length` when
 *   the field ran to the end of input).
 */
function parseValue([data, pos]) {
  // The delimiters are function-local on purpose: the script calls the parser
  // at module load, before a `const` declared this far down the file would be
  // initialised (temporal dead zone).
  const COMMA = ',';
  const DBQUOTE = '"';
  const NL = '\n';
  const CR = '\r';

  if (data[pos] === DBQUOTE) {
    let value = '';
    let cursor = pos + 1; // eat the opening quote
    for (;;) {
      const quoteAt = data.indexOf(DBQUOTE, cursor);
      if (quoteAt === -1) {
        // Unterminated quote: take the rest of the input rather than scan
        // past the end forever.
        return [value + data.slice(cursor), data.length + 1];
      }
      value += data.slice(cursor, quoteAt);
      cursor = quoteAt + 1;
      if (data[cursor] !== DBQUOTE) {
        break; // that was the closing quote
      }
      // Doubled quote inside a quoted field is an escaped literal quote.
      value += DBQUOTE;
      cursor += 1;
    }
    // Eat the delimiter after the closing quote; CRLF counts as one delimiter.
    const delimiterWidth = data[cursor] === CR && data[cursor + 1] === NL ? 2 : 1;
    return [value, cursor + delimiterWidth];
  }

  // Unquoted field (possibly empty): scan to the next comma, newline or end.
  let end = pos;
  while (end < data.length && data[end] !== COMMA && data[end] !== NL) {
    end++;
  }
  // For CRLF line endings the "\r" belongs to the delimiter, not the value.
  const endsWithCrlf = data[end] === NL && end > pos && data[end - 1] === CR;
  const value = data.slice(pos, endsWithCrlf ? end - 1 : end);
  return [value, end + 1]; // eat comma or newline
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment