Skip to content

Instantly share code, notes, and snippets.

@lerouxb
Created November 30, 2022 11:50
Show Gist options
  • Save lerouxb/225e06fe7d9424ddc7a6e185069ef51d to your computer and use it in GitHub Desktop.
Save lerouxb/225e06fe7d9424ddc7a6e185069ef51d to your computer and use it in GitHub Desktop.
Parse & Analyse .csv files
'use strict';
const fs = require('fs');
const Papa = require('papaparse');
const { ObjectId } = require('bson');
function parseObjectId(value) {
// trying to parse just any string seems to be quite slow, so just support hex strings for now
if (value.length !== 24) {
return null;
}
try {
return new ObjectId(value);
}
catch(err) {
return null;
}
}
// papaparse already detects dates
/*
const ISO_DATE_REGEX = /\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d\.\d+([+-][0-2]\d:[0-5]\d|Z)/;
function parseDate(value) {
// JavaScript will still happily do:
// > new Date('something 12')
// 2001-12-01T00:00:00.000Z
if (!value.match(ISO_DATE_REGEX)) {
return null;
}
const date = new Date(value);
if (isNaN(date.getTime())) {
return null;
}
return date;
}
*/
function parseCSV(filename) {
return new Promise((resolve, reject) => {
const file = fs.createReadStream(filename, {
encoding: 'utf8',
});
let total = 0;
const schema = {};
Papa.parse(file, {
header: true,
dynamicTyping: true,
step: function(row) {
//console.log(row);
if (total % 10000 === 0) {
console.log(total);
}
for (const [key, value] of Object.entries(row.data)) {
let fieldType = typeof value;
if (value === null) {
// what do we want to do in this case?
fieldType = 'null';
}
else if (fieldType === 'object') {
if (value.getTime) {
fieldType = 'Date';
}
else {
// TODO: probably an error?
console.log(value, value.toString());
}
}
else if (fieldType === 'number') {
// TODO: should we check which kind of number? We could easily end up with a mixed set.
}
else if (fieldType === 'string') {
let parsed;
if ((parsed = parseObjectId(value)) !== null) {
fieldType = 'ObjectID';
}
//else if ((parsed = parseDate(value)) !== null) {
// console.log(parsed, value);
// fieldType = 'Date';
//}
}
if (schema[key] === undefined) {
schema[key] = {};
}
if (schema[key][fieldType] === undefined) {
schema[key][fieldType] = 0;
}
++schema[key][fieldType];
}
++total;
},
complete: function() {
resolve({ total, schema });
},
error: function (err) {
reject(err);
}
});
});
}
function guessFieldTypes(schema) {
const fields = {};
for (const [key, summary] of Object.entries(schema)) {
if (summary.string !== undefined) {
fields[key] = 'string';
continue;
}
// make sure we don't pick null
const types = Object.keys(summary).filter((type) => type !== 'null');
// take the only type if there is just one
if (types.length === 1) {
fields[key] = types[0];
continue;
}
// go with string if there are multiple options or everything was null
fields[key] = 'string'; // TODO: or mixed?
}
return fields;
}
const filename = process.argv[2];
console.log({filename});
const start = Date.now();
parseCSV(filename)
.then(({ total, schema }) => {
const elapsed = Date.now() - start;
console.log();
console.log('field type report');
console.log(schema);
console.log();
console.log('detected field types');
console.log(guessFieldTypes(schema));
console.log();
console.log(`${elapsed}ms`, `${total} rows`);
})
.catch((err) => {
console.error(err);
process.exit(1);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment