Skip to content

Instantly share code, notes, and snippets.

@nathanchicken
Last active March 2, 2016 15:25
Show Gist options
  • Save nathanchicken/a0a3624c6859260e8699 to your computer and use it in GitHub Desktop.
Parses a CSV and outputs successes and failures, based on sanitisation. Currently this is just checking for valid emails. This requires some set up and knowledge of your CSV's columns.
/**
 * --------------------
 * Configuration
 * --------------------
 */
// Path of the source CSV to read.
var sourceFile = "Agents_1000s.csv";
// Where to save rows that pass validation.
var outputFileName = "agents_processed.csv";
// Where to save rows that fail validation (each gains a `reason` column).
var errorsFileName = "agents_errors.csv";
// Options handed to csv-stringify for both output files:
// comma-delimited, every column quoted even when empty, and NO header
// row (flip `header` to true to emit column labels taken from the
// object property names defined below).
var csvOptions = {
delimiter : ',',
header : false,
quoted : true
};
// MAPPING:
// Map each output property to its zero-based column index in the
// source CSV.
//
// EG: output.username = record[7] if username was the 8th column of the csv
var mapping = {
username : 7,
email : 2,
displayName : 0
};
// Default values merged into every output row before the mapped
// columns are copied over them. Could be extended to functions that
// return strings, e.g. a random password generator.
var defaultColumns = {
password : 'defaultPassword'
};
/**
* --------------------
* Modules
* --------------------
*/
var fs = require('fs');
var parse = require('csv').parse;
var stringify = require('csv').stringify;
var colors = require('colors');
var transform = require('stream-transform');
/**
* --------------------
* Code...
* ---------------------
*/
// Work around the two output files not being stringified independently:
// presumably csv-stringify mutates the options object it is given, so
// each call gets its own copy — TODO confirm against the csv module docs.
var csvOptions2 = Object.assign({}, csvOptions);
// NOTE(review): an unused `var input = fs.createReadStream(sourceFile)`
// used to live here; readLines() opens its own stream, so the duplicate
// (and leaked) file handle has been removed.
// Email validation pattern found online: an RFC-822-ish addr-spec
// matcher written with hex escapes for the disallowed character
// classes. Hoisted to module scope so the literal is built once.
var emailRegex = /^([^\x00-\x20\x22\x28\x29\x2c\x2e\x3a-\x3c\x3e\x40\x5b-\x5d\x7f-\xff]+|\x22([^\x0d\x22\x5c\x80-\xff]|\x5c[\x00-\x7f])*\x22)(\x2e([^\x00-\x20\x22\x28\x29\x2c\x2e\x3a-\x3c\x3e\x40\x5b-\x5d\x7f-\xff]+|\x22([^\x0d\x22\x5c\x80-\xff]|\x5c[\x00-\x7f])*\x22))*\x40([^\x00-\x20\x22\x28\x29\x2c\x2e\x3a-\x3c\x3e\x40\x5b-\x5d\x7f-\xff]+|\x5b([^\x0d\x5b-\x5d\x80-\xff]|\x5c[\x00-\x7f])*\x5d)(\x2e([^\x00-\x20\x22\x28\x29\x2c\x2e\x3a-\x3c\x3e\x40\x5b-\x5d\x7f-\xff]+|\x5b([^\x0d\x5b-\x5d\x80-\xff]|\x5c[\x00-\x7f])*\x5d))*$/;

// Returns true when `email` looks like a syntactically valid address.
function isEmail(email){
  return emailRegex.test( email );
}
// Accumulators filled in by transformer():
// rows that pass every check end up here...
var processed = [];
// ...and rows that fail end up here (each gains a `reason` property).
var errors = [];
// Running count of source lines parsed (diagnostic only).
var count = 0;
// Return the first element of `list` whose own properties strictly
// equal every own property of `props`; returns null when no element
// matches (including when `props` has no own properties, matching the
// behaviour of the hand-rolled loop this replaces).
function findWhere(list, props) {
  var wanted = Object.keys(props);
  if (wanted.length === 0) return null;

  for (var i = 0; i < list.length; i++) {
    var candidate = list[i];
    var allMatch = wanted.every(function(key) {
      return candidate.hasOwnProperty(key) && candidate[key] === props[key];
    });
    if (allMatch) return candidate;
  }
  // Nothing matched.
  return null;
}
// Transform one parsed CSV row (`record` is an array of column values)
// into an output object, pushing it onto `processed` on success or onto
// `errors` (with a pipe-delimited `reason` string) on failure.
// `callback` is optional and, when supplied, is invoked node-style with
// (null, obj) so the function can slot into a stream-transform pipeline;
// the existing readLines() caller does not pass one.
var transformer = function( record, callback ) {
  // Start from the defaults, then copy the mapped columns over them.
  var obj = Object.assign({}, defaultColumns);
  var fail = false;
  var reason = '';

  for (var prop in mapping) {
    if (!mapping.hasOwnProperty(prop)) continue;
    obj[prop] = record[ mapping[prop] ];
    // Reject rows whose mapped value duplicates an already-accepted row.
    var checkObj = {};
    checkObj[prop] = obj[prop];
    if ( findWhere( processed, checkObj ) ) {
      fail = true;
      reason += "| " + prop + " exists ";
    }
  }

  // EE needs usernames of at least 4 characters. Guard against the
  // username column being missing entirely (undefined has no .length).
  if ( !obj.username || obj.username.length < 4) {
    fail = true;
    reason += "| Username Too Short ";
  }
  // Hard-coded blacklist entry. (The original reported this as
  // "Username Too Short" — a copy-paste error in the reason string.)
  if ( obj.username === "naturalhighsafaris") {
    fail = true;
    reason += "| Username blacklisted ";
  }
  // Discard rows whose email column is not a valid address.
  if ( !isEmail( obj.email ) ) {
    fail = true;
    reason += "| Email not valid ";
  }
  // Run other sanitisations here...

  if ( fail ) {
    obj.reason = reason;
    errors.push( obj );
  } else {
    processed.push( obj );
  }

  // Support pipeline-style callers without requiring one.
  if (typeof callback === 'function') callback(null, obj);
};
// Read the source CSV one line at a time, run each parsed row through
// transformer(), and write the two output CSVs once every row has been
// handled.
//
// csv.parse() invokes its callback asynchronously, so the readline
// 'close' event can fire while parses are still outstanding; saving
// immediately on 'close' (as the original did) can drop trailing rows.
// We therefore count in-flight parses and only save when the reader is
// closed AND no parse callbacks remain.
var readLines = function( ) {
  var lineReader = require('readline').createInterface({
    input: fs.createReadStream( sourceFile )
  });
  var pendingParses = 0;
  var readerClosed = false;

  // Write both output files. Called exactly once, when all parsing is done.
  function saveResults() {
    stringify(processed, csvOptions, function(err, output){
      if (err) return console.error( ("stringify failed: " + err).red );
      fs.writeFile( outputFileName, output, function(writeErr) {
        if (writeErr) return console.error( ("write failed: " + writeErr).red );
        console.log( ("saved " + processed.length + " records in ").green + outputFileName.green );
      });
    });
    // Save the failures.
    stringify(errors, csvOptions2, function(err, errOutput){
      if (err) return console.error( ("stringify failed: " + err).red );
      fs.writeFile( errorsFileName, errOutput, function(writeErr) {
        if (writeErr) return console.error( ("write failed: " + writeErr).red );
        console.log( ("saved " + errors.length + " records in ").red + errorsFileName.red );
      });
    });
  }

  function maybeFinish() {
    if (readerClosed && pendingParses === 0) saveResults();
  }

  lineReader.on('line', function (line) {
    pendingParses++;
    parse( line, {}, function( err, output ) {
      if (!err && output && output[0]) {
        transformer( output[0] );
        count++;
      }
      pendingParses--;
      maybeFinish();
    });
  });

  lineReader.on('close', function() {
    readerClosed = true;
    maybeFinish();
  });
};
/* run */
// do something here if you wanted to, say, open a member comparison file and run below as a callback.
readLines();
{
"name": "csv",
"version": "1.0.0",
"description": "",
"main": "csv.js",
"dependencies": {
"stream-transform": "^0.1.1",
"csv": "^0.4.6",
"colors": "^1.1.2"
},
"devDependencies": {},
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC"
}
@nathanchicken
Copy link
Author

Previous version did not loop through all the data properly. This might be an issue with piping (which I need to learn more about), but it might also be some sort of issue with the stream just being "data" and not being line at a time. ReadLines worked fine though.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment