Skip to content

Instantly share code, notes, and snippets.

@samba
Created August 30, 2012 23:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save samba/3545376 to your computer and use it in GitHub Desktop.
Save samba/3545376 to your computer and use it in GitHub Desktop.
CSV Parser in Javascript
/* CSV parser
* (c) Sam Briesemeister, 2012
*
* Features:
* - Parses CSV through a sequential tokenizer
* - Supports alternative field separator and quote characters
* - Ignores empty lines and comments (lines starting with #)
* - Provides label support through lookup function
*
* Usage:
*
* // sample string
* var mycsv = "one,two\nthree,four\nfive,six";
*
* // simple parser with defaults
* window.CSV(mycsv)[1][1] == 'four';
*
* // standard CSV tokens (defaults, but explicitly given)
* window.CSV(mycsv, ",", "\"")
*
* // CSV parser with labels (the first line is consumed as the column labels)
* window.CSV(mycsv, null, null, true).get('one', 1) == 'five';
*
*
* // TSV with single-quote format
* window.CSV(otherstring, "\t", "'")
*
*/
(function(win, doc){
// in-memory cache for regular expressions (tokenizers).
// Keys are built as separator + NUL + quote so that two different
// (separator, quote) pairs can never produce the same key.
var token_cache = {};
// Backslash-escape every character that has a special meaning in a regular
// expression (inside or outside a character class), so that the text is
// matched literally once it is spliced into a pattern.
function escape_token(text){
    return String(text).replace(/[\\^$.*+?()[\]{}|\/-]/g, '\\$&');
}
/* Retrieve a tokenizer from cache if available, or create/store it.
 *
 * sep  - field separator, matched literally (e.g. "," or "\t" or "|")
 * quot - quote character, matched literally (e.g. "\"" or "'")
 *
 * Returns a global RegExp whose capture groups are:
 *   1: the whole token
 *   2: the content of a quoted field (backslash-escaped quotes are kept as-is)
 *   3: a line break (\n or \r\n)
 *   4: an unquoted field (may be empty, e.g. at the end of the input)
 */
function get_tokenizer(sep, quot){
    var n = sep + '\u0000' + quot;
    if(!token_cache[n]){
        var s = escape_token(sep), q = escape_token(quot);
        // Inside quotes, "[^quot]" already covers line breaks; a separate
        // newline alternative would only make the pattern ambiguous and cause
        // exponential backtracking when a quote is never closed.
        token_cache[n] = new RegExp(
            '(' + q + '((?:\\\\' + q + '|[^' + q + '])*)' + q +
            '|' + s +
            '|(\\r?\\n)' +
            '|([^' + s + '\\r\\n]*))',
            'g'
        );
    }
    return token_cache[n];
}
/* Attach a get(name, row) accessor to the parsed rows.
 *
 * lbl  - array of column labels, or a falsy value when labels are not used
 * rows - array of records (each record is an array of field strings)
 *
 * Returns the same rows array, now carrying a `get` method:
 *   rows.get(name, row)
 *     name - a column label (when labels were given) or a numeric column index
 *     row  - zero-based record index
 *   It returns the field value, or undefined when the row or column is unknown.
 */
function add_methods(lbl, rows){
    // Borrowed from Object.prototype so that a column that happens to be
    // called "hasOwnProperty" cannot shadow the lookup itself.
    var has_own = Object.prototype.hasOwnProperty;
    // prepare label map if needed; walking backwards means that the first
    // occurrence of a duplicated label wins
    var map = lbl ? {} : null, i = lbl && lbl.length;
    if(i) while(i--) map[ lbl[i] ] = i;
    rows['get'] = function(name, row){
        // reject negative, out-of-range and otherwise unknown rows up front
        var record = (row >= 0 && rows.length > row) ? rows[row] : undefined;
        if(!record) return undefined;
        if(map && has_own.call(map, name)){
            return record[ map[name] ];
        }
        // only genuine numbers are accepted as positional column indexes
        if(name === Number(name)){
            return record[name];
        }
        return undefined;
    };
    return rows;
}
/* Process a CSV string.
 *
 * string - the CSV text
 * sep    - field separator
 * quot   - quote character
 * lbl    - when truthy, the first record is removed from the result, stored
 *          as rows.labels and used as the label map for rows.get()
 *
 * Returns an array of records (arrays of field strings) extended with the
 * get(name, row) accessor installed by add_methods.
 *
 * Empty fields are kept as '' so that columns stay aligned ("a,,b" yields
 * three fields). Lines without any field do not produce a record.
 */
function parse_data(string, sep, quot, lbl){
    var tokenizer = get_tokenizer(sep, quot);
    var rows = [ ];
    var record = [ ];  // fields collected for the record being built
    var owed = true;   // true at the start of a record and right after a
                       // separator: the next separator or line end implies
                       // an empty field
    // close the current record and start a new one
    function end_record(){
        // a separator directly before the line end owes one more empty field
        if(owed && record.length) record.push('');
        // lines that produced no field at all are skipped
        if(record.length) rows.push(record);
        record = [ ];
        owed = true;
    }
    // String.replace is used purely as a "for each token" loop; the string it
    // builds is discarded.
    string.replace(tokenizer, function($0, tok, quoted, nl, rest){
        if(quoted || tok === quot + quot){
            // a quoted field (possibly including newlines, etc); the second
            // test recognises the empty quoted field, whose content is ''
            record.push(quoted);
            owed = false;
        } else if(nl){
            // found new line, finish the current record
            end_record();
        } else if(rest){
            // whatever else we found (an unquoted field)
            record.push(rest);
            owed = false;
        } else if(tok === sep){
            // two separators in a row (or a leading one) enclose an empty field
            if(owed) record.push('');
            owed = true;
        }
        // anything else is the empty match the tokenizer can produce, e.g. at
        // the end of the input; it carries no information
        return $0;
    });
    // flush the last record when the input does not end with a line break
    end_record();
    var labels = null;
    if(lbl){
        labels = rows['labels'] = rows.shift();
    }
    return add_methods(labels, rows);
}
// Patterns used by clean_csv:
//   commentlines - a "#" at the very start of the text or right after a line
//                  break, up to (not including) the next line break. The line
//                  break is left in place so that the following line can be
//                  recognised as a comment as well.
//   blanklines   - one or more consecutive line breaks
//   edgebreaks   - a single line break at the very start or end of the text
var blanklines = /(\r?\n)+/g, commentlines = /(^|\n)#[^\n]*/g, edgebreaks = /^\n|\n$/g;
/* Remove comment lines (starting with #) and empty lines.
 *
 * str - the raw CSV text
 *
 * Returns the text with every comment line removed, every run of line breaks
 * collapsed into a single "\n", and no line break at the start or end.
 * The cleanup is purely textual: it also applies inside quoted fields.
 */
function clean_csv(str){
    return str
        .replace(commentlines, '$1')  // drop the comment, keep the preceding break
        .replace(blanklines, '\n')    // collapse blank lines, normalise \r\n
        .replace(edgebreaks, '');     // no empty first/last line
}
/* Export shortcut with sane defaults: CSV(string, sep, quote, labels)
 *
 * string - the CSV text; null/undefined is treated as empty text and any
 *          other non-string value is converted with String()
 * sep    - field separator, defaults to ","
 * quote  - quote character, defaults to "\""
 * labels - when truthy, the first line is used as column labels
 *
 * Returns whatever parse_data returns for the cleaned-up text.
 */
var _export = win['CSV'] = function(string, sep, quote, labels){
    var text = (string === null || string === undefined) ? '' : String(string);
    return parse_data(clean_csv(text), sep || ',', quote || '"', labels);
};
})(window, document);
/*
 * Standalone build of the CSV parser: defines CSV(string, sep, quote, labels)
 * on the global object (window in browsers, globalThis elsewhere).
 *
 * string - the CSV text; null/undefined is treated as empty text
 * sep    - field separator, defaults to ","
 * quote  - quote character, defaults to "\""
 * labels - when truthy, the first record is removed from the result, stored
 *          as rows.labels and used as the label map for rows.get()
 *
 * Returns an array of records (arrays of field strings) with an extra
 * get(name, row) method; name is a column label or a numeric column index.
 * Comment lines (starting with #) and empty lines are ignored, empty fields
 * are kept as ''.
 */
(function(root){
    var has_own = Object.prototype.hasOwnProperty;
    // compiled tokenizers, keyed by separator + NUL + quote
    var tokenizers = {};
    // "#" at the start of a line up to (not including) the next line break
    var comment_lines = /(^|\n)#[^\n]*/g;
    var newline_runs = /(\r?\n)+/g;
    var edge_newlines = /^\n|\n$/g;

    // Backslash-escape regex metacharacters so the text is matched literally.
    function escape_token(text){
        return String(text).replace(/[\\^$.*+?()[\]{}|\/-]/g, '\\$&');
    }

    // Build (or fetch) the tokenizer. Groups: 1 whole token, 2 quoted content,
    // 3 line break, 4 unquoted field.
    function tokenizer_for(sep, quot){
        var key = sep + '\u0000' + quot;
        if(!tokenizers[key]){
            var s = escape_token(sep), q = escape_token(quot);
            tokenizers[key] = new RegExp(
                '(' + q + '((?:\\\\' + q + '|[^' + q + '])*)' + q +
                '|' + s +
                '|(\\r?\\n)' +
                '|([^' + s + '\\r\\n]*))',
                'g'
            );
        }
        return tokenizers[key];
    }

    // Install rows.get(name, row); returns undefined for unknown rows/columns.
    function attach_get(labels, rows){
        var map = labels ? {} : null, i = labels && labels.length;
        // walking backwards lets the first occurrence of a duplicate label win
        if(i) while(i--) map[ labels[i] ] = i;
        rows.get = function(name, row){
            var record = (row >= 0 && rows.length > row) ? rows[row] : undefined;
            if(!record) return undefined;
            if(map && has_own.call(map, name)) return record[ map[name] ];
            if(name === Number(name)) return record[name];
            return undefined;
        };
        return rows;
    }

    root.CSV = function(string, sep, quote, labels){
        var separator = sep || ',';
        var quot = quote || '"';
        var text = (string === null || string === undefined) ? '' : String(string);
        // strip comments, collapse blank lines, drop leading/trailing breaks
        text = text.replace(comment_lines, '$1')
                   .replace(newline_runs, '\n')
                   .replace(edge_newlines, '');

        var rows = [];
        var record = [];
        // true at record start and right after a separator: the next separator
        // or line end implies an empty field
        var owed = true;

        function end_record(){
            if(owed && record.length) record.push('');
            if(record.length) rows.push(record);
            record = [];
            owed = true;
        }

        // String.replace serves as a "for each token" loop; its result is unused.
        text.replace(tokenizer_for(separator, quot), function(match, tok, quoted, nl, rest){
            if(quoted || tok === quot + quot){
                // quoted field; the second test catches the empty one
                record.push(quoted);
                owed = false;
            } else if(nl){
                end_record();
            } else if(rest){
                record.push(rest);
                owed = false;
            } else if(tok === separator){
                if(owed) record.push('');
                owed = true;
            }
            return match;
        });
        end_record();

        var header = null;
        if(labels){
            header = rows.labels = rows.shift();
        }
        return attach_get(header, rows);
    };
}(typeof window !== 'undefined' ? window : globalThis));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment