bennadel/CsvParser.cfc

## CsvParser.cfc
component
	output = false
	hint = "I provide methods for parsing CSV (Comma Separated Values) inputs and files."
	{

	this.COMMA = ",";
	this.TAB = chr( 9 );
	// If this delimiter is passed-in, the parser will examine a portion of the input and
	// try to determine, roughly, which delimiter is being used in the data.
	this.GUESS_DELIMITER = "";

	// ---
	// PUBLIC METHODS.
	// ---

	/**
	* I parse the given CSV input value using the given delimiter. The results are
	* returned as a set of nested arrays. The rows are ASSUMED to be delimited by new
	* line and / or row return characters.
	*
	* @input I am the CSV data being parsed.
	* @delimiter I am the field delimiter.
	*/
	public array function parseCsv(
		required string input,
		string delimiter = this.GUESS_DELIMITER
		) {

		input = removeTrailingLines( input );

		// Short-Circuit: It's not really possible to differentiate an empty input from
		// an input that has an empty row. As such, we're just going to make a judgment
		// call that an empty input has no records.
		if ( ! input.len() ) {

			return( [] );

		}

		if ( delimiter == this.GUESS_DELIMITER ) {

			delimiter = getBestDelimiter( input );

		}

		testDelimiter( delimiter );

		var matcher = getPatternMatcher( input, delimiter );
		var QUOTED_FIELD_GROUP = javaCast( "int", 1 );
		var UNQUOTED_FIELD_GROUP = javaCast( "int", 2 );
		var DELIMITER_GROUP = javaCast( "int", 3 );
		var ESCAPED_QUOTE = """""";
		var UNESCAPTED_QUOTE = """";

		var rows = [];
		var row = [];

		while ( matcher.find() ) {

			// Our pattern matches quoted and unquoted fields in different capturing
			// groups. As such, the only way we can determine which type of field was
			// captured in this match is to see which one of the groups is NULL.
			var quotedField = matcher.group( QUOTED_FIELD_GROUP );

			// If the quotedField variable exists, it means that we captured a quoted
			// field. And, if it's null / undefined, it means that we captured an
			// unquoted field.
			if ( local.keyExists( "quotedField" ) ) {

				row.append( quotedField.replace( ESCAPED_QUOTE, UNESCAPTED_QUOTE, "all" ) );

			} else {

				row.append( matcher.group( UNQUOTED_FIELD_GROUP ) );

			}

			var capturedDelimiter = matcher.group( DELIMITER_GROUP );

			// If our captured delimiter has a length, it was either a field delimiter
			// or a row delimiter.
			if ( capturedDelimiter.len() ) {

				if ( capturedDelimiter == delimiter ) {

					// In the case of a field delimiter, there's nothing to do - the
					// matcher will just move onto the next field.

				} else {

					// In the case of a row delimiter, we need to gather up the current
					// row in the results and then start the next row.
					rows.append( row );
					row = [];

				}

			// If our captured delimiter has no length, it means that it matched the end
			// of the CSV input, which is also the end of the current row. We need to
			// gather up the current row in the results; but, we don't need to bother
			// starting a new row - there will be no more (meaningful) matches.
			} else {

				rows.append( row );
				// In order to prevent an extra empty row from being appended to the
				// results, we have to explicitly break out of the loop.
				break;

			}

		} // END: While.

		// The CSV input is expected to be in the format of FIELD followed by DELIMITER.
		// However, if the user passed-in a delimiter that does not match the delimiter
		// in the actual data, the pattern matcher will never match. In such a case, we
		// need to let the user know that this was unexpected.
		if ( ! rows.len() ) {

			throw(
				type = "CsvParser.UnexpectedEmptyResults",
				message = "Results should not be empty.",
				detail = "This can happen if you use an incompatible delimiter."
			);

		}

		return( rows );

	}


	/**
	* I parse the given CSV files using the given delimiter. The results are returned as
	* a set of nested arrays. The rows are ASSUMED to be delimited by new line and / or
	* row return characters.
	*
	* @inputFile I am the CSV file being parsed.
	* @delimiter I am the field delimiter.
	*/
	public array function parseCsvFile(
		required string inputFile,
		string delimiter = this.GUESS_DELIMITER
		) {

		return( parseCsv( fileRead( inputFile ), delimiter ) );

	}

	// ---
	// PRIVATE METHODS.
	// ---

	/**
	* I examine the given CSV input and determine which delimiter (Comma or Tab) to use.
	* This algorithm is naive and simply looks at which one MAY create more fields.
	*
	* @input I am the CSV input being examined.
	*/
	private string function getBestDelimiter( required string input ) {

		var prefix = input.left( 1000 );
		var commaCount = prefix.listLen( this.COMMA );
		var tabCount = prefix.listLen( this.TAB );

		if ( commaCount >= tabCount ) {

			return( this.COMMA );

		} else {

			return( this.TAB );

		}

	}


	/**
	* I return a Java Pattern Matcher for the given CSV input and field delimiter.
	*
	* @input I am the CSV data being parsed.
	* @delimiter I am the field delimiter.
	*/
	private any function getPatternMatcher(
		required string input,
		required string delimiter
		) {

		var escapedDelimiter = delimiter.reFind( "\W" )
			? "\#delimiter#"
			: delimiter
		;

		var patternParts = [
			// VERBOSE FLAGE: So that we can add white-space characters in the various
			// parts (for readability) and they won't affect the RegEx pattern mechanics.
			"(?x)",

			// Make sure that the next match picks up exactly where the last match left
			// off. In other words, we only want matches that consume a continuous set of
			// characters.
			"\G",

			// As we match tokens within the CSV input, we're always going to match a
			// FIELD value followed by a DELIMITER (or the end of the file).
			// --
			// NOTE: The "*+" notation is a "Possessive Qualifier", which means that it
			// does not allow the pattern matcher to backtrack if it doesn't find a
			// match. This forces all patterns to be a continuous set of characters (and
			// has a performance benefit as the matcher does not have to remember any
			// backtracking positions).
			"(?:",

				// CAPTURE GROUP 1: A quoted field value.
				'"( [^"]*+ (?>""[^"]*+)* )"',

				"|",

				// CAPTURE GROUP 2: An unquoted field value.
				// --
				// NOTE: I originally had double-quotes in the following character-set;
				// but, I am not sure if I really need it there. Removing it makes the
				// matching ever-so-slightly more flexible.
				'( [^#escapedDelimiter#\r\n]*+ )',

			")",

			// CAPTURE GROUP 3: The delimiter.
			"(
				#escapedDelimiter# |
				\r\n? |
				\n |
				$
			)"
		];

		var patternText = patternParts.toList( "" );

		var matcher = createObject( "java", "java.util.regex.Pattern" )
			.compile( javaCast( "string", patternText ) )
			.matcher( javaCast( "string", input ) )
		;

		return( matcher );

	}


	/**
	* I remove any trailing empty lines.
	*
	* @input I am the CSV data being parsed.
	*/
	private string function removeTrailingLines( required string input ) {

		return( input.reReplace( "[\r\n]+$", "" ) );

	}


	/**
	* I assert that the given delimiter can be used by this parser. If it can, this
	* method exits quietly; if not, it throws an error.
	*
	* @delimiter I am the delimiter being tested.
	*/
	private void function testDelimiter( required string delimiter ) {

		if ( delimiter.len() != 1 ) {

			throw(
				type = "CsvParser.InvalidDelimiter.Length",
				message = "Field delimiter must be one character.",
				detail = "The field delimiter [#delimiter#] is not supported."
			);

		}

		if (
			( delimiter == chr( 13 ) ) ||
			( delimiter == chr( 10 ) )
			) {

			throw(
				type = "CsvParser.InvalidDelimiter.RowDelimiter",
				message = "Field delimiter matches row delimiter.",
				detail = "The field delimiter [#delimiter#] cannot be the same as the implicit row delimiter."
			);

		}

	}

}

## sample.csv
ID,Value,Feature Being Tested
1,hello,A raw value
2,"world",A qualified value
3,"jib,jab","A qualified value with "","" embedded"
4,ben "jamin" nadel,A raw field with embedded quotes
5,,An empty raw field
6,"",An empty qualified field

7,after empty,Making sure the empty line above worked

## snippet-1.cfm
<cfscript>

	/**
	* I return a Java Pattern Matcher for the given CSV input and field delimiter.
	*
	* @input I am the CSV data being parsed.
	* @delimiter I am the field delimiter.
	*/
	private any function getPatternMatcher(
		required string input,
		required string delimiter
		) {

		var escapedDelimiter = delimiter.reFind( "\W" )
			? "\#delimiter#"
			: delimiter
		;

		var patternParts = [
			// VERBOSE FLAGE: So that we can add white-space characters in the various
			// parts (for readability) and they won't affect the RegEx pattern mechanics.
			"(?x)",

			// Make sure that the next match picks up exactly where the last match left
			// off. In other words, we only want matches that consume a continuous set of
			// characters.
			"\G",

			// As we match tokens within the CSV input, we're always going to match a
			// FIELD value followed by a DELIMITER (or the end of the file).
			// --
			// NOTE: The "*+" notation is a "Possessive Qualifier", which means that it
			// does not allow the pattern matcher to backtrack if it doesn't find a
			// match. This forces all patterns to be a continuous set of characters (and
			// has a performance benefit as the matcher does not have to remember any
			// backtracking positions).
			"(?:",

				// CAPTURE GROUP 1: A quoted field value.
				'"( [^"]*+ (?>""[^"]*+)* )"',

				"|",

				// CAPTURE GROUP 2: An unquoted field value.
				// --
				// NOTE: I originally had double-quotes in the following character-set;
				// but, I am not sure if I really need it there. Removing it makes the
				// matching ever-so-slightly more flexible.
				'( [^#escapedDelimiter#\r\n]*+ )',

			")",

			// CAPTURE GROUP 3: The delimiter.
			"(
				#escapedDelimiter# |
				\r\n? |
				\n |
				$
			)"
		];

		var patternText = patternParts.toList( "" );

		var matcher = createObject( "java", "java.util.regex.Pattern" )
			.compile( javaCast( "string", patternText ) )
			.matcher( javaCast( "string", input ) )
		;

		return( matcher );

	}

</cfscript>

## test.cfm
<cfscript>

	parser = new CsvParser();

	dump( parser.parseCsvFile( "./sample.csv" ) );

</cfscript>
	component
	output = false
	hint = "I provide methods for parsing CSV (Comma Separated Values) inputs and files."
	{

	this.COMMA = ",";
	this.TAB = chr( 9 );
	// If this delimiter is passed-in, the parser will examine a portion of the input and
	// try to determine, roughly, which delimiter is being used in the data.
	this.GUESS_DELIMITER = "";

	// ---
	// PUBLIC METHODS.
	// ---

	/**
	* I parse the given CSV input value using the given delimiter. The results are
	* returned as a set of nested arrays. The rows are ASSUMED to be delimited by new
	* line and / or row return characters.
	*
	* @input I am the CSV data being parsed.
	* @delimiter I am the field delimiter.
	*/
	public array function parseCsv(
	required string input,
	string delimiter = this.GUESS_DELIMITER
	) {

	input = removeTrailingLines( input );

	// Short-Circuit: It's not really possible to differentiate an empty input from
	// an input that has an empty row. As such, we're just going to make a judgment
	// call that an empty input has no records.
	if ( ! input.len() ) {

	return( [] );

	}

	if ( delimiter == this.GUESS_DELIMITER ) {

	delimiter = getBestDelimiter( input );

	}

	testDelimiter( delimiter );

	var matcher = getPatternMatcher( input, delimiter );
	var QUOTED_FIELD_GROUP = javaCast( "int", 1 );
	var UNQUOTED_FIELD_GROUP = javaCast( "int", 2 );
	var DELIMITER_GROUP = javaCast( "int", 3 );
	var ESCAPED_QUOTE = """""";
	var UNESCAPTED_QUOTE = """";

	var rows = [];
	var row = [];

	while ( matcher.find() ) {

	// Our pattern matches quoted and unquoted fields in different capturing
	// groups. As such, the only way we can determine which type of field was
	// captured in this match is to see which one of the groups is NULL.
	var quotedField = matcher.group( QUOTED_FIELD_GROUP );

	// If the quotedField variable exists, it means that we captured a quoted
	// field. And, if it's null / undefined, it means that we captured an
	// unquoted field.
	if ( local.keyExists( "quotedField" ) ) {

	row.append( quotedField.replace( ESCAPED_QUOTE, UNESCAPTED_QUOTE, "all" ) );

	} else {

	row.append( matcher.group( UNQUOTED_FIELD_GROUP ) );

	}

	var capturedDelimiter = matcher.group( DELIMITER_GROUP );

	// If our captured delimiter has a length, it was either a field delimiter
	// or a row delimiter.
	if ( capturedDelimiter.len() ) {

	if ( capturedDelimiter == delimiter ) {

	// In the case of a field delimiter, there's nothing to do - the
	// matcher will just move onto the next field.

	} else {

	// In the case of a row delimiter, we need to gather up the current
	// row in the results and then start the next row.
	rows.append( row );
	row = [];

	}

	// If our captured delimiter has no length, it means that it matched the end
	// of the CSV input, which is also the end of the current row. We need to
	// gather up the current row in the results; but, we don't need to bother
	// starting a new row - there will be no more (meaningful) matches.
	} else {

	rows.append( row );
	// In order to prevent an extra empty row from being appended to the
	// results, we have to explicitly break out of the loop.
	break;

	}

	} // END: While.

	// The CSV input is expected to be in the format of FIELD followed by DELIMITER.
	// However, if the user passed-in a delimiter that does not match the delimiter
	// in the actual data, the pattern matcher will never match. In such a case, we
	// need to let the user know that this was unexpected.
	if ( ! rows.len() ) {

	throw(
	type = "CsvParser.UnexpectedEmptyResults",
	message = "Results should not be empty.",
	detail = "This can happen if you use an incompatible delimiter."
	);

	}

	return( rows );

	}


	/**
	* I parse the given CSV files using the given delimiter. The results are returned as
	* a set of nested arrays. The rows are ASSUMED to be delimited by new line and / or
	* row return characters.
	*
	* @inputFile I am the CSV file being parsed.
	* @delimiter I am the field delimiter.
	*/
	public array function parseCsvFile(
	required string inputFile,
	string delimiter = this.GUESS_DELIMITER
	) {

	return( parseCsv( fileRead( inputFile ), delimiter ) );

	}

	// ---
	// PRIVATE METHODS.
	// ---

	/**
	* I examine the given CSV input and determine which delimiter (Comma or Tab) to use.
	* This algorithm is naive and simply looks at which one MAY create more fields.
	*
	* @input I am the CSV input being examined.
	*/
	private string function getBestDelimiter( required string input ) {

	var prefix = input.left( 1000 );
	var commaCount = prefix.listLen( this.COMMA );
	var tabCount = prefix.listLen( this.TAB );

	if ( commaCount >= tabCount ) {

	return( this.COMMA );

	} else {

	return( this.TAB );

	}

	}


	/**
	* I return a Java Pattern Matcher for the given CSV input and field delimiter.
	*
	* @input I am the CSV data being parsed.
	* @delimiter I am the field delimiter.
	*/
	private any function getPatternMatcher(
	required string input,
	required string delimiter
	) {

	var escapedDelimiter = delimiter.reFind( "\W" )
	? "\#delimiter#"
	: delimiter
	;

	var patternParts = [
	// VERBOSE FLAGE: So that we can add white-space characters in the various
	// parts (for readability) and they won't affect the RegEx pattern mechanics.
	"(?x)",

	// Make sure that the next match picks up exactly where the last match left
	// off. In other words, we only want matches that consume a continuous set of
	// characters.
	"\G",

	// As we match tokens within the CSV input, we're always going to match a
	// FIELD value followed by a DELIMITER (or the end of the file).
	// --
	// NOTE: The "*+" notation is a "Possessive Qualifier", which means that it
	// does not allow the pattern matcher to backtrack if it doesn't find a
	// match. This forces all patterns to be a continuous set of characters (and
	// has a performance benefit as the matcher does not have to remember any
	// backtracking positions).
	"(?:",

	// CAPTURE GROUP 1: A quoted field value.
	'"( [^"]+ (?>""[^"]+)* )"',

	"\|",

	// CAPTURE GROUP 2: An unquoted field value.
	// --
	// NOTE: I originally had double-quotes in the following character-set;
	// but, I am not sure if I really need it there. Removing it makes the
	// matching ever-so-slightly more flexible.
	'( [^#escapedDelimiter#\r\n]*+ )',

	")",

	// CAPTURE GROUP 3: The delimiter.
	"(
	#escapedDelimiter# \|
	\r\n? \|
	\n \|
	$
	)"
	];

	var patternText = patternParts.toList( "" );

	var matcher = createObject( "java", "java.util.regex.Pattern" )
	.compile( javaCast( "string", patternText ) )
	.matcher( javaCast( "string", input ) )
	;

	return( matcher );

	}


	/**
	* I remove any trailing empty lines.
	*
	* @input I am the CSV data being parsed.
	*/
	private string function removeTrailingLines( required string input ) {

	return( input.reReplace( "[\r\n]+$", "" ) );

	}


	/**
	* I assert that the given delimiter can be used by this parser. If it can, this
	* method exits quietly; if not, it throws an error.
	*
	* @delimiter I am the delimiter being tested.
	*/
	private void function testDelimiter( required string delimiter ) {

	if ( delimiter.len() != 1 ) {

	throw(
	type = "CsvParser.InvalidDelimiter.Length",
	message = "Field delimiter must be one character.",
	detail = "The field delimiter [#delimiter#] is not supported."
	);

	}

	if (
	( delimiter == chr( 13 ) ) \|\|
	( delimiter == chr( 10 ) )
	) {

	throw(
	type = "CsvParser.InvalidDelimiter.RowDelimiter",
	message = "Field delimiter matches row delimiter.",
	detail = "The field delimiter [#delimiter#] cannot be the same as the implicit row delimiter."
	);

	}

	}

	}
	ID,Value,Feature Being Tested
	1,hello,A raw value
	2,"world",A qualified value
	3,"jib,jab","A qualified value with "","" embedded"
	4,ben "jamin" nadel,A raw field with embedded quotes
	5,,An empty raw field
	6,"",An empty qualified field

	7,after empty,Making sure the empty line above worked
	<cfscript>

	/**
	* I return a Java Pattern Matcher for the given CSV input and field delimiter.
	*
	* @input I am the CSV data being parsed.
	* @delimiter I am the field delimiter.
	*/
	private any function getPatternMatcher(
	required string input,
	required string delimiter
	) {

	var escapedDelimiter = delimiter.reFind( "\W" )
	? "\#delimiter#"
	: delimiter
	;

	var patternParts = [
	// VERBOSE FLAGE: So that we can add white-space characters in the various
	// parts (for readability) and they won't affect the RegEx pattern mechanics.
	"(?x)",

	// Make sure that the next match picks up exactly where the last match left
	// off. In other words, we only want matches that consume a continuous set of
	// characters.
	"\G",

	// As we match tokens within the CSV input, we're always going to match a
	// FIELD value followed by a DELIMITER (or the end of the file).
	// --
	// NOTE: The "*+" notation is a "Possessive Qualifier", which means that it
	// does not allow the pattern matcher to backtrack if it doesn't find a
	// match. This forces all patterns to be a continuous set of characters (and
	// has a performance benefit as the matcher does not have to remember any
	// backtracking positions).
	"(?:",

	// CAPTURE GROUP 1: A quoted field value.
	'"( [^"]+ (?>""[^"]+)* )"',

	"\|",

	// CAPTURE GROUP 2: An unquoted field value.
	// --
	// NOTE: I originally had double-quotes in the following character-set;
	// but, I am not sure if I really need it there. Removing it makes the
	// matching ever-so-slightly more flexible.
	'( [^#escapedDelimiter#\r\n]*+ )',

	")",

	// CAPTURE GROUP 3: The delimiter.
	"(
	#escapedDelimiter# \|
	\r\n? \|
	\n \|
	$
	)"
	];

	var patternText = patternParts.toList( "" );

	var matcher = createObject( "java", "java.util.regex.Pattern" )
	.compile( javaCast( "string", patternText ) )
	.matcher( javaCast( "string", input ) )
	;

	return( matcher );

	}

	</cfscript>
	<cfscript>

	parser = new CsvParser();

	dump( parser.parseCsvFile( "./sample.csv" ) );

	</cfscript>