Skip to content

Instantly share code, notes, and snippets.

@chanibal
Last active August 22, 2018 00:23
Show Gist options
  • Save chanibal/c9d77585f8bcca1ea8894e25352764c5 to your computer and use it in GitHub Desktop.
Save chanibal/c9d77585f8bcca1ea8894e25352764c5 to your computer and use it in GitHub Desktop.
A simple utility for normalizing csv input
#define debuglog
using System;
using System.Diagnostics;
using System.Text;
using Microsoft.VisualStudio.TestTools.UnitTesting;
/// <summary>
/// Normalizes one CSV line to a fully escaped form, for use in database imports.
/// Every field is wrapped in single quotes and every single quote inside a field
/// is doubled; CSV double-quote quoting (including the "" escape) is resolved.
/// </summary>
/// <example>
/// <code>
/// a,b,c             -> 'a','b','c'
/// a,"b,c",d         -> 'a','b,c','d'
/// a,it's smth,c     -> 'a','it''s smth','c'
/// a,"it's",c        -> 'a','it''s','c'
/// a,"say ""hi""",c  -> 'a','say "hi"','c'
/// </code>
/// </example>
/// <remarks>
/// The parser state and the output buffer are instance fields that are reset on
/// every <see cref="Escape"/> call, so one instance can be reused for many lines
/// but must not be shared between threads running concurrently.
/// </remarks>
sealed class CSVEscaper
{
    /// <summary>Parser position relative to the current field.</summary>
    private enum State
    {
        start,    // at the beginning of a field, nothing consumed yet
        intext,   // inside an unquoted field
        inquote,  // inside a double-quoted field
        endquote  // just saw a '"' inside a quoted field: either its end or the first half of ""
    }

    private const char InputQuote = '"';
    private const char FieldQuote = '\'';
    private const char Separator = ',';

    private readonly StringBuilder _output = new StringBuilder();
    private State _st;

    private static bool IsQuote(char c) { return c == InputQuote; }

    private static bool IsSep(char c) { return c == Separator; }

    /// <summary>Appends a character to the output verbatim.</summary>
    private void Output(char c)
    {
#if debuglog
        Log("Outputting " + c);
#endif
        _output.Append(c);
    }

    /// <summary>
    /// Appends a character that belongs to field content, doubling it when it is
    /// the output field quote so the resulting literal stays well-formed.
    /// </summary>
    private void OutputEscaped(char c)
    {
        if (c == FieldQuote)
        {
            Output(FieldQuote);
        }
        Output(c);
    }

    /// <summary>Switches the parser to a new state.</summary>
    private void St(State st)
    {
#if debuglog
        Log("State " + _st + " -> " + st);
#endif
        _st = st;
    }

#if debuglog
    private void Log(string msg)
    {
        Trace.WriteLine(msg);
    }
#endif

    /// <summary>
    /// Converts a single CSV line into a comma-separated list of single-quoted fields.
    /// </summary>
    /// <param name="input">One CSV line; fields may be bare or wrapped in double quotes.</param>
    /// <returns>
    /// The escaped line. An empty input yields a single empty field (<c>''</c>).
    /// A quoted field that is still open at the end of the input is closed as if
    /// the closing quote had been present.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    /// <exception cref="FormatException">
    /// A closing double quote is followed by something other than a separator,
    /// another double quote, or the end of the input.
    /// </exception>
    public string Escape(string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        _output.Clear();
        _st = State.start;
#if debuglog
        Log("Input: " + input);
#endif
        foreach (char c in input)
        {
#if debuglog
            Log("Char: " + c);
#endif
            switch (_st)
            {
                case State.start:
                    if (IsQuote(c))
                    {
                        // quoted field
                        St(State.inquote);
                        Output(FieldQuote);
                    }
                    else if (IsSep(c))
                    {
                        // empty field
                        Output(FieldQuote);
                        Output(FieldQuote);
                        Output(Separator);
                        St(State.start);
                    }
                    else
                    {
                        // unquoted field; its first character may itself be a single quote
                        St(State.intext);
                        Output(FieldQuote);
                        OutputEscaped(c);
                    }
                    break;

                case State.inquote:
                    if (IsQuote(c))
                    {
                        // either the closing quote or the first half of an escaped ""
                        St(State.endquote);
                    }
                    else
                    {
                        OutputEscaped(c);
                    }
                    break;

                case State.endquote:
                    if (IsQuote(c))
                    {
                        // "" inside a quoted field is one literal double quote;
                        // the field continues, so go back to reading quoted content
                        Output(c);
                        St(State.inquote);
                    }
                    else if (IsSep(c))
                    {
                        // the previous quote really closed the field
                        Output(FieldQuote);
                        Output(Separator);
                        St(State.start);
                    }
                    else
                    {
                        throw new FormatException("Illegal quote end sequence");
                    }
                    break;

                case State.intext:
                    if (IsSep(c))
                    {
                        Output(FieldQuote);
                        Output(Separator);
                        St(State.start);
                    }
                    else
                    {
                        OutputEscaped(c);
                    }
                    break;
            }
        }

        if (_st == State.start)
        {
            // ended right after a separator (or on empty input): open the trailing empty field
            Output(FieldQuote);
        }
        Output(FieldQuote); // close the current field
#if debuglog
        Log("Return: " + _output);
#endif
        return _output.ToString();
    }
}
/// <summary>
/// MSTest suite pinning the output of <c>CSVEscaper.Escape</c> for plain,
/// double-quoted, single-quote-containing and empty fields.
/// </summary>
/// <remarks>
/// To run unit tests:
///   $ dotnet new mstest
///   $ dotnet test
/// </remarks>
[TestClass]
public class CSVEscaperUnitTest
{
    // A fresh escaper is created before every test by TestInitialize.
    private CSVEscaper _sut;

    [TestInitialize]
    public void TestInitialize() => _sut = new CSVEscaper();

    // Escapes the raw CSV line and compares the result with the expected literal list.
    private void Check(string expected, string unescaped)
    {
        string actual = _sut.Escape(unescaped);
        Assert.AreEqual(expected, actual);
    }

    [TestMethod]
    public void TestCSV() => Check("'xyz','a','b','c'", "xyz,a,b,c");

    [TestMethod]
    public void TestCSVQuote() => Check("'a','b','c'", "a,\"b\",c");

    [TestMethod]
    public void TestCSVQuoteWithComma() => Check("'a','b,x','c'", "a,\"b,x\",c");

    [TestMethod]
    public void TestCSVSingleQuote() => Check("'a','it''s smth','c'", "a,it's smth,c");

    [TestMethod]
    public void TestCSVEmpty()
    {
        // Bare and double-quoted empty fields must produce the same output.
        const string threeEmptyFields = "'','',''";

        Check(threeEmptyFields, ",,");
        Check(threeEmptyFields, "\"\",\"\",\"\"");
    }

    [TestMethod]
    public void TestCSVEmptyVals()
    {
        Check("'v',''", "v,");        // trailing empty field
        Check("'','v'", ",v");        // leading empty field
        Check("'v','','v'", "v,,v");  // empty field in the middle
        Check("'','v',''", ",v,");    // empty fields on both sides
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment