public
Created

I can't find a CSVParser that escapes strings correctly... so here is something that does what I need. Feel free to fork and improve.

  • Download Gist
CSVParser.cs
C#
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text.RegularExpressions;
 
namespace NigelThorne
{
/// <summary>
/// Minimal CSV reader that maps each row onto the public instance fields of a type.
/// Values may be unquoted, or double-quoted with backslash escapes (<c>\"</c>, <c>\\</c>);
/// quoted values may contain commas and newlines. Rows end with LF or CRLF.
/// </summary>
public class CSVParser
{
    // A double-quoted value: any run of characters that are neither a backslash nor a quote,
    // or a backslash followed by any single character (the escaped character).
    private const string QuotedValuePattern = "\"([^\\\\\"]|\\\\.)*\"";

    // An unquoted value: stops at a quote, a comma, a line feed, or the CR of a CRLF pair.
    // A carriage return that is NOT followed by a line feed is still part of the value.
    private const string UnquotedValuePattern = "([^\",\r\n]|\r(?!\n))*";

    // Built once instead of on every call: constructing a Compiled regex is expensive.
    // Singleline lets '.' match a newline so a backslash-escaped newline inside quotes is accepted.
    private static readonly Regex ValueRegex = new Regex(
        "(?<val>(" + QuotedValuePattern + "|" + UnquotedValuePattern + "))",
        RegexOptions.Compiled | RegexOptions.ExplicitCapture | RegexOptions.Singleline);

    // Matches a backslash escape inside a quoted value; group "x" is the escaped character.
    private static readonly Regex EscapedCharRegex = new Regex(
        "\\\\(?<x>.)",
        RegexOptions.Compiled | RegexOptions.Singleline);

    /// <summary>
    /// Reads the whole file at <paramref name="filename"/> and parses it with
    /// <see cref="ReadCSVAs{T}(string)"/>.
    /// </summary>
    /// <typeparam name="T">Type whose public instance fields receive the column values.</typeparam>
    /// <param name="filename">Path of the CSV file to read.</param>
    /// <returns>One <typeparamref name="T"/> per CSV row.</returns>
    public static IEnumerable<T> ReadCSVFileAs<T>(string filename)
    {
        return ReadCSVAs<T>(File.ReadAllText(filename));
    }

    /// <summary>
    /// Parses <paramref name="text"/> as CSV and creates one <typeparamref name="T"/> per row,
    /// assigning the n-th value of the row to the n-th public instance field declared on
    /// <typeparamref name="T"/>. Every row is mapped, including the first one (there is no
    /// header handling). Values beyond the number of fields are ignored.
    /// </summary>
    /// <typeparam name="T">
    /// A class with a public parameterless constructor, or a struct. The values assigned are
    /// strings, so the fields must be able to hold a string.
    /// </typeparam>
    /// <param name="text">The CSV content.</param>
    /// <returns>
    /// A lazily evaluated sequence: the text is parsed immediately, but instances are created
    /// during enumeration, so enumerating twice yields two distinct sets of instances.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="text"/> is null.</exception>
    /// <exception cref="InvalidDataException">The text is not valid CSV.</exception>
    /// <exception cref="System.ArgumentOutOfRangeException">
    /// Thrown during enumeration when a row has fewer values than <typeparamref name="T"/> has fields.
    /// </exception>
    public static IEnumerable<T> ReadCSVAs<T>(string text)
    {
        if (text == null)
        {
            throw new System.ArgumentNullException("text");
        }

        // GetFields does not guarantee any particular order. Sorting by metadata token is
        // presumed to restore declaration order for compiler-generated types (verify if T
        // comes from an unusual toolchain).
        FieldInfo[] fields = typeof(T)
            .GetFields(BindingFlags.Instance | BindingFlags.Public | BindingFlags.DeclaredOnly)
            .OrderBy(f => f.MetadataToken)
            .ToArray();

        return ReadCSVAsListOfLists(text).Select(row =>
        {
            // Keep the instance boxed while setting fields so that value types are updated
            // in place rather than on a throw-away copy.
            object instance = System.Activator.CreateInstance(typeof(T));
            for (int index = 0; index < fields.Length; index++)
            {
                fields[index].SetValue(instance, row[index]);
            }
            return (T)instance;
        });
    }

    /// <summary>
    /// Splits <paramref name="text"/> into rows of unescaped string values.
    /// Empty input yields no rows, and a single trailing line terminator does not
    /// produce an extra row; a trailing comma does produce a final empty value.
    /// </summary>
    /// <exception cref="InvalidDataException">
    /// A value is followed by something other than a comma, a line terminator or the end of the text
    /// (for example an unterminated quoted value).
    /// </exception>
    private static IEnumerable<List<string>> ReadCSVAsListOfLists(string text)
    {
        var rows = new List<List<string>>();
        List<string> current = null; // null means "at the start of a row"
        int scanned = 0;

        // "<=" on purpose: after a trailing comma there is still one (empty) value to read
        // at the very end of the text.
        while (scanned <= text.Length)
        {
            // Nothing left at the start of a row: empty input, or the text ended with a newline.
            if (current == null && scanned == text.Length)
            {
                break;
            }

            Match match = ValueRegex.Match(text, scanned);
            if (!match.Success || match.Index != scanned)
            {
                throw new InvalidDataException("Not a valid csv");
            }

            string val = match.Groups["val"].Value;
            if (val.Length > 0 && val[0] == '"')
            {
                // Strip the surrounding quotes and replace each "\c" with "c".
                val = EscapedCharRegex.Replace(val.Substring(1, val.Length - 2), m => m.Groups["x"].Value);
            }

            scanned += match.Length;

            if (current == null)
            {
                current = new List<string>();
                rows.Add(current);
            }
            current.Add(val);

            // Treat CRLF as a single row terminator by stepping over the CR.
            if (scanned + 1 < text.Length && text[scanned] == '\r' && text[scanned + 1] == '\n')
            {
                scanned++;
            }

            if (scanned >= text.Length || text[scanned] == '\n')
            {
                current = null;
            }
            else if (text[scanned] != ',')
            {
                throw new InvalidDataException("Not a valid csv");
            }

            // Step over the separator (comma or line feed).
            scanned++;
        }

        return rows;
    }
}
}
CSVParserTests.cs
C#
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
/// <summary>
/// NUnit tests for <c>NigelThorne.CSVParser</c>.
/// </summary>
[TestFixture]
public class ProgramTests
{
    /// <summary>
    /// Row type for the test: four public string fields that receive the four CSV columns in order.
    /// </summary>
    public class ABCD
    {
        public string A;
        public string B;
        public string C;
        public string D;
    }

    /// <summary>
    /// Parses four rows covering plain values, quoted values, embedded commas,
    /// backslash-escaped quotes and backslashes, empty values, a quoted newline,
    /// and a trailing comma (final empty value).
    /// </summary>
    [Test]
    public void ReadCSVLines_ReturnsADictionaryBasedOnTheColumnNamesAndFileContent()
    {
        // ReadCSVAs returns a deferred query; materialize it once so every assertion
        // looks at the same instances instead of re-running the mapping per access.
        var rows = NigelThorne.CSVParser.ReadCSVAs<ABCD>("a,\"b\",c,d\n1,2,3,4\n\"something, with a comma\",\"something \\\"in\\\" quotes\",\" a \\\\ slash \",\n,,\"\n\",").ToArray();

        Assert.AreEqual(4, rows.Length);

        // Row 1: the first line is mapped like any other row; quotes around "b" are stripped.
        var row1 = rows[0];
        Assert.AreEqual("a", row1.A);
        Assert.AreEqual("b", row1.B);
        Assert.AreEqual("c", row1.C);
        Assert.AreEqual("d", row1.D);

        // Row 2: plain unquoted values.
        var row2 = rows[1];
        Assert.AreEqual("1", row2.A);
        Assert.AreEqual("2", row2.B);
        Assert.AreEqual("3", row2.C);
        Assert.AreEqual("4", row2.D);

        // Row 3: embedded comma, escaped quotes, escaped backslash, empty last value.
        var row3 = rows[2];
        Assert.AreEqual("something, with a comma", row3.A);
        Assert.AreEqual("something \"in\" quotes", row3.B);
        Assert.AreEqual(" a \\ slash ", row3.C);
        Assert.AreEqual("", row3.D);

        // Row 4: empty values, a quoted newline, and a trailing comma at end of input.
        var row4 = rows[3];
        Assert.AreEqual("", row4.A);
        Assert.AreEqual("", row4.B);
        Assert.AreEqual("\n", row4.C);
        Assert.AreEqual("", row4.D);
    }
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.