Skip to content

Instantly share code, notes, and snippets.

@NigelThorne
Created October 3, 2011 00:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save NigelThorne/1258207 to your computer and use it in GitHub Desktop.
Save NigelThorne/1258207 to your computer and use it in GitHub Desktop.
I can't find a CSVParser that escapes strings correctly... so here is something that does what I need. Feel free to fork and improve.
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text.RegularExpressions;
namespace NigelThorne
{
/// <summary>
/// Small CSV reader that understands double-quoted values in which a backslash
/// escapes the following character (for example <c>\"</c> or <c>\\</c>).
/// Quoted values may contain commas and newlines.
/// </summary>
public class CSVParser
{
    // A double-quoted value; inside the quotes a backslash escapes the next character.
    private const string QuotedString = "\"([^\\\\\"]|\\\\.)*\"";

    // An unquoted value: a (possibly empty) run without quote, comma or newline.
    private const string NonQuotedString = "[^\",\n]*";

    // Built once: compiling a regex on every call is expensive and the patterns never change.
    private static readonly Regex ValueRegex = new Regex(
        string.Format("(?<val>({0}|{1}))", QuotedString, NonQuotedString),
        RegexOptions.Multiline | RegexOptions.Compiled | RegexOptions.ExplicitCapture);

    private static readonly Regex EscapedValueRegex = new Regex(
        "\\\\(?<x>.)",
        RegexOptions.Multiline | RegexOptions.Compiled);

    /// <summary>
    /// Reads the whole file and parses it with <see cref="ReadCSVAs{T}(string)"/>.
    /// </summary>
    /// <typeparam name="T">Type to create for each row.</typeparam>
    /// <param name="filename">Path of the CSV file to read.</param>
    /// <returns>One <typeparamref name="T"/> per row of the file.</returns>
    public static IEnumerable<T> ReadCSVFileAs<T>(string filename)
    {
        return ReadCSVAs<T>(File.ReadAllText(filename));
    }

    /// <summary>
    /// Parses CSV text and creates one <typeparamref name="T"/> per row, assigning the
    /// row's values (as strings) to the public instance fields declared on
    /// <typeparamref name="T"/>, by position.
    /// </summary>
    /// <remarks>
    /// The text is parsed immediately; the instances are created lazily each time the
    /// returned sequence is enumerated. Fields are taken in the order reflection reports
    /// them, which is presumably declaration order (not guaranteed by the reflection API).
    /// Every row must supply at least as many values as <typeparamref name="T"/> has
    /// fields; extra values are ignored.
    /// </remarks>
    /// <typeparam name="T">
    /// Type with a public parameterless constructor (or a value type) whose public
    /// instance fields can be assigned a string.
    /// </typeparam>
    /// <param name="text">The CSV content.</param>
    /// <returns>One <typeparamref name="T"/> per row.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="text"/> is null.</exception>
    /// <exception cref="InvalidDataException">The text is not valid CSV.</exception>
    public static IEnumerable<T> ReadCSVAs<T>(string text)
    {
        var fields = typeof(T).GetFields(BindingFlags.Instance | BindingFlags.Public | BindingFlags.DeclaredOnly);
        return ReadCSVAsListOfLists(text).Select(row =>
        {
            // Keep the instance boxed while the fields are set so that value types
            // are filled in place instead of on a throw-away copy.
            object instance = System.Activator.CreateInstance(typeof(T));
            for (int index = 0; index < fields.Length; index++)
            {
                fields[index].SetValue(instance, row[index]);
            }
            return (T)instance;
        });
    }

    /// <summary>
    /// Splits CSV text into rows of unescaped string values.
    /// </summary>
    /// <remarks>
    /// Rows are separated by <c>\n</c> or <c>\r\n</c>. A line terminator at the very
    /// end of the text does not start another row, and empty text has no rows.
    /// </remarks>
    /// <param name="text">The CSV content.</param>
    /// <returns>The rows, each a list of its values.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="text"/> is null.</exception>
    /// <exception cref="InvalidDataException">
    /// A value is followed by something other than a comma, a line break or the end of the text.
    /// </exception>
    private static IEnumerable<List<string>> ReadCSVAsListOfLists(string text)
    {
        if (text == null)
        {
            throw new System.ArgumentNullException("text");
        }

        var found = new List<List<string>>();
        if (text.Length == 0)
        {
            return found;
        }

        int scanned = 0;
        int row = 0;
        while (true)
        {
            // The unquoted alternative can match the empty string, so a match is
            // expected at every position (including the very end of the text).
            Match match = ValueRegex.Match(text, scanned);
            if (!match.Success || match.Index != scanned)
            {
                throw new InvalidDataException("Not a valid csv");
            }

            string val = match.Groups["val"].Value;
            scanned += match.Length;

            if (val.Length > 0 && val[0] == '"')
            {
                // Drop the surrounding quotes, then replace each backslash escape
                // with the character it protects.
                val = EscapedValueRegex.Replace(val.Substring(1, val.Length - 2), x => x.Groups[1].Value);

                // Treat "\r\n" after a quoted value as a single line break.
                if (scanned + 1 < text.Length && text[scanned] == '\r' && text[scanned + 1] == '\n')
                {
                    scanned++;
                }
            }
            else if (val.Length > 0 && val[val.Length - 1] == '\r'
                     && scanned < text.Length && text[scanned] == '\n')
            {
                // The unquoted pattern swallows the '\r' of a "\r\n" line break; it is
                // part of the terminator, not of the value.
                val = val.Substring(0, val.Length - 1);
            }

            if (found.Count == row)
            {
                found.Add(new List<string>());
            }
            found[row].Add(val);

            if (scanned >= text.Length)
            {
                // End of input closes the final row.
                break;
            }

            char separator = text[scanned];
            if (separator == '\n')
            {
                row++;
                scanned++;
                if (scanned == text.Length)
                {
                    // A trailing line break terminates the last row; it does not
                    // introduce an additional row with a single empty value.
                    break;
                }
            }
            else if (separator == ',')
            {
                scanned++;
            }
            else
            {
                throw new InvalidDataException("Not a valid csv");
            }
        }
        return found;
    }
}
}
/// <summary>
/// Tests for <c>NigelThorne.CSVParser</c>.
/// </summary>
[TestFixture]
public class ProgramTests
{
    /// <summary>
    /// Row type used by the test: four public string fields filled by position.
    /// </summary>
    public class ABCD
    {
        public string A;
        public string B;
        public string C;
        public string D;
    }

    /// <summary>
    /// Parses a four-row document covering plain values, quoted values containing a
    /// comma, escaped quotes, an escaped backslash, a quoted newline and empty
    /// trailing values, and checks every field of every row.
    /// </summary>
    [Test]
    public void ReadCSVLines_ReturnsADictionaryBasedOnTheColumnNamesAndFileContent()
    {
        var rows = NigelThorne.CSVParser
            .ReadCSVAs<ABCD>("a,\"b\",c,d\n1,2,3,4\n\"something, with a comma\",\"something \\\"in\\\" quotes\",\" a \\\\ slash \",\n,,\"\n\",")
            .ToArray();

        Assert.AreEqual(4, rows.Length);

        AssertRow(rows[0], "a", "b", "c", "d");
        AssertRow(rows[1], "1", "2", "3", "4");
        AssertRow(rows[2], "something, with a comma", "something \"in\" quotes", " a \\ slash ", "");
        AssertRow(rows[3], "", "", "\n", "");
    }

    // Checks the four fields of one parsed row, in A, B, C, D order.
    private static void AssertRow(ABCD actual, string a, string b, string c, string d)
    {
        Assert.AreEqual(a, actual.A);
        Assert.AreEqual(b, actual.B);
        Assert.AreEqual(c, actual.C);
        Assert.AreEqual(d, actual.D);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment