Created
October 3, 2011 00:54
-
-
Save NigelThorne/1258207 to your computer and use it in GitHub Desktop.
I can't find a CSVParser that escapes strings correctly... so here is something that does what I need. Feel free to fork and improve.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using System.Reflection; | |
using System.Text.RegularExpressions; | |
namespace NigelThorne | |
{ | |
/// <summary>
/// Minimal CSV reader. Values are separated by commas and rows by line feeds
/// (a carriage return directly before the line feed is ignored). A value may be
/// wrapped in double quotes, in which case it can contain commas and line breaks,
/// and a backslash escapes the character that follows it (<c>\"</c>, <c>\\</c>).
/// </summary>
public class CSVParser
{
    // A double-quoted value; inside it a backslash escapes the next character.
    private const string QuotedValuePattern = "\"([^\\\\\"]|\\\\.)*\"";

    // A bare value: anything up to the next quote, comma or line feed (may be empty).
    private const string UnquotedValuePattern = "[^\",\n]*";

    // \G pins the match to the scan position. Singleline lets '.' match a line
    // break so that an escaped newline inside quotes is accepted.
    private static readonly Regex ValueRegex = new Regex(
        "\\G(?<val>(" + QuotedValuePattern + "|" + UnquotedValuePattern + "))",
        RegexOptions.Singleline | RegexOptions.Compiled | RegexOptions.ExplicitCapture);

    // Finds backslash escapes inside a quoted value; group "x" is the escaped character.
    private static readonly Regex EscapedCharRegex = new Regex(
        "\\\\(?<x>.)",
        RegexOptions.Singleline | RegexOptions.Compiled);

    /// <summary>
    /// Reads the whole file and maps every CSV row onto a new <typeparamref name="T"/>.
    /// </summary>
    /// <typeparam name="T">Row type; see <see cref="ReadCSVAs{T}(string)"/> for its requirements.</typeparam>
    /// <param name="filename">Path of the CSV file to read.</param>
    /// <returns>One <typeparamref name="T"/> per row, in file order.</returns>
    /// <exception cref="InvalidDataException">The content is not valid CSV or a row is too short.</exception>
    public static IEnumerable<T> ReadCSVFileAs<T>(string filename)
    {
        return ReadCSVAs<T>(File.ReadAllText(filename));
    }

    /// <summary>
    /// Parses CSV text and maps every row onto a new <typeparamref name="T"/> by
    /// assigning the row's values, by position, to the public instance fields
    /// declared directly on <typeparamref name="T"/>. Values beyond the number of
    /// fields are ignored. The header line, if any, is treated like any other row.
    /// </summary>
    /// <typeparam name="T">
    /// A class or struct with a public parameterless constructor whose public
    /// instance fields are of type <see cref="string"/>.
    /// </typeparam>
    /// <param name="text">The CSV content.</param>
    /// <returns>One <typeparamref name="T"/> per row, in input order (empty for empty input).</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="text"/> is null.</exception>
    /// <exception cref="InvalidDataException">
    /// The text is not valid CSV, or a row has fewer values than <typeparamref name="T"/> has fields.
    /// </exception>
    public static IEnumerable<T> ReadCSVAs<T>(string text)
    {
        if (text == null)
        {
            throw new System.ArgumentNullException("text");
        }

        // GetFields makes no promise about ordering, and columns are mapped by
        // position. Sorting by metadata token yields declaration order in
        // practice (NOTE(review): conventional workaround, not a documented guarantee).
        FieldInfo[] fields = typeof(T)
            .GetFields(BindingFlags.Instance | BindingFlags.Public | BindingFlags.DeclaredOnly)
            .OrderBy(f => f.MetadataToken)
            .ToArray();

        // Built eagerly so callers can enumerate the result repeatedly without
        // re-running the reflection work.
        var result = new List<T>();
        int rowNumber = 0;
        foreach (List<string> row in ReadCSVAsListOfLists(text))
        {
            rowNumber++;
            if (row.Count < fields.Length)
            {
                throw new InvalidDataException(
                    "Not a valid csv: row " + rowNumber + " has " + row.Count +
                    " value(s) but " + typeof(T).Name + " declares " + fields.Length + " field(s)");
            }

            // Hold the instance as object so that, when T is a struct, SetValue
            // writes to this one boxed copy instead of a throw-away box.
            object instance = System.Activator.CreateInstance(typeof(T));
            for (int index = 0; index < fields.Length; index++)
            {
                fields[index].SetValue(instance, row[index]);
            }
            result.Add((T)instance);
        }
        return result;
    }

    /// <summary>
    /// Splits CSV text into rows of unescaped values.
    /// </summary>
    /// <param name="text">The CSV content; must not be null.</param>
    /// <returns>
    /// One list per row. Empty input yields no rows, and a line break at the very
    /// end of the input does not start an additional row.
    /// </returns>
    /// <exception cref="InvalidDataException">
    /// A value is followed by something other than a comma, a line break or the end of input
    /// (for example an unterminated quote).
    /// </exception>
    private static IEnumerable<List<string>> ReadCSVAsListOfLists(string text)
    {
        var rows = new List<List<string>>();
        if (text.Length == 0)
        {
            return rows;
        }

        var current = new List<string>();
        rows.Add(current);
        int scanned = 0;

        while (true)
        {
            // Always succeeds at 'scanned': the unquoted alternative can match empty.
            Match match = ValueRegex.Match(text, scanned);
            string val = match.Groups["val"].Value;
            scanned += match.Length;

            bool quoted = val.Length > 0 && val[0] == '"';
            if (quoted)
            {
                // Drop the surrounding quotes, then replace each \c with c.
                val = EscapedCharRegex.Replace(
                    val.Substring(1, val.Length - 2),
                    m => m.Groups["x"].Value);

                // CRLF after a quoted value: step over the carriage return.
                if (scanned + 1 < text.Length && text[scanned] == '\r' && text[scanned + 1] == '\n')
                {
                    scanned++;
                }
            }
            else if (scanned < text.Length && text[scanned] == '\n'
                     && val.Length > 0 && val[val.Length - 1] == '\r')
            {
                // CRLF after an unquoted value: the carriage return was swallowed
                // by the value pattern, so trim it off again.
                val = val.Substring(0, val.Length - 1);
            }

            current.Add(val);

            if (scanned >= text.Length)
            {
                break; // end of input closes the last row
            }

            char separator = text[scanned];
            scanned++;
            if (separator == ',')
            {
                continue; // another value follows on the same row (possibly empty)
            }
            if (separator != '\n')
            {
                throw new InvalidDataException("Not a valid csv");
            }
            if (scanned >= text.Length)
            {
                break; // trailing line break: do not emit a phantom empty row
            }

            current = new List<string>();
            rows.Add(current);
        }

        return rows;
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// NUnit fixture that checks how <c>NigelThorne.CSVParser.ReadCSVAs</c> maps CSV
/// text onto the public fields of a row type.
/// </summary>
[TestFixture]
public class ProgramTests
{
    /// <summary>Four-column row type; each CSV value lands in one field.</summary>
    public class ABCD
    {
        public string A;
        public string B;
        public string C;
        public string D;
    }

    /// <summary>
    /// Parses four rows covering plain values, quoted values, embedded commas,
    /// backslash-escaped quotes and backslashes, empty values, and a quoted
    /// line break, then checks every field of every row.
    /// </summary>
    [Test]
    public void ReadCSVLines_ReturnsADictionaryBasedOnTheColumnNamesAndFileContent()
    {
        const string csv = "a,\"b\",c,d\n1,2,3,4\n\"something, with a comma\",\"something \\\"in\\\" quotes\",\" a \\\\ slash \",\n,,\"\n\",";

        // Materialize once and index into the same array for every row.
        ABCD[] rows = NigelThorne.CSVParser.ReadCSVAs<ABCD>(csv).ToArray();

        Assert.AreEqual(4, rows.Length);
        ExpectRow(rows[0], "a", "b", "c", "d");
        ExpectRow(rows[1], "1", "2", "3", "4");
        ExpectRow(rows[2], "something, with a comma", "something \"in\" quotes", " a \\ slash ", "");
        ExpectRow(rows[3], "", "", "\n", "");
    }

    // Asserts the four fields of one parsed row, in A, B, C, D order.
    private static void ExpectRow(ABCD actual, string a, string b, string c, string d)
    {
        Assert.AreEqual(a, actual.A);
        Assert.AreEqual(b, actual.B);
        Assert.AreEqual(c, actual.C);
        Assert.AreEqual(d, actual.D);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment