Skip to content

Instantly share code, notes, and snippets.

@Estecka
Last active September 2, 2020 14:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Estecka/604934df6b6fc441fe934026f665fc59 to your computer and use it in GitHub Desktop.
Save Estecka/604934df6b6fc441fe934026f665fc59 to your computer and use it in GitHub Desktop.
AngryCSVparser.cs
using System.Collections;
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace Estecka {
/// <summary>
/// A parser for those CSV with quoted newlines that tend to mess up other parsers.
/// <br/> Rows are split on '\n' and cells on ','. A separator found between two unescaped
/// double quotes is treated as data, and a backslash escapes the character that follows it.
/// <br/> Newlines are not normalized before parsing; use <see cref="FormatNewlines"/> beforehand
/// if the input may use another newline convention.
/// </summary>
public static class AngryCSVParser {
    // Character that escapes the character following it (string delimiters and escape characters).
    private const char escapeCharacter = '\\';
    // Character that opens and closes a quoted piece of data.
    private const char stringDelimiter = '\"';
    // Character that separates two rows of the csv.
    private const char rowSeparator = '\n';
    // Character that separates two cells of a row.
    private const char cellSeparator = ',';

    // Matches any newline sequence. Alternatives are tried in order, so "\r\n" is always
    // consumed as a single newline before a lone "\r" or "\n" is considered; no lookaround is needed.
    private static readonly Regex newlineRegex = new Regex(
        @"\r\n|\r|\n",
        RegexOptions.CultureInvariant
    );

    /// <summary>
    /// Replaces all newlines in a string with the provided format.
    /// </summary>
    /// <param name="input">The string that needs formatting</param>
    /// <param name="lineFormat">Should be either "\n", "\r", or "\r\n"</param>
    /// <returns>A copy of <paramref name="input"/> where every "\r\n", "\r" and "\n" was replaced with <paramref name="lineFormat"/>.</returns>
    public static string FormatNewlines(string input, string lineFormat){
        return newlineRegex.Replace(input, lineFormat);
    }

    /// <summary>
    /// Parses a string by breaking it into pieces, then gluing back together those that should not have been broken.
    /// <br/> Although it was originally written to break a csv into rows, it turns out the same logic can be used to break rows into cells.
    /// </summary>
    /// <param name="whole">The string to break down.</param>
    /// <param name="separator">The character to use as a separator.</param>
    /// <param name="discardEmpty">If true, empty pieces will be discarded.</param>
    /// <returns>The components of <paramref name="whole"/>, with their string delimiters and escape characters left untouched.</returns>
    private static List<string> ParseComponents(string whole, char separator, bool discardEmpty){
        string[] pieces = whole.Split(separator);
        List<string> components = new List<string>(pieces.Length);

        // When true, it means we're parsing a quoted string data within the csv.
        // As such, separators are ignored and treated as data.
        // This stays true until another non-escaped quote is met.
        bool isInline = false;

        foreach (string piece in pieces){
            if (isInline){
                // The previous separator was discarded, but is actually part of a string data, so we restore it.
                // isInline is false on the first iteration, so there is always a component to append to.
                components[components.Count-1] += separator;
            }
            else {
                // Initializes a new component.
                components.Add(string.Empty);
            }

            // When true, the next character will be escaped.
            // This escapes string delimiters and escape characters, but not separators.
            // This only stays true for one character, and an escape character at the end
            // of the previous piece does not escape anything, hence the reset for each piece.
            bool isEscaped = false;
            foreach (char c in piece){
                if (isEscaped){
                    isEscaped = false;
                }
                else if (c == escapeCharacter){
                    isEscaped = true;
                }
                else if (c == stringDelimiter){
                    isInline = !isInline;
                }
            }

            components[components.Count-1] += piece;
        }

        if (discardEmpty){
            components.RemoveAll(c => string.IsNullOrEmpty(c));
        }
        return components;
    }

    /// <summary>
    /// Breaks a csv into rows, discarding the empty ones.
    /// </summary>
    private static List<string> BreakToRows(string csv){
        return ParseComponents(csv, rowSeparator, discardEmpty:true);
    }

    /// <summary>
    /// Breaks a row into cells, keeping the empty ones.
    /// </summary>
    private static List<string> BreakToCells(string line){
        return ParseComponents(line, cellSeparator, discardEmpty:false);
    }

    /// <summary>
    /// Trims whitespaces and string delimiters from a cell's value.
    /// <br/> Surrounding whitespaces are removed first, then every leading and trailing string delimiter.
    /// Whitespaces located inside the delimiters are preserved, and escape characters are not removed.
    /// </summary>
    private static string CleanCell(string cell){
        return cell
            .Trim()
            .Trim(stringDelimiter);
    }

    /// <summary>
    /// Turn every row from a CSV into a list.
    /// </summary>
    /// <param name="csv">The content of the csv.</param>
    /// <returns>One list of cleaned cell values per non-empty row, in the order of the csv.</returns>
    public static List<List<string>> CsvToLists(string csv){
        List<string> rows = BreakToRows(csv);
        var results = new List<List<string>>(rows.Count);

        foreach (string row in rows){
            List<string> cells = BreakToCells(row);
            for (int i=0; i<cells.Count; i++){
                cells[i] = CleanCell(cells[i]);
            }
            results.Add(cells);
        }
        return results;
    }

    /// <summary>
    /// Turn every row from a CSV into a dictionary.
    /// <br/>The first row is actually discarded, and its values used as keys for every other dictionary.
    /// <br/>Result is undefined for values with duplicate or empty keys.
    /// <br/>Cells that have no matching key in the first row are ignored.
    /// </summary>
    /// <param name="csv">The content of the csv.</param>
    /// <returns>
    /// One dictionary per non-empty row following the first one.
    /// An empty array if the csv contains no non-empty row at all.
    /// </returns>
    public static Dictionary<string, string>[] CsvToHashtables(string csv){
        List<string> rows = BreakToRows(csv);

        // Without a header row there are no keys to work with, and sizing the result
        // with rows.Count-1 would request an array of negative length.
        if (rows.Count == 0){
            return new Dictionary<string, string>[0];
        }

        var results = new Dictionary<string, string>[rows.Count-1];

        List<string> keys = BreakToCells(rows[0]);
        for (int i=0; i<keys.Count; i++){
            keys[i] = CleanCell(keys[i]);
        }

        for (int line=1; line<rows.Count; line++){
            List<string> row = BreakToCells(rows[line]);
            var entry = results[line-1] = new Dictionary<string, string>(row.Count);

            // Stops at the shortest of the two lists: extra cells have no key, extra keys have no value.
            for (int cell=0; cell<row.Count && cell<keys.Count; cell++){
                string label = keys[cell] ?? string.Empty;
                string value = CleanCell(row[cell]);
                entry[label] = value;
            }
        }
        return results;
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment