Skip to content

Instantly share code, notes, and snippets.

@steverhall
Created January 7, 2022 20:30
Show Gist options
  • Save steverhall/64c4be94999da82e398f83002b0ea245 to your computer and use it in GitHub Desktop.
Save steverhall/64c4be94999da82e398f83002b0ea245 to your computer and use it in GitHub Desktop.
Analyze large CSV file and print max size of each column
// Assumes there is a header line in first row of file
// Usage: dotnet run <input-csv-file>
// Output is table of column names and the maximum size of that column
// On Macbook Pro, 56 million rows of a 32 column CSV takes approximately 2.5 minutes
using System.Text.RegularExpressions;
using System.Text;
namespace CsvUtility
{
class Program
{
// Usage: dotnet run <input csv file>
// Examines file and prints out the max size of each column
static void Main(string[] args)
{
// get file name argument
string fileName = args[0];
// open file
StreamReader file = new StreamReader(fileName);
var headerLine = file.ReadLine();
var header = headerLine?.Split(',');
if (header == null)
{
Console.WriteLine("No header found");
return;
}
var colWidths = new int[header.Length];
var rowCount = 0;
while (!file.EndOfStream)
{
var line = file.ReadLine();
if (line != null) {
rowCount++;
var values = MySplitCSV(line);
var fldCtr = 0;
foreach (string value in values)
{
if (value.Length > colWidths[fldCtr])
{
colWidths[fldCtr] = value.Length;
}
fldCtr++;
}
//print every 100000 rows
if (rowCount % 100000 == 0)
{
Console.Write(String.Format("\rProcessed {0:#,0} rows ", rowCount));
}
}
}
file.Close();
Console.WriteLine("\n\nIndex Column Name Width");
Console.WriteLine("------------|----------------------------------------------|-------");
for (var i = 0; i < colWidths.Length; i++)
{
Console.WriteLine("{0,10} {1,-48}{2,6}", i, header[i], colWidths[i]);
}
Console.WriteLine("\nTotal rows: " + rowCount);
}
static Regex csvSplit = new Regex("(?:^|,)(\"(?:[^\"]+|\"\")*\"|[^,]*)", RegexOptions.Compiled);
public static string[] SplitCSV(string input)
{
List<string> list = new List<string>();
foreach (Match match in csvSplit.Matches(input))
{
var curr = match.Value;
if (0 == curr.Length)
{
list.Add("");
}
list.Add(curr.TrimStart(','));
}
return list.ToArray();
}
// Regex is slow in SplitCSV() so we use this instead
public static string[] MySplitCSV(string input)
{
bool quote = false;
StringBuilder sb = new StringBuilder();
foreach(char c in input)
{
if (c == '\"') {
quote = !quote;
}
if (c == ',' && !quote) {
sb.Append('\x01');
} else {
sb.Append(c);
}
}
var result = sb.ToString().Split('\x01');
return result;
}
}
}
@steverhall
Copy link
Author

Quote handling:

This was written so quoted columns are considered, and that there could be commas inside the quotes (which are ignored).

So, a file containing:

100001,"Kirk, James",Captain

Will process correctly as three columns.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment