Created
January 7, 2022 20:30
-
-
Save steverhall/64c4be94999da82e398f83002b0ea245 to your computer and use it in GitHub Desktop.
Analyze large CSV file and print max size of each column
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Assumes there is a header line in first row of file | |
// Usage: dotnet run <input-csv-file> | |
// Output is table of column names and the maximum size of that column | |
// On Macbook Pro, 56 million rows of a 32 column CSV takes approximately 2.5 minutes | |
using System.Text.RegularExpressions; | |
using System.Text; | |
namespace CsvUtility | |
{ | |
class Program | |
{ | |
// Usage: dotnet run <input csv file> | |
// Examines file and prints out the max size of each column | |
static void Main(string[] args) | |
{ | |
// get file name argument | |
string fileName = args[0]; | |
// open file | |
StreamReader file = new StreamReader(fileName); | |
var headerLine = file.ReadLine(); | |
var header = headerLine?.Split(','); | |
if (header == null) | |
{ | |
Console.WriteLine("No header found"); | |
return; | |
} | |
var colWidths = new int[header.Length]; | |
var rowCount = 0; | |
while (!file.EndOfStream) | |
{ | |
var line = file.ReadLine(); | |
if (line != null) { | |
rowCount++; | |
var values = MySplitCSV(line); | |
var fldCtr = 0; | |
foreach (string value in values) | |
{ | |
if (value.Length > colWidths[fldCtr]) | |
{ | |
colWidths[fldCtr] = value.Length; | |
} | |
fldCtr++; | |
} | |
//print every 100000 rows | |
if (rowCount % 100000 == 0) | |
{ | |
Console.Write(String.Format("\rProcessed {0:#,0} rows ", rowCount)); | |
} | |
} | |
} | |
file.Close(); | |
Console.WriteLine("\n\nIndex Column Name Width"); | |
Console.WriteLine("------------|----------------------------------------------|-------"); | |
for (var i = 0; i < colWidths.Length; i++) | |
{ | |
Console.WriteLine("{0,10} {1,-48}{2,6}", i, header[i], colWidths[i]); | |
} | |
Console.WriteLine("\nTotal rows: " + rowCount); | |
} | |
static Regex csvSplit = new Regex("(?:^|,)(\"(?:[^\"]+|\"\")*\"|[^,]*)", RegexOptions.Compiled); | |
public static string[] SplitCSV(string input) | |
{ | |
List<string> list = new List<string>(); | |
foreach (Match match in csvSplit.Matches(input)) | |
{ | |
var curr = match.Value; | |
if (0 == curr.Length) | |
{ | |
list.Add(""); | |
} | |
list.Add(curr.TrimStart(',')); | |
} | |
return list.ToArray(); | |
} | |
// Regex is slow in SplitCSV() so we use this instead | |
public static string[] MySplitCSV(string input) | |
{ | |
bool quote = false; | |
StringBuilder sb = new StringBuilder(); | |
foreach(char c in input) | |
{ | |
if (c == '\"') { | |
quote = !quote; | |
} | |
if (c == ',' && !quote) { | |
sb.Append('\x01'); | |
} else { | |
sb.Append(c); | |
} | |
} | |
var result = sb.ToString().Split('\x01'); | |
return result; | |
} | |
} | |
} |
Quote handling:
This was written so quoted columns are considered, and that there could be commas inside the quotes (which are ignored).
So, a file containing:
100001,"Kirk, James",Captain
Will process correctly as three columns.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sample output: