Skip to content

Instantly share code, notes, and snippets.

@steverhall
Created January 7, 2022 20:30
Show Gist options
  • Save steverhall/64c4be94999da82e398f83002b0ea245 to your computer and use it in GitHub Desktop.
Save steverhall/64c4be94999da82e398f83002b0ea245 to your computer and use it in GitHub Desktop.
Analyze large CSV file and print max size of each column
// Assumes there is a header line in first row of file
// Usage: dotnet run <input-csv-file>
// Output is table of column names and the maximum size of that column
// On Macbook Pro, 56 million rows of a 32 column CSV takes approximately 2.5 minutes
using System.Text.RegularExpressions;
using System.Text;
namespace CsvUtility
{
class Program
{
// Usage: dotnet run <input csv file>
// Examines file and prints out the max size of each column
static void Main(string[] args)
{
// get file name argument
string fileName = args[0];
// open file
StreamReader file = new StreamReader(fileName);
var headerLine = file.ReadLine();
var header = headerLine?.Split(',');
if (header == null)
{
Console.WriteLine("No header found");
return;
}
var colWidths = new int[header.Length];
var rowCount = 0;
while (!file.EndOfStream)
{
var line = file.ReadLine();
if (line != null) {
rowCount++;
var values = MySplitCSV(line);
var fldCtr = 0;
foreach (string value in values)
{
if (value.Length > colWidths[fldCtr])
{
colWidths[fldCtr] = value.Length;
}
fldCtr++;
}
//print every 100000 rows
if (rowCount % 100000 == 0)
{
Console.Write(String.Format("\rProcessed {0:#,0} rows ", rowCount));
}
}
}
file.Close();
Console.WriteLine("\n\nIndex Column Name Width");
Console.WriteLine("------------|----------------------------------------------|-------");
for (var i = 0; i < colWidths.Length; i++)
{
Console.WriteLine("{0,10} {1,-48}{2,6}", i, header[i], colWidths[i]);
}
Console.WriteLine("\nTotal rows: " + rowCount);
}
static Regex csvSplit = new Regex("(?:^|,)(\"(?:[^\"]+|\"\")*\"|[^,]*)", RegexOptions.Compiled);
public static string[] SplitCSV(string input)
{
List<string> list = new List<string>();
foreach (Match match in csvSplit.Matches(input))
{
var curr = match.Value;
if (0 == curr.Length)
{
list.Add("");
}
list.Add(curr.TrimStart(','));
}
return list.ToArray();
}
// Regex is slow in SplitCSV() so we use this instead
public static string[] MySplitCSV(string input)
{
bool quote = false;
StringBuilder sb = new StringBuilder();
foreach(char c in input)
{
if (c == '\"') {
quote = !quote;
}
if (c == ',' && !quote) {
sb.Append('\x01');
} else {
sb.Append(c);
}
}
var result = sb.ToString().Split('\x01');
return result;
}
}
}
@steverhall
Copy link
Author

Sample output:

Index       Column Name                                    Width
------------|----------------------------------------------|-------
         0   subject_id                                          27
         1   emar_id                                             14
         2   emar_seq                                             5
         3   parent_field_ordinal                                 9
         4   administration_type                                 47
         5   pharmacy_id                                          8
         6   barcode_type                                         4
         7   reason_for_no_barcode                              831
         8   complete_dose_not_given                              3
         9   dose_due                                            51
        10   dose_due_unit                                       26
        11   dose_given                                         152
        12   dose_given_unit                                     26
        13   will_remainder_of_dose_be_given                      3
        14   product_amount_given                                23
        15   product_unit                                        13
        16   product_code                                        19
        17   product_description                                181
        18   product_description_other                           97
        19   prior_infusion_rate                                  9
        20   infusion_rate                                       10
        21   infusion_rate_adjustment                            31
        22   infusion_rate_adjustment_amount                     23
        23   infusion_rate_unit                                  19
        24   route                                                6
        25   infusion_complete                                    1
        26   completion_interval                                 23
        27   new_iv_bag_hung                                      1
        28   continued_infusion_in_other_location                 1
        29   restart_interval                                    19
        30   side                                                 6
        31   site                                               236
        32   non_formulary_visual_verification                    1

Total rows: 55947923

@steverhall
Copy link
Author

Quote handling:

This was written so quoted columns are considered, and that there could be commas inside the quotes (which are ignored).

So, a file containing:

100001,"Kirk, James",Captain

Will process correctly as three columns.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment