Skip to content

Instantly share code, notes, and snippets.

@steverhall
Last active January 10, 2022 21:10
Show Gist options
  • Save steverhall/e6a816a7f095045622438797885f30f6 to your computer and use it in GitHub Desktop.
Save steverhall/e6a816a7f095045622438797885f30f6 to your computer and use it in GitHub Desktop.
Parses CSV file and returns the max column width for each column in the file
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
#define TRUE 1
#define FALSE 0
#define MAX_COLUMNS 255
#define MAX_ERRORS 10
#define MAX_COLUMN_NAME_LENGTH 50
#define MAX_INPUT_LINE_LENGTH 1000
#define VARCHAR 'V'
#define INT 'I'
#define FLOAT 'F'
#define DATE 'D'
#define DATETIME 'T'
#define DEFAULT_COLUMN_WIDTH 20
void parseHeaderColumns(char out[MAX_COLUMNS][MAX_COLUMN_NAME_LENGTH], char *line, int *numColumns);
void printSummary(char columns[MAX_COLUMNS][MAX_COLUMN_NAME_LENGTH], int columnWidths[], char columnType[], long largestLineNumber[], int numColumns);
int processLine(char *line, int columnWidths[], long largestLineNumber[], char columnType[MAX_COLUMNS], int rowNum);
char calcColumnType(char existingColumnType, char colData[]);
void checkForErrors(int foundColumns, int numColumns, long rowNum);
void printCreateTable(char columns[MAX_COLUMNS][MAX_COLUMN_NAME_LENGTH], int columnWidths[], char columnType[], long largestLineNumber[], int numColumns);
/* Create main program loop */
int main(int argc, char *argv[]) {
char columns[MAX_COLUMNS][MAX_COLUMN_NAME_LENGTH];
char columnType[MAX_COLUMNS];
char line[MAX_INPUT_LINE_LENGTH];
int columnWidths[MAX_COLUMNS] = {0};
long largestLineNumber[MAX_COLUMNS] = {0}, rowNum = 1;
int numColumns = 0;
memset(columnType, 0, MAX_COLUMN_NAME_LENGTH);
/* Read in the input file */
FILE *input = fopen(argv[1], "r");
if (input == NULL) {
printf("Error: Could not open input file %s\n", argv[1]);
return 1;
}
/* Read first line in file */
if (!fgets(line, MAX_INPUT_LINE_LENGTH, input)) {
printf("Error: Could not read first line in file\n");
fclose(input);
return 1;
}
/* Parse first line */
parseHeaderColumns(columns, line, &numColumns);
while (!feof(input)) {
/* Read in line */
fgets(line, MAX_INPUT_LINE_LENGTH, input);
int foundColumns = processLine(line, columnWidths, largestLineNumber, columnType, rowNum);
checkForErrors(foundColumns, numColumns, rowNum);
rowNum++;
}
fclose(input);
printSummary(columns, columnWidths, columnType, largestLineNumber, numColumns);
printCreateTable(columns, columnWidths, columnType, largestLineNumber, numColumns);
exit(0);
}
void parseHeaderColumns(char out[MAX_COLUMNS][MAX_COLUMN_NAME_LENGTH], char *line, int *numColumns) {
char *token = strtok(line, ",");
*numColumns = 0;
while (token != NULL) {
strcpy(out[*numColumns], token);
token = strtok(NULL, ",");
*numColumns += 1;
}
// Remove trailing newline
int r = *numColumns - 1;
if (r > 0) {
int c = strlen(out[r]) - 1;
if (r > 0 && c > 0) {
out[r][c] = '\0';
}
}
}
void printSummary(char columns[MAX_COLUMNS][MAX_COLUMN_NAME_LENGTH], int columnWidths[], char columnType[], long largestLineNumber[], int numColumns) {
printf("\n");
printf("Index Column Name Type Width Largest Row\n");
printf("------|--------------------------------------------------|------|-----------|--------------\n");
for (int i = 0; i < numColumns; i++) {
printf("%-7d%-52s%4c%12d%15ld\n", i, columns[i], columnType[i], columnWidths[i], largestLineNumber[i]);
}
}
int processLine(char *line, int columnWidths[], long largestLineNumber[], char columnType[MAX_COLUMNS], int rowNum) {
/* Parse line */
int quote = FALSE;
int colNum = 0, colStartIdx = 0, fieldIdx = 0;
int length = strlen(line);
char colData[5000];
memset(colData, 0, 5000);
for (int i = 0; i < length; i++) {
colData[fieldIdx] = line[i];
switch (line[i]) {
case '"':
quote = (quote == TRUE) ? FALSE : TRUE;
columnType[colNum] = VARCHAR; // quoted columns will be considered as varchar
break;
case ',':
if (quote == FALSE) {
int colWidth = i - colStartIdx;
colData[fieldIdx] = '\0';
// Check if current column width is larger than any previous
if (columnWidths[colNum] < colWidth) {
columnWidths[colNum] = colWidth;
largestLineNumber[colNum] = rowNum;
}
char colType = calcColumnType(columnType[colNum], colData);
if (colType > 0) {
columnType[colNum] = colType;
}
fieldIdx = 0; // Reset colIdx
colNum++; // Move to next column
colStartIdx = i + 1;
memset(colData, 0, 5000); // Reset colData
}
break;
default:
if (isalpha(line[i])) {
columnType[colNum] = VARCHAR;
};
fieldIdx++;
break;
}
}
// Need to handle last column, including determining it's type
int colWidth = length - 1 - colStartIdx;
if (columnWidths[colNum] < colWidth) {
columnWidths[colNum] = colWidth;
largestLineNumber[colNum] = rowNum;
}
char colType = calcColumnType(columnType[colNum], colData);
if (colType > 0) {
columnType[colNum] = colType;
}
return colNum;
}
char calcColumnType(char existingColumnType, char colData[]) {
// If field has no length, it's a VARCHAR
if (strlen(colData) == 0) {
return VARCHAR;
}
if (existingColumnType != VARCHAR) { // If column is already VARCHAR type, don't need to check
if (strchr(colData, '.') != NULL) {
return FLOAT;
} else {
if (existingColumnType != FLOAT) {
return INT;
}
}
}
return 0;
}
void checkForErrors(int foundColumns, int numColumns, long rowNum) {
static int numErrors = 0;
if (numErrors >= MAX_ERRORS) {
return;
}
if (foundColumns + 1 != numColumns) {
fprintf(stderr, "Warning: Line %ld has %d columns, expected %d\n", rowNum, foundColumns, numColumns);
numErrors++;
if (numErrors == MAX_ERRORS) {
fprintf(stderr, "Maximum number of errors reached. Further warnings will be suppressed.\n");
}
}
}
void printCreateTable(char columns[MAX_COLUMNS][MAX_COLUMN_NAME_LENGTH], int columnWidths[], char columnType[], long largestLineNumber[], int numColumns) {
printf("\n\nCREATE TABLE MYTABLE (\n");
for (int i = 0; i < numColumns; i++) {
switch (columnType[i]) {
case INT:
printf(" %s INT", columns[i]);
break;
case FLOAT:
printf(" %s FLOAT", columns[i]);
break;
case VARCHAR:
printf(" %s VARCHAR(%d)", columns[i], columnWidths[i] == 0 ? DEFAULT_COLUMN_WIDTH : columnWidths[i]);
break;
default:
printf(" %s VARCHAR(%d)", columns[i], columnWidths[i] == 0 ? DEFAULT_COLUMN_WIDTH : columnWidths[i]);
break;
}
if (i < numColumns - 1) {
printf(",\n");
} else {
printf("\n");
}
}
printf(");\n");
}
@steverhall
Copy link
Author

steverhall commented Jan 9, 2022

Usage:
Compile with: gcc csvutil.c -o csvutil
Run with: ./csvutil <filename>

Example output:

Index  Column Name                                        Type       Width      Largest Row
------|--------------------------------------------------|------|-----------|--------------
0      subject_id                                             I           8              1
1      emar_id                                                I          11              1
2      emar_seq                                               I           2              1
3      parent_field_ordinal                                   F           3              1
4      administration_type                                    V           0              0
5      pharmacy_id                                            I           8              1
6      barcode_type                                           V           2              1
7      reason_for_no_barcode                                  V           0              0
8      complete_dose_not_given                                V           0              0
9      dose_due                                               V           0              0
10     dose_due_unit                                          V           0              0
11     dose_given                                             I           2              1
12     dose_given_unit                                        V           3              1
13     will_remainder_of_dose_be_given                        V           0              0
14     product_amount_given                                   V           3             23
15     product_unit                                           V           3              1
16     product_code                                           V           9             23
17     product_description                                    V          34              1
18     product_description_other                              V           0              0
19     prior_infusion_rate                                    V           0              0
20     infusion_rate                                          V           0              0
21     infusion_rate_adjustment                               V           0              0
22     infusion_rate_adjustment_amount                        V           0              0
23     infusion_rate_unit                                     V           0              0
24     route                                                  V           0              0
25     infusion_complete                                      V           0              0
26     completion_interval                                    V           0              0
27     new_iv_bag_hung                                        V           0              0
28     continued_infusion_in_other_location                   V           0              0
29     restart_interval                                       V           0              0
30     side                                                   V           0              0
31     site                                                   V           0              0
32     non_formulary_visual_verification                      V           0              0
33     comments                                               V          34             12


CREATE TABLE MYTABLE (
    subject_id INT,
    emar_id INT,
    emar_seq INT,
    parent_field_ordinal FLOAT,
    administration_type VARCHAR(20),
    pharmacy_id INT,
    barcode_type VARCHAR(2),
    reason_for_no_barcode VARCHAR(20),
    complete_dose_not_given VARCHAR(20),
    dose_due VARCHAR(20),
    dose_due_unit VARCHAR(20),
    dose_given INT,
    dose_given_unit VARCHAR(3),
    will_remainder_of_dose_be_given VARCHAR(20),
    product_amount_given VARCHAR(3),
    product_unit VARCHAR(3),
    product_code VARCHAR(9),
    product_description VARCHAR(34),
    product_description_other VARCHAR(20),
    prior_infusion_rate VARCHAR(20),
    infusion_rate VARCHAR(20),
    infusion_rate_adjustment VARCHAR(20),
    infusion_rate_adjustment_amount VARCHAR(20),
    infusion_rate_unit VARCHAR(20),
    route VARCHAR(20),
    infusion_complete VARCHAR(20),
    completion_interval VARCHAR(20),
    new_iv_bag_hung VARCHAR(20),
    continued_infusion_in_other_location VARCHAR(20),
    restart_interval VARCHAR(20),
    side VARCHAR(20),
    site VARCHAR(20),
    non_formulary_visual_verification VARCHAR(20),
    comments VARCHAR(34)
);

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment