Last active
January 10, 2022 21:10
-
-
Save steverhall/e6a816a7f095045622438797885f30f6 to your computer and use it in GitHub Desktop.
Parses CSV file and returns the max column width for each column in the file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <unistd.h> | |
#include <getopt.h> | |
#include <ctype.h> | |
#define TRUE 1 | |
#define FALSE 0 | |
#define MAX_COLUMNS 255 | |
#define MAX_ERRORS 10 | |
#define MAX_COLUMN_NAME_LENGTH 50 | |
#define MAX_INPUT_LINE_LENGTH 1000 | |
#define VARCHAR 'V' | |
#define INT 'I' | |
#define FLOAT 'F' | |
#define DATE 'D' | |
#define DATETIME 'T' | |
#define DEFAULT_COLUMN_WIDTH 20 | |
void parseHeaderColumns(char out[MAX_COLUMNS][MAX_COLUMN_NAME_LENGTH], char *line, int *numColumns); | |
void printSummary(char columns[MAX_COLUMNS][MAX_COLUMN_NAME_LENGTH], int columnWidths[], char columnType[], long largestLineNumber[], int numColumns); | |
int processLine(char *line, int columnWidths[], long largestLineNumber[], char columnType[MAX_COLUMNS], int rowNum); | |
char calcColumnType(char existingColumnType, char colData[]); | |
void checkForErrors(int foundColumns, int numColumns, long rowNum); | |
void printCreateTable(char columns[MAX_COLUMNS][MAX_COLUMN_NAME_LENGTH], int columnWidths[], char columnType[], long largestLineNumber[], int numColumns); | |
/* Create main program loop */ | |
int main(int argc, char *argv[]) { | |
char columns[MAX_COLUMNS][MAX_COLUMN_NAME_LENGTH]; | |
char columnType[MAX_COLUMNS]; | |
char line[MAX_INPUT_LINE_LENGTH]; | |
int columnWidths[MAX_COLUMNS] = {0}; | |
long largestLineNumber[MAX_COLUMNS] = {0}, rowNum = 1; | |
int numColumns = 0; | |
memset(columnType, 0, MAX_COLUMN_NAME_LENGTH); | |
/* Read in the input file */ | |
FILE *input = fopen(argv[1], "r"); | |
if (input == NULL) { | |
printf("Error: Could not open input file %s\n", argv[1]); | |
return 1; | |
} | |
/* Read first line in file */ | |
if (!fgets(line, MAX_INPUT_LINE_LENGTH, input)) { | |
printf("Error: Could not read first line in file\n"); | |
fclose(input); | |
return 1; | |
} | |
/* Parse first line */ | |
parseHeaderColumns(columns, line, &numColumns); | |
while (!feof(input)) { | |
/* Read in line */ | |
fgets(line, MAX_INPUT_LINE_LENGTH, input); | |
int foundColumns = processLine(line, columnWidths, largestLineNumber, columnType, rowNum); | |
checkForErrors(foundColumns, numColumns, rowNum); | |
rowNum++; | |
} | |
fclose(input); | |
printSummary(columns, columnWidths, columnType, largestLineNumber, numColumns); | |
printCreateTable(columns, columnWidths, columnType, largestLineNumber, numColumns); | |
exit(0); | |
} | |
void parseHeaderColumns(char out[MAX_COLUMNS][MAX_COLUMN_NAME_LENGTH], char *line, int *numColumns) { | |
char *token = strtok(line, ","); | |
*numColumns = 0; | |
while (token != NULL) { | |
strcpy(out[*numColumns], token); | |
token = strtok(NULL, ","); | |
*numColumns += 1; | |
} | |
// Remove trailing newline | |
int r = *numColumns - 1; | |
if (r > 0) { | |
int c = strlen(out[r]) - 1; | |
if (r > 0 && c > 0) { | |
out[r][c] = '\0'; | |
} | |
} | |
} | |
void printSummary(char columns[MAX_COLUMNS][MAX_COLUMN_NAME_LENGTH], int columnWidths[], char columnType[], long largestLineNumber[], int numColumns) { | |
printf("\n"); | |
printf("Index Column Name Type Width Largest Row\n"); | |
printf("------|--------------------------------------------------|------|-----------|--------------\n"); | |
for (int i = 0; i < numColumns; i++) { | |
printf("%-7d%-52s%4c%12d%15ld\n", i, columns[i], columnType[i], columnWidths[i], largestLineNumber[i]); | |
} | |
} | |
int processLine(char *line, int columnWidths[], long largestLineNumber[], char columnType[MAX_COLUMNS], int rowNum) { | |
/* Parse line */ | |
int quote = FALSE; | |
int colNum = 0, colStartIdx = 0, fieldIdx = 0; | |
int length = strlen(line); | |
char colData[5000]; | |
memset(colData, 0, 5000); | |
for (int i = 0; i < length; i++) { | |
colData[fieldIdx] = line[i]; | |
switch (line[i]) { | |
case '"': | |
quote = (quote == TRUE) ? FALSE : TRUE; | |
columnType[colNum] = VARCHAR; // quoted columns will be considered as varchar | |
break; | |
case ',': | |
if (quote == FALSE) { | |
int colWidth = i - colStartIdx; | |
colData[fieldIdx] = '\0'; | |
// Check if current column width is larger than any previous | |
if (columnWidths[colNum] < colWidth) { | |
columnWidths[colNum] = colWidth; | |
largestLineNumber[colNum] = rowNum; | |
} | |
char colType = calcColumnType(columnType[colNum], colData); | |
if (colType > 0) { | |
columnType[colNum] = colType; | |
} | |
fieldIdx = 0; // Reset colIdx | |
colNum++; // Move to next column | |
colStartIdx = i + 1; | |
memset(colData, 0, 5000); // Reset colData | |
} | |
break; | |
default: | |
if (isalpha(line[i])) { | |
columnType[colNum] = VARCHAR; | |
}; | |
fieldIdx++; | |
break; | |
} | |
} | |
// Need to handle last column, including determining it's type | |
int colWidth = length - 1 - colStartIdx; | |
if (columnWidths[colNum] < colWidth) { | |
columnWidths[colNum] = colWidth; | |
largestLineNumber[colNum] = rowNum; | |
} | |
char colType = calcColumnType(columnType[colNum], colData); | |
if (colType > 0) { | |
columnType[colNum] = colType; | |
} | |
return colNum; | |
} | |
char calcColumnType(char existingColumnType, char colData[]) { | |
// If field has no length, it's a VARCHAR | |
if (strlen(colData) == 0) { | |
return VARCHAR; | |
} | |
if (existingColumnType != VARCHAR) { // If column is already VARCHAR type, don't need to check | |
if (strchr(colData, '.') != NULL) { | |
return FLOAT; | |
} else { | |
if (existingColumnType != FLOAT) { | |
return INT; | |
} | |
} | |
} | |
return 0; | |
} | |
void checkForErrors(int foundColumns, int numColumns, long rowNum) { | |
static int numErrors = 0; | |
if (numErrors >= MAX_ERRORS) { | |
return; | |
} | |
if (foundColumns + 1 != numColumns) { | |
fprintf(stderr, "Warning: Line %ld has %d columns, expected %d\n", rowNum, foundColumns, numColumns); | |
numErrors++; | |
if (numErrors == MAX_ERRORS) { | |
fprintf(stderr, "Maximum number of errors reached. Further warnings will be suppressed.\n"); | |
} | |
} | |
} | |
void printCreateTable(char columns[MAX_COLUMNS][MAX_COLUMN_NAME_LENGTH], int columnWidths[], char columnType[], long largestLineNumber[], int numColumns) { | |
printf("\n\nCREATE TABLE MYTABLE (\n"); | |
for (int i = 0; i < numColumns; i++) { | |
switch (columnType[i]) { | |
case INT: | |
printf(" %s INT", columns[i]); | |
break; | |
case FLOAT: | |
printf(" %s FLOAT", columns[i]); | |
break; | |
case VARCHAR: | |
printf(" %s VARCHAR(%d)", columns[i], columnWidths[i] == 0 ? DEFAULT_COLUMN_WIDTH : columnWidths[i]); | |
break; | |
default: | |
printf(" %s VARCHAR(%d)", columns[i], columnWidths[i] == 0 ? DEFAULT_COLUMN_WIDTH : columnWidths[i]); | |
break; | |
} | |
if (i < numColumns - 1) { | |
printf(",\n"); | |
} else { | |
printf("\n"); | |
} | |
} | |
printf(");\n"); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage:
Compile with:
gcc csvutil.c -o csvutil
Run with:
./csvutil <filename>
Example output: