Skip to content

Instantly share code, notes, and snippets.

@wernsey
Last active July 3, 2022 15:09
Show Gist options
  • Save wernsey/721865a68d15e32281ffb6e295bc54b8 to your computer and use it in GitHub Desktop.
Save wernsey/721865a68d15e32281ffb6e295bc54b8 to your computer and use it in GitHub Desktop.
CSV stream reader - single header library
#include <stdio.h>
#include <string.h>
#include <errno.h>
#define CSV_STATIC
#define CSV_IMPLEMENTATION
#include "csvstrm.h"
int main(int argc, char *argv[]) {
CsvContext csv;
FILE *f;
if(argc < 2) {
fprintf(stderr, "CSV file expected\n");
return 1;
}
f = fopen(argv[1], "r");
if(!f) {
fprintf(stderr, "Unable to open '%s': %s\n", argv[1], strerror(errno));
return 1;
}
#if 1
csv_context_file(&csv, f);
#else
struct csv_read_limit ll;
ll.f = f;
ll.limit = 41;
csv_context_file_limit(&csv, &ll);
#endif
while(csv_read_record(&csv)) {
int j;
if(csv_get_error(&csv) != CSV_OK) {
fprintf(stderr, "error: %d\n", csv_get_error(&csv));
break;
}
for(j = 0; j < csv_count(&csv); j++) {
printf("[%s]", csv_field(&csv,j));
}
printf("\n");
}
fclose(f);
return 0;
}
#ifndef CSV_STREAM_H
#define CSV_STREAM_H /* include guard: later inclusions in the same translation unit are skipped */
#ifdef __cplusplus
extern "C" {
#endif
/**
* **CSV stream reader**
*
* Single header library to read a [CSV file][wiki] row-by-row.
*
* To use this library, define `CSV_IMPLEMENTATION` before including
* **csvstrm.h** in _one_ of your C files (other C files include
* **csvstrm.h** normally), like so:
*
* ```c
* #include <stdio.h>
*
* #define CSV_IMPLEMENTATION
* #include "csvstrm.h"
* ```
*
* It will parse CSV documents as specified by [RFC4180][RFC], but it
* follows the rule of _"be liberal in what you accept from others"_,
* so there are a couple of deviations:
*
* * Leading and trailing whitespaces in each field are trimmed by default.
* * This behaviour can be changed by defining `CSV_TRIM` as 0.
* * You can have spaces before and after the quotes in a quoted field.
* * Double quotes inside unquoted fields are allowed.
* * Records can end with CRLF or with LF character sequences.
* * It does not enforce that all records (rows) have the same number of
* fields. That is an application concern.
* * It does not specify whether the first row contains headers. That is
* left up to the application.
*
* It also took some ideas from the [Repici][] document cited by the [RFC][].
*
* The documentation for this file is generated by extracting the comments
* into a [Markdeep][] document. Here is an [Awk script][doc-script] that
* does this.
*
* # Basic usage example
*
* Here is a simple usage example. Some error handling code has been omitted.
*
* ```c
* int j;
* CsvContext csv;
* FILE *f = fopen(argv[1], "r");
*
* // Call csv_context_file() to initialise the CsvContext object
* // to read CSV data from an open file.
* csv_context_file(&csv, f);
*
* // csv_read_record() reads a row from the file.
* // It will return 0 when it reaches the end of the file
* while(csv_read_record(&csv)) {
* // You can use csv_count() to retrieve the number of fields
* // read from the file.
* // csv_field() can then be used to access an individual field.
* for(j = 0; j < csv_count(&csv); j++) {
* printf("[%s]", csv_field(&csv,j));
* }
* printf("\n");
* }
* fclose(f);
* ```
*
* # License
*
* This code is dedicated to the public domain by the author, Werner Stoop.
*
* If, for some reason, you cannot use the above public domain dedication
* then [the following license][fsfap] can be applied:
*
* ```txt
* (c) 2022 Werner Stoop
* Copying and distribution of this file, with or without modification,
* are permitted in any medium without royalty provided the copyright
* notice and this notice are preserved. This file is offered as-is,
* without any warranty.
* ```
*
* [wiki]: https://en.wikipedia.org/wiki/Comma-separated_values
* [RFC]: https://datatracker.ietf.org/doc/html/rfc4180
* [Repici]: https://www.creativyst.com/Doc/Articles/CSV/CSV01.shtml
* [fsfap]: https://en.wikipedia.org/wiki/GNU_All-permissive_License
* [Markdeep]: https://casual-effects.com/markdeep/
* [doc-script]: https://gist.github.com/wernsey/de253d42a8df6f3b21358e4b5422b955
*/
/**
* # Configuration
*
* These macros can be defined before including **csvstrm.h** in your
* C file to control the behaviour of the library.
*
* `CSV_DELIMITER`
* : The delimiter to separate fields (columns) in each record (row)
* It defaults to `','`.
* `CSV_BUFFER_SIZE`
* : While each record is being read, the characters from the file
* are copied to an internal buffer. This controls the size of
* that internal buffer.
* `CSV_READ_BUFFER_SIZE`
* : This controls the size of the second internal buffer that
* stores raw bytes as they are read from the input before they're
* processed.
* `CSV_MAX_FIELDS`
* : The maximum number of fields expected per record.
* `CSV_TRIM`
* : Determines whether leading and trailing whitespace characters will
* be trimmed from fields by the parser.
* For example, consider a CSV section `..., foo ,...`. If `CSV_TRIM` is
* non-zero the field will be returned as `"foo"`. If it is 0 then the
* whitespace will be left intact, so it will be returned as `" foo "`
*
* These macros _must_ be the same in all files that include **csvstrm.h**.
*/
# ifndef CSV_DELIMITER
# define CSV_DELIMITER ','
# endif
# ifndef CSV_BUFFER_SIZE
# define CSV_BUFFER_SIZE 256
# endif
# ifndef CSV_READ_BUFFER_SIZE
# define CSV_READ_BUFFER_SIZE 64
# endif
# ifndef CSV_MAX_FIELDS
# define CSV_MAX_FIELDS 32
# endif
# ifndef CSV_TRIM
# define CSV_TRIM 1
# endif
/**
* # Definitions
*
* ## `csv_read_data_fun`
*
* `typedef int (*csv_read_data_fun)(char *b, int n, void *d);`
*
* Prototype for functions that can read CSV data.
*
* `b` is a pointer to a buffer that will be filled with chars from
* the input files. `n` contains the size in bytes of the buffer.
* `d` is a pointer to some structure where the data is read from.
*
* For example, when reading a CSV file from a ZIP archive, `d` might
* point to the structure that the ZIP library uses to encapsulate the archive.
*
* The function should return 0 if it reaches the end of the input data,
* non-zero otherwise.
*
* See `csv_context_custom()` in Section [initialising the csvcontext]
*/
typedef int (*csv_read_data_fun)(char *b, int n, void *d);
/**
* ## `enum csv_error_code`
*
* `CSV_OK`
* : No error
* `CSV_ERR_BUFFER`
* : The buffer used to store field data internally is full.
* It is too small for the record (row) you're reading.
* Increase `CSV_BUFFER_SIZE`.
* `CSV_ERR_FIELDS`
* : There are too many fields (columns) in the record.
* Increase `CSV_MAX_FIELDS`.
* `CSV_ERR_BAD_QUOTE`
* : A quoted field is incorrectly formatted.
* `CSV_ERR_LINE_END`
* : There is a problem with a line ending.
*
*/
enum csv_error_code {
    /* No error */
    CSV_OK            = 0,
    /* Field buffer is full: increase CSV_BUFFER_SIZE */
    CSV_ERR_BUFFER    = 1,
    /* Too many fields in the record: increase CSV_MAX_FIELDS */
    CSV_ERR_FIELDS    = 2,
    /* A quoted field is incorrectly formatted */
    CSV_ERR_BAD_QUOTE = 3,
    /* There is a problem with a line ending */
    CSV_ERR_LINE_END  = 4
};
/**
* ## `typedef struct CsvContext CsvContext;`
*
* Structure that contains the state of the CSV stream parser.
*
* The fields in the structure should not be manipulated directly,
* but these are some members of interest:
*
* `char *fields[CSV_MAX_FIELDS]`
* : The array of pointers that contain the fields after parsing a record.
* Rather use `csv_field()` to access the individual fields.
* `int nf`
* : The number of fields parsed from a record.
* Rather use `csv_count()` to read this value.
* `enum csv_error_code err`
* : An error code that may have resulted from parsing the record.
* Rather use `csv_get_error()` to retrieve this value.
*
* Section [initialising the csvcontext] below describes how to
* initialise the structure to read CSV data.
*/
typedef struct CsvContext {
/* Determines where the data is read from: `get_data` is the reader
callback and `data` is the pointer handed to it on every call. */
csv_read_data_fun get_data;
void *data;
/* The internal buffer, where bytes are read into
from the file, but before they're processed. */
char raw_buffer[CSV_READ_BUFFER_SIZE];
/* Index of the next unread byte in `raw_buffer`. A value of
CSV_READ_BUFFER_SIZE or more forces a refill on the next read. */
int in_pos;
/* One-character pushback slot: 0 when empty, EOF once the end of
the input has been reached. */
int last_char;
/* Where the data for the fields is stored.
The values in `fields` are pointers into this buffer */
char buffer[CSV_BUFFER_SIZE];
/* The fields that have been parsed from the file */
char *fields[CSV_MAX_FIELDS];
/* Number of valid entries in `fields` */
int nf;
/* Error code from parsing the most recent record, CSV_OK if none */
enum csv_error_code err;
} CsvContext;
#ifdef EOF /* EOF will be defined if <stdio.h> is #included */
/**
* # Initialising the `CsvContext`
*
* `void csv_context_file(CsvContext *csv, FILE *file)`
*
* Initialises a `CsvContext` structure to read data from a file
* pointed to by `file`.
*/
void csv_context_file(CsvContext *csv, FILE *file);
/**
* `void csv_context_file_limit(CsvContext *csv, struct csv_read_limit *ll)`
*
* Initialises a `CsvContext` structure to read data from a file, but it will
* only read a limited number of bytes from the file.
*
* (The intended use-case is where a CSV file has been concatenated with other
* files into an archive file)
*
* The `csv_read_limit` structure is defined as follows:
*
* ```
* struct csv_read_limit {
* FILE *f;
* int limit;
* };
* ```
*
* where `f` is the file to read from and `limit` is the maximum
* number of bytes that will be read from a file.
*/
struct csv_read_limit {
/* The file to read from */
FILE *f;
/* Bytes that may still be read from `f`; the reader installed by
csv_context_file_limit() reduces this after every chunk it reads. */
int limit;
};
void csv_context_file_limit(CsvContext *csv, struct csv_read_limit *ll);
#endif
/**
* `void csv_context_custom(CsvContext *csv, csv_read_data_fun fun, void *data)`
*
* Initialises a `CsvContext` with a custom function `fun` that will read bytes
* from an object `data`.
*/
void csv_context_custom(CsvContext *csv, csv_read_data_fun fun, void *data);
/**
* # Reading records
*
* `int csv_read_record(CsvContext *csv)`
*
* Reads a record from the CSV file.
*
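* It returns the number of fields that were read from the record, or 0
* once the end of the input has been reached.
* If parsing fails part-way through a record, the number of fields that
* were completed before the failure is returned (possibly 0), so if the
* number of fields does not match the number of fields expected then
* `csv_get_error()` can be used to retrieve the error code.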
*
* `int csv_count(CsvContext *csv)`
*
* Get the number of fields in the last record that was read by
* `csv_read_record()`.
*
* `const char *csv_field(CsvContext *csv, int i)`
*
* Get the `i`'th field of the last record that was read by
* `csv_read_record()`.
*
* `enum csv_error_code csv_get_error(CsvContext *csv)`
*
* Retrieves an error code (if any) from the `CsvContext`.
* The error codes are described in Subsection [enum csv_error_code].
*/
int csv_read_record(CsvContext *csv);
int csv_count(CsvContext *csv);
const char *csv_field(CsvContext *csv, int i);
enum csv_error_code csv_get_error(CsvContext *csv);
/* *********************************************************************** */
# ifdef CSV_IMPLEMENTATION
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#ifdef __cplusplus
# define CAST(x, y) (x)y
#else
# define CAST(x, y) y
#endif
/*
 * Returns the next input character as an `unsigned char` value converted
 * to `int` (the same convention as fgetc()), or EOF at the end of input.
 *
 * `csv->last_char` doubles as a one-character pushback slot and a sticky
 * end-of-input flag: EOF means the reader is exhausted, any other
 * non-zero value is a character stored by unget_char().
 *
 * `raw_buffer` is consumed as a NUL-terminated chunk; reaching the
 * terminator (or the end of the array) asks `get_data` for the next chunk.
 */
static int get_char(CsvContext *csv) {
    int c;

    if(csv->last_char == EOF)
        return EOF;

    if(csv->last_char) {
        c = csv->last_char;
        csv->last_char = 0;
        return c;
    }

    if(csv->in_pos >= CSV_READ_BUFFER_SIZE || csv->raw_buffer[csv->in_pos] == '\0') {
        /* Current chunk is used up: fetch the next one. */
        if(!csv->get_data(csv->raw_buffer, CSV_READ_BUFFER_SIZE, csv->data)) {
            csv->last_char = EOF;
            return EOF;
        }
        csv->in_pos = 0;
    }

    /* Convert through unsigned char so bytes >= 0x80 are never negative.
       Where plain char is signed, byte 0xFF would otherwise compare equal
       to EOF, and pushing it back would end the stream for good. */
    return (unsigned char)csv->raw_buffer[csv->in_pos++];
}
/*
 * Pushes `c` back so that the next get_char() call returns it.
 * Only one character is held; a second call overwrites the first.
 * get_char() treats a stored 0 as "nothing pushed back" and a stored
 * EOF as end of input.
 */
static void unget_char(CsvContext *csv, int c) {
csv->last_char = c;
}
/*
 * Reads the next record from the input.
 *
 * Field text is copied into `csv->buffer`, each field NUL-terminated, and
 * `csv->fields[0 .. nf-1]` are pointed at the individual fields.
 *
 * Returns the number of fields stored, or 0 when the input is exhausted.
 * On a parse error `csv->err` is set and the number of fields completed
 * before the error is returned (which can also be 0).
 */
int csv_read_record(CsvContext *csv) {
    int c = 0;
    /* `bump` is the next free index in csv->buffer; `start` is where the
       current unquoted field began (lower bound for trailing-space trim). */
    size_t start = 0, bump = 0;
    enum parse_state {
        RECORD_START, FIELD_START, FIELD, QUOTE, FIELD_END, RECORD_END
    } state = RECORD_START;

    if(csv->last_char == EOF) return 0;

    csv->nf = 0;
    csv->err = CSV_OK;

    for(;;) {
        switch(state) {
        case RECORD_START:
            /* Peek one character to tell "no more records" apart from
               an empty line. */
            c = get_char(csv);
            if(c == EOF)
                return 0;
            state = FIELD_START;
            unget_char(csv, c);
            break;

        case FIELD_START:
            if(csv->nf == CSV_MAX_FIELDS) {
                csv->err = CSV_ERR_FIELDS;
                return csv->nf;
            }
            c = get_char(csv);
#if CSV_TRIM
            /* Skip leading blanks (this also allows blanks before an
               opening quote). */
            while(strchr(" \t\v\f", c))
                c = get_char(csv);
#endif
            csv->fields[csv->nf] = &csv->buffer[bump];
            if(c == '\"')
                state = QUOTE;
            else {
                unget_char(csv, c);
                start = bump;
                state = FIELD;
            }
            break;

        case FIELD:
            c = get_char(csv);
            if(c == '\r') {
                /* A CR is only accepted as the first half of CRLF. */
                c = get_char(csv);
                if(c != '\n') {
                    csv->err = CSV_ERR_LINE_END;
                    return csv->nf;
                }
            }
            if(c == EOF || c == '\n' || c == CSV_DELIMITER) {
#if CSV_TRIM
                /* Drop trailing blanks, but never past the field start. */
                while(bump > start && strchr(" \t\v\f", csv->buffer[bump-1]))
                    bump--;
#endif
                state = c == CSV_DELIMITER ? FIELD_END : RECORD_END;
            } else {
                if(bump == CSV_BUFFER_SIZE - 1) {
                    csv->err = CSV_ERR_BUFFER;
                    return csv->nf;
                }
                csv->buffer[bump++] = c;
            }
            break;

        case QUOTE:
            c = get_char(csv);
            if(c == EOF) {
                /* Input ended inside a quoted field. */
                csv->err = CSV_ERR_BAD_QUOTE;
                return csv->nf;
            }
            if(c == '\"') {
                c = get_char(csv);
                if(c != '\"') {
                    /* That was the closing quote. */
#if CSV_TRIM
                    while(strchr(" \t\v\f", c))
                        c = get_char(csv);
#endif
                    if(c == '\r') {
                        /* CRLF right after the closing quote ends the
                           record, exactly as it does for an unquoted
                           field. */
                        c = get_char(csv);
                        if(c != '\n') {
                            csv->err = CSV_ERR_LINE_END;
                            return csv->nf;
                        }
                    }
                    if(c == EOF || c == '\n') {
                        state = RECORD_END;
                    } else if(c == CSV_DELIMITER) {
                        state = FIELD_END;
                    } else {
                        csv->err = CSV_ERR_BAD_QUOTE;
                        return csv->nf;
                    }
                    break;
                }
                /* Two quotes in a row: fall through and store one quote. */
            }
            if(bump == CSV_BUFFER_SIZE - 1) {
                csv->err = CSV_ERR_BUFFER;
                return csv->nf;
            }
            csv->buffer[bump++] = c;
            break;

        case FIELD_END:
        case RECORD_END:
            /* Terminate the field just collected and count it. */
            if(bump == CSV_BUFFER_SIZE - 1) {
                csv->err = CSV_ERR_BUFFER;
                return csv->nf;
            }
            csv->buffer[bump++] = '\0';
            csv->nf++;
            if(state == RECORD_END)
                return csv->nf;
            else
                state = FIELD_START;
            break;
        }
    }
}
/*
 * Binds `csv` to the reader callback `fun` and its argument `data`,
 * and resets the parser state.
 */
void csv_context_custom(CsvContext *csv, csv_read_data_fun fun, void *data) {
    /* Parser state: no error, no fields, nothing pushed back. */
    csv->err = CSV_OK;
    csv->nf = 0;
    csv->last_char = 0;

    /* Marking the raw buffer as fully consumed makes get_char() call the
       reader the first time a character is requested. */
    csv->in_pos = CSV_READ_BUFFER_SIZE;

    /* Input source. */
    csv->data = data;
    csv->get_data = fun;
}
/*
 * Reader callback for stdio streams; `data` is the `FILE*`.
 *
 * Reads up to `num - 1` bytes into `str` and NUL-terminates them.
 * Returns 1 if at least one byte was read, 0 if the stream was already
 * at end-of-file or nothing could be read.
 */
static int file_input_get_line(char *str, int num, void *data) {
    FILE *stream = CAST(FILE*, data);
    size_t got;

    if(feof(stream))
        return 0;

    got = fread(str, 1, num-1, stream);
    str[got] = '\0';

    return got ? 1 : 0;
}
/*
 * Initialises `csv` to read from `file` by installing
 * file_input_get_line() as the reader, with `file` as its data pointer.
 */
void csv_context_file(CsvContext *csv, FILE *file) {
/* Only checked when assertions are enabled (NDEBUG not defined). */
assert(file);
csv_context_custom(csv, file_input_get_line, file);
}
/*
 * Reader callback for a stdio stream with a byte budget; `data` is a
 * `struct csv_read_limit *`.
 *
 * Reads up to `num - 1` bytes, but never more than `ll->limit`, into `str`
 * and NUL-terminates them. `ll->limit` is reduced by the number of bytes
 * actually taken from the file.
 *
 * Returns 1 if at least one byte was read, 0 once the budget is used up
 * or nothing could be read.
 */
static int file_input_get_line_limit(char *str, int num, void *data) {
    size_t nread;
    struct csv_read_limit *ll = CAST(struct csv_read_limit *, data);

    /* `<= 0` rather than `== 0`: a negative limit would otherwise become a
       huge size_t count in the fread() call below. */
    if(ll->limit <= 0) return 0;

    num--; /* keep one byte for the terminator */
    if(num > ll->limit)
        num = ll->limit;

    nread = fread(str, 1, num, ll->f);
    str[nread] = '\0';
    if(!nread)
        return 0;

    /* Charge the bytes consumed from the file, not strlen(str): a NUL byte
       in the data makes strlen() come up short, which would let later
       calls read past the limit. */
    ll->limit -= (int)nread;
    return 1;
}
/*
 * Initialises `csv` to read from `ll->f` through
 * file_input_get_line_limit(). The pointer `ll` itself is stored as the
 * reader's data pointer (the structure is not copied), and the reader
 * updates `ll->limit` as it reads.
 */
void csv_context_file_limit(CsvContext *csv, struct csv_read_limit *ll) {
/* Both checks are asserts, so they are compiled out when NDEBUG is defined. */
assert(ll->f);
assert(ll->limit > 0);
csv_context_custom(csv, file_input_get_line_limit, ll);
}
/* Returns `csv->nf`, the field count stored by the most recent
   csv_read_record() call (0 right after initialisation). */
int csv_count(CsvContext *csv) {
return csv->nf;
}
/*
 * Returns field `i` of the current record. An index outside
 * `0 .. csv->nf - 1` yields an empty string rather than NULL.
 */
const char *csv_field(CsvContext *csv, int i) {
    int in_range = (i >= 0) && (i < csv->nf);
    return in_range ? csv->fields[i] : "";
}
/* Returns `csv->err`. csv_read_record() resets it to CSV_OK before
   parsing a record, but not when it returns early because the end of
   the input was already reached. */
enum csv_error_code csv_get_error(CsvContext *csv) {
return csv->err;
}
# endif /* CSV_IMPLEMENTATION */
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* CSV_STREAM_H */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment