Skip to content

Instantly share code, notes, and snippets.

@ender672
Created July 16, 2009 01:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ender672/148089 to your computer and use it in GitHub Desktop.
Save ender672/148089 to your computer and use it in GitHub Desktop.
#include <stdio.h>
/* Single-byte tokens the parser's state machine switches on. */
enum CHAR_CODE {
    col_sep    = ',',   /* separates columns within a row */
    row_sep    = '\n',  /* terminates a row */
    quote_char = '"'    /* opens/closes a quoted column; doubled to escape */
};
/*
 * Forward declarations for the I/O helpers defined after main().
 * Written as real prototypes ("(void)" / named parameter) so the compiler
 * checks every call's arguments; an empty "()" declares no parameter
 * information at all in pre-C23 C.
 */
int get_char(void);     /* next input character, or EOF */
int put_char(int c);    /* write one character of output */
void finish_col(void);  /* emit the end-of-column marker */
void finish_row(void);  /* emit the end-of-row sequence */
/*
 * Convert CSV read through get_char() into '|'-terminated columns, one
 * output line per input row.
 *
 * The input is driven through a small state machine:
 *   col_start      - at the beginning of a column
 *   in_non_esc_col - inside an unquoted column
 *   in_esc_col     - inside a quoted column
 *   esc_col_quote  - just saw a quote inside a quoted column: either the
 *                    closing quote or the first half of an escaped ""
 *   error          - malformed input
 *
 * Returns 0 when the whole input was well formed. Returns 1 on a quote in
 * the middle of an unquoted column, on data following a closing quote, or
 * on a quoted column that is still open at end of input.
 *
 * A final row that is not terminated by row_sep is still closed properly
 * (its last column and the row terminator are emitted) instead of being
 * reported as an error.
 */
int main()
{
    enum states {
        col_start, in_esc_col, in_non_esc_col, esc_col_quote, error
    } state = col_start;
    int row_open = 0;   /* nonzero while the current row has consumed input */
    int c;

    while ((c = get_char()) != EOF) {
        switch (state) {
        case col_start:
            switch (c) {
            case col_sep:    finish_col(); break;   /* empty column */
            case row_sep:    finish_row(); break;
            case quote_char: state = in_esc_col; break;
            default:         put_char(c); state = in_non_esc_col; break;
            }
            break;

        case in_esc_col:
            /* Inside quotes the separators are ordinary data. */
            if (c == quote_char)
                state = esc_col_quote;
            else
                put_char(c);
            break;

        case in_non_esc_col:
            switch (c) {
            case col_sep:    finish_col(); state = col_start; break;
            case row_sep:    finish_row(); state = col_start; break;
            case quote_char: state = error; break;  /* quote in bare column */
            default:         put_char(c); break;
            }
            break;

        case esc_col_quote:
            switch (c) {
            case col_sep:    finish_col(); state = col_start; break;
            case row_sep:    finish_row(); state = col_start; break;
            /* Second quote of a pair: emit one literal quote, stay quoted. */
            case quote_char: put_char(c); state = in_esc_col; break;
            default:         state = error; break;  /* junk after closing quote */
            }
            break;

        case error:
            /* Not reached: the check below returns as soon as the error
             * state is entered. Kept so every enumerator has a case. */
            return 1;
        }

        if (state == error)
            return 1;

        /* The row is closed only right after a row separator was handled
         * outside of quotes; any other consumed character leaves it open. */
        row_open = !(state == col_start && c == row_sep);
    }

    if (state == in_esc_col)
        return 1;       /* quoted column never closed */

    /* The last record may lack a trailing row separator; close it here so
     * its final column and the row terminator are not lost. */
    if (row_open)
        finish_row();
    return 0;
}
/*
 * Read the next character from stdin, discarding carriage returns.
 *
 * Returns the character read by getchar(), or EOF when getchar() reports
 * EOF. Every '\r' is dropped, so "\r\n" and also "\r\r\n" are seen by the
 * caller as a single '\n'.
 */
int get_char() {
    int c;

    /* Loop instead of testing once: a run of consecutive '\r' characters
     * must be skipped entirely, not just its first character. */
    do {
        c = getchar();
    } while (c == '\r');

    return c;
}
/*
 * Write one character to stdout.
 * Returns whatever the underlying stdio call returns: the character
 * written, or EOF on a write error.
 */
int put_char(int c) {
    int written = putc(c, stdout);
    return written;
}
/* Mark the end of a column by writing a '|' through put_char(). */
void finish_col() {
    const int column_mark = '|';

    (void)put_char(column_mark);
}
/*
 * Mark the end of a row: close the row's last column first, then write
 * a newline through put_char().
 */
void finish_row() {
    const int row_mark = '\n';

    finish_col();
    (void)put_char(row_mark);
}
#include <stdio.h>
/* TODO:
* Look for ambiguities in fsm
* Add support for unquoted fields
* Throw errors on bad characters such as \r in unquoted fields
*
* Add support for multibyte, Unicode, user-supplied tokens.
* This will work by buffering the input in get_char() and looking for
* matching user-supplied multibyte tokens. Matches will be replaced with
* special characters, such as 001, 002, etc. The special characters will
* be used for col_sep, row_sep, etc. in the Ragel state machine.
* Output will be translated by having put_char() expand those special
* characters back to the user-supplied multibyte tokens.
*
* The same technique should apply to the plain C parser.
*/
/*
 * Character I/O hooks used by main() and by the machine's actions; both
 * are defined after main(). Declared as real prototypes so calls are
 * argument-checked (an empty "()" carries no parameter information in
 * pre-C23 C).
 */
int get_char(void);   /* next input character, or EOF */
int put_char(int c);  /* write one character of output */
%%{
    machine csv;

    # Output actions. Each one writes through put_char().

    # A quoted field is complete: write the column terminator.
    action end_column {
        put_char('|');
    }

    # A row separator was fully consumed: write the row terminator.
    action end_row {
        put_char('\n');
    }

    # Copy the current input character of a quoted field to the output.
    action data_char {
        put_char(fc);
    }

    # A doubled quote inside a quoted field stands for one literal quote.
    action escaped_quote {
        put_char('"');
    }

    col_sep = ',';
    row_sep = '\r\n';
    quote_char = '"';

    escaped_quote = (quote_char quote_char) @escaped_quote;

    # Anything except a quote is field data, separators included.
    data = (any - quote_char) @data_char;

    # No action is attached to the closing quote itself. When that quote is
    # read it cannot yet be told apart from the first half of an escaped
    # quote pair, so an action placed on it would also run in the middle of
    # fields that contain a doubled quote.
    quoted_field = quote_char (data | escaped_quote)* quote_char;

    # end_column is a leaving action: it runs on the separator that follows
    # a complete field, which is the first point where the field is known
    # to have ended. Fields must be separated by col_sep. Allowing two
    # fields to touch would make a doubled quote ambiguous between an
    # escaped quote and the boundary of two adjacent fields.
    # NOTE(review): a col_sep directly before row_sep is no longer accepted,
    # the machine has no notion of an empty unquoted field to emit for it.
    row = (quoted_field %end_column (col_sep quoted_field %end_column)*)?
          row_sep @end_row;

    main := row*;

    write data;
}%%
/*
 * Feed the input to the Ragel-generated csv machine one character at a
 * time; the machine's actions write the converted output.
 *
 * Returns 0 when the input ends with the machine in a final state (only
 * complete rows were seen), 1 when the machine entered its error state or
 * the input stopped in the middle of a row.
 */
int main()
{
    int cs;     /* current machine state; must persist across exec calls */
    %% write init;
    for (;;) {
        /* Keep the result in an int: after narrowing to char, EOF can no
         * longer be told apart from a data byte (or never compares equal
         * at all where plain char is unsigned). */
        int ch = get_char();
        if ( ch == EOF )
            break;
        char c = (char)ch;
        char *p = &c;       /* one-character buffer handed to the machine */
        char *pe = p + 1;
        %% write exec;
        if ( cs == csv_error )
            return 1;       /* malformed input: stop and report it */
    }
    return cs >= csv_first_final ? 0 : 1;
}
/*
 * Fetch the next character from stdin, unmodified.
 * Returns the character, or EOF when the underlying stdio call reports it.
 */
int get_char() {
    int next = getc(stdin);
    return next;
}
/*
 * Write one character to stdout.
 * Returns the result of the underlying stdio call: the character written,
 * or EOF on a write error.
 */
int put_char(int c) {
    int written = putc(c, stdout);
    return written;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment