Created
July 16, 2009 01:10
-
-
Save ender672/148089 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
/* Characters of the CSV input that carry structural meaning. */
enum CHAR_CODE {
    col_sep    = ',',   /* separates columns within a row */
    row_sep    = '\n',  /* terminates a row */
    quote_char = '"'    /* opens and closes a quoted column */
};
/*
 * Forward declarations for the helpers defined after main().
 * "(void)" makes these real prototypes, so a call with stray arguments is
 * rejected at compile time instead of being silently accepted.
 */
int get_char(void);
int put_char(int c);
void finish_col(void);
void finish_row(void);
int main() | |
{ | |
enum states { | |
col_start, in_esc_col, in_non_esc_col, esc_col_quote, error | |
} state; | |
int c; | |
state = col_start; | |
while((c = get_char()) != EOF) { | |
switch(state) { | |
case col_start: | |
switch(c) { | |
case col_sep: finish_col(); break; | |
case row_sep: finish_row(); break; | |
case quote_char: state = in_esc_col; break; | |
default: put_char(c); state = in_non_esc_col; | |
} | |
break; | |
case in_esc_col: | |
switch(c) { | |
case quote_char: state = esc_col_quote; break; | |
case col_sep: | |
case row_sep: | |
default: put_char(c); | |
} | |
break; | |
case in_non_esc_col: | |
switch(c) { | |
case col_sep: finish_col(); state = col_start; break; | |
case row_sep: finish_row(); state = col_start; break; | |
case quote_char: state = error; break; | |
default: put_char(c); | |
} | |
break; | |
case esc_col_quote: | |
switch(c) { | |
case col_sep: finish_col(); state = col_start; break; | |
case row_sep: finish_row(); state = col_start; break; | |
case quote_char: put_char(c); state = in_esc_col; break; | |
default: state = error; | |
} | |
break; | |
case error: return 1; | |
} | |
} | |
if (state == col_start) return 0; | |
return 1; | |
} | |
/*
 * Read the next character from standard input, discarding every carriage
 * return so that "\r\n" line endings reach the caller as "\n".
 *
 * Returns the character read, or EOF at end of input or on a read error.
 */
int get_char(void) {
    int c;

    /* Loop instead of testing once: a run such as "\r\r\n" must not leak
     * a '\r' through to the caller. */
    do {
        c = getchar();
    } while (c == '\r');

    return c;
}
/*
 * Write one character to standard output.
 *
 * Returns the result of putchar(): the character written, or EOF on a
 * write error.
 */
int put_char(int c) {
    const int written = putchar(c);

    return written;
}
/* Close the current column by emitting the output column delimiter. */
void finish_col() {
    const int out_delim = '|';

    (void)put_char(out_delim);
}
/* Close the current row: terminate its last column, then end the line. */
void finish_row() {
    const int line_end = '\n';

    finish_col();
    (void)put_char(line_end);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
/* TODO:
 * Look for ambiguities in the FSM.
 * Add support for unquoted fields.
 * Throw errors on bad characters, such as \r in unquoted fields.
 *
 * Add support for multibyte, Unicode, user-supplied tokens.
 * This will work by buffering the input in get_char() and looking for
 * matching user-supplied multibyte tokens. Matches will be replaced with
 * special characters, such as 001, 002, etc. The special characters will
 * be used for col_sep, row_sep, etc. in the Ragel state machine.
 * Output will be translated by having put_char() expand those special
 * characters back to the user-supplied multibyte tokens.
 *
 * The same technique should apply to the plain C parser.
 */
/*
 * Forward declarations for the I/O helpers defined after main().
 * "(void)" makes get_char a real prototype, so a call with stray arguments
 * is rejected at compile time.
 */
int get_char(void);
int put_char(int c);
%%{
    # Ragel definition of the CSV recogniser. All output is produced by
    # the embedded actions, which write through put_char().
    machine csv;

    # --- Output actions ---------------------------------------------

    # Copy the current input character to the output.
    action data_char {
        put_char(fc);
    }

    # A doubled quote inside a field yields a single literal quote.
    action escaped_quote {
        put_char('"');
    }

    # Mark the end of a column in the output.
    action end_column {
        put_char('|');
    }

    # Mark the end of a row in the output.
    action end_row {
        put_char('\n');
    }

    # --- Input tokens -----------------------------------------------

    col_sep    = ',';
    row_sep    = '\r\n';
    quote_char = '"';

    # --- Grammar ----------------------------------------------------

    # Field content: either any non-quote character or a doubled quote.
    escaped_quote = (quote_char quote_char) @escaped_quote;
    data          = (any - quote_char) @data_char;

    # NOTE(review): end_column is attached to the closing quote and
    # col_sep is optional between fields, so a quote that ends a field
    # may also be read as the first half of a doubled quote -- confirm
    # the generated machine behaves as intended for input like "a""b".
    quoted_field = quote_char (data | escaped_quote)* quote_char @end_column;
    row          = (quoted_field col_sep?)* row_sep @end_row;

    main := row*;

    write data;
}%%
/*
 * Run standard input through the csv machine, one character per
 * "write exec" invocation; the machine's actions produce the output.
 *
 * Returns 0 when the input is exhausted with the machine in a final state
 * (that is, after zero or more complete rows). Returns 1 when the machine
 * enters its error state or the input stops in the middle of a row.
 */
int main(void)
{
    int cs;

    %% write init;

    for (;;) {
        /* Keep the result in an int: EOF does not fit in a char, so
         * comparing a char against EOF either mistakes byte 0xFF for
         * end of input (signed char) or never matches (unsigned char). */
        int ch = get_char();
        if (ch == EOF) {
            break;
        }

        /* Present the single character to the machine as a 1-byte buffer. */
        char c = (char)ch;
        char *p = &c;
        char *pe = p + 1;

        %% write exec;

        /* Once the machine has failed, no further input can be accepted. */
        if (cs == csv_error) {
            break;
        }
    }

    /* csv_error and csv_first_final are emitted by "write data". */
    return (cs >= csv_first_final) ? 0 : 1;
}
/*
 * Fetch the next character from standard input.
 *
 * Returns the character read, or EOF at end of input or on a read error.
 */
int get_char() {
    const int next = getchar();

    return next;
}
/*
 * Write one character to standard output.
 *
 * Returns the result of putchar(): the character written, or EOF on a
 * write error.
 */
int put_char(int c) {
    const int written = putchar(c);

    return written;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment