Skip to content

Instantly share code, notes, and snippets.

@8dcc
Created November 18, 2023 15:53
Show Gist options
  • Save 8dcc/9e3d9da86b113cd54894b0a56b6548e7 to your computer and use it in GitHub Desktop.
Save 8dcc/9e3d9da86b113cd54894b0a56b6548e7 to your computer and use it in GitHub Desktop.
Slightly modified syntax highlighter from StackOverflow
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
/*
* C syntax highlighter to stdout.
* Source: https://stackoverflow.com/a/77465783/11715554
*/
// XXX: no support for UTF-8 identifiers
// XXX: no support for universal-character-name in identifiers
/*
 * Token categories produced by c_get_token(). The values are also used as
 * indices into the c_colors[] table, so the two must stay in sync.
 */
enum c_token_type {
END, // end of input (NUL terminator reached)
WHITESPACE, // run of blanks other than a newline
NEWLINE, // end of line; "\r" and "\r\n" are reported as '\n' too
COMMENT, // "//" line comment or terminated block comment
PREPROCESSOR, // '#' found as the first non-blank token of a line
KEYWORD, // identifier listed in c_keywords
IDENTIFIER, // any other identifier
STRING, // terminated string literal, including prefixed forms
CHARCONST, // terminated character constant, including prefixed forms
NUMBER, // pp-number
OPERATOR, // punctuator listed in c_punctuators
CTYPE, // identifier listed in c_types or ending in "_t"
FUNCALL, // identifier followed by '('
OTHER, // character that matches no entry of c_punctuators
ERROR, // unterminated string, character constant or block comment
};
/*
 * Lexer state shared by c_get_token() and the c_parse_* helpers.
 */
struct c_parse_context {
const char* filename; // name of the input, as passed to c_colorize()
const char* source; // start of the NUL-terminated source text
const char* p; // current read position inside source
const char* token_start; // where the most recent token begins; set by c_get_token()
char token_string[80]; // scratch copy of the current identifier or operator text
int line_number; // initialized to 1 by c_colorize(); not updated by the visible code
int column_number; // initialized to 1 by c_colorize(); not updated by the visible code
int at_bol; // nonzero while no non-blank token has been seen on the current line
int in_preprocess; // nonzero from a leading '#' until the next newline
//...
};
/*
 * Space-delimited list of C keywords, searched by c_is_keyword().
 * Every word has a space on both sides (including the first and the last)
 * because c_find_word() only accepts matches delimited by spaces.
 * Type keywords appear here as well as in c_types; c_parse_identifier()
 * tests c_is_type() first, so those are reported as CTYPE.
 */
static const char c_keywords[] =
" auto break case const continue default do else enum extern for "
" goto if inline register restrict return sizeof static struct "
" switch typedef union volatile while "
/* types */
" char double float int long unsigned short signed void "
/* C99 and C11 keywords */
" _Alignas _Alignof _Atomic _Generic _Noreturn _Static_assert "
" _Thread_local "
/* C99 and C11 types */
" _Bool _Complex _Imaginary "
/* C23 keywords */
" alignas alignof constexpr false nullptr static_assert thread_local "
" true typeof typeof_unqual "
/* C23 types */
" bool _BitInt _Decimal128 _Decimal32 _Decimal64 ";
/*
 * Space-delimited list of type names, searched by c_is_type().
 * Every word has a space on both sides so that c_find_word() can match
 * whole words only. Each name is listed exactly once.
 */
static const char c_types[] =
    /* types */
    " char double float int long unsigned short signed void "
    /* C99 and C11 types */
    " _Bool _Complex _Imaginary "
    /* C23 types */
    " bool _BitInt _Decimal128 _Decimal32 _Decimal64 "
    /* common standard types */
    " FILE va_list ";
/*
 * Space-delimited list of punctuators recognised by c_parse_operator(),
 * including the digraphs <: :> <% %> %: %:%:.
 * The longest entry is four characters, which is also the maximum number
 * of characters c_parse_operator() collects.
 */
static const char c_punctuators[] =
" [ ] ( ) { } . -> ++ -- & * + - ~ ! / % << >> < > <= >= == != ^ | "
" && || ? : :: ; ... = *= /= %= += -= <<= >>= &= ^= |= , # ## "
" <: :> <% %> %: %:%: ";
/*
 * Report whether the first `len` characters of `s` form a complete word in
 * `words`, a list of words separated by spaces.
 *
 * A candidate only matches when it is delimited on each side by a space or
 * by the start/end of the list, so "reak" does not match inside "break".
 *
 * Returns 1 on a match and 0 otherwise. An empty word (len <= 0 or a NUL
 * within the first `len` characters of `s`) never matches.
 */
static int c_find_word(const char* words, const char* s, int len) {
    /* Searching for '\0' with strchr() would return the terminator and the
     * following p++ would step past the end of the list. */
    if (len <= 0 || memchr(s, '\0', (size_t)len) != NULL)
        return 0;

    for (const char* p = words; (p = strchr(p, *s)) != NULL; p++) {
        /* Left boundary: never read before the start of the list. */
        if (p != words && p[-1] != ' ')
            continue;
        if (strncmp(p, s, (size_t)len) != 0)
            continue;
        /* All len characters matched and none is NUL, so p[len] is at most
         * the list terminator. */
        if (p[len] == ' ' || p[len] == '\0')
            return 1;
    }
    return 0;
}
/*
 * Return 1 when the `len`-character word at `s` is listed in c_keywords,
 * 0 otherwise.
 */
static int c_is_keyword(const char* s, int len) {
    const int listed = c_find_word(c_keywords, s, len);
    return listed;
}
/*
 * Return 1 when the `len`-character word at `s` names a type: either it is
 * listed in c_types, or it is longer than two characters and ends in "_t".
 * Returns 0 otherwise.
 */
static int c_is_type(const char* s, int len) {
    if (c_find_word(c_types, s, len))
        return 1;

    /* The "_t" suffix needs at least one character in front of it. */
    if (len <= 2)
        return 0;

    return s[len - 2] == '_' && s[len - 1] == 't';
}
/*
 * Read the next logical source character and advance pc->p past it.
 *
 * - Returns 0 at the NUL terminator and leaves pc->p on the terminator, so
 *   repeated calls at end of input keep returning 0.
 * - "\r" and "\r\n" are both returned as a single '\n'.
 * - A backslash immediately followed by an end of line ("\n", "\r" or
 *   "\r\n") is removed and reading continues with the next character.
 *
 * All other characters are returned as unsigned char values.
 */
static int c_getc(struct c_parse_context* pc) {
    for (;;) {
        int c = (unsigned char)*pc->p++;
        if (c == '\0') {
            pc->p--; // stay on the terminator
            return 0;
        }
        if (c == '\r') { // convert end of line sequences to '\n'
            if (*pc->p == '\n')
                pc->p += 1;
            return '\n';
        }
#if 0 // trigraphs can be handled here.
        if (c == '?' && *pc->p == '?') {
            switch (pc->p[1]) {
            case '=': c = '#'; pc->p += 2; break;
            case '(': c = '['; pc->p += 2; break;
            case '/': c = '\\'; pc->p += 2; break;
            case ')': c = ']'; pc->p += 2; break;
            case '\'': c = '^'; pc->p += 2; break;
            case '<': c = '{'; pc->p += 2; break;
            case '!': c = '|'; pc->p += 2; break;
            case '>': c = '}'; pc->p += 2; break;
            case '-': c = '~'; pc->p += 2; break;
            }
        }
#endif
        if (c == '\\') { // remove escaped newlines
            if (*pc->p == '\n') {
                pc->p++;
                continue;
            }
            if (*pc->p == '\r') {
                pc->p++;
                if (*pc->p == '\n')
                    pc->p++;
                continue;
            }
        }
        return c;
    }
}
/*
 * Return the character the next c_getc() call would return, without
 * changing the read position.
 */
static int c_peekc(struct c_parse_context* pc) {
    const char* const saved = pc->p;
    const int next = c_getc(pc);

    pc->p = saved;
    return next;
}
/*
 * Return the character that the second of two consecutive c_getc() calls
 * would return, without changing the read position.
 */
static int c_peekc2(struct c_parse_context* pc) {
    const char* const saved = pc->p;

    (void)c_getc(pc); // skip the first character
    const int second = c_getc(pc);

    pc->p = saved;
    return second;
}
/*
 * Return 1 when `c` may appear in an identifier (a letter, a digit or an
 * underscore), 0 otherwise.
 */
static inline int c_isalnum_(int c) {
    if (isalnum(c))
        return 1;
    return c == '_' ? 1 : 0;
}
/*
 * Consume the remainder of a pp-number whose first character has already
 * been read. `lastc` is that first character.
 *
 * Accepted continuation characters:
 *   - identifier characters and '.',
 *   - '+' or '-' directly after one of e, E, p, P (exponent sign),
 *   - a single quote followed by an identifier character (C23 digit
 *     separator).
 *
 * The pp-number grammar is deliberately loose, so invalid numbers are
 * accepted as well. Always returns NUMBER.
 */
static int c_parse_number(struct c_parse_context* pc, int lastc) {
    int prev = lastc;

    for (;;) {
        const int next = c_peekc(pc);
        int accepted;

        if (next == '\0')
            break;

        if (c_isalnum_(next) || next == '.')
            accepted = 1;
        else if (next == '+' || next == '-')
            accepted = memchr("eEpP", prev, 4) != NULL;
        else if (next == '\'')
            accepted = c_isalnum_(c_peekc2(pc));
        else
            accepted = 0;

        if (!accepted)
            break;

        prev = next;
        c_getc(pc);
    }
    return NUMBER;
}
/*
 * Consume a string literal or character constant whose opening delimiter
 * `sep` has already been read.
 *
 * Returns CHARCONST when `sep` is a single quote and the closing quote is
 * found, STRING for any other delimiter that is closed, and ERROR when the
 * end of the line or of the input is reached first. The character after a
 * backslash is skipped so an escaped delimiter does not end the token.
 */
static int c_parse_string(struct c_parse_context* pc, int sep) {
    const int closed_type = (sep == '\'') ? CHARCONST : STRING;

    for (;;) {
        const int ch = c_peekc(pc);

        // unterminated string or character constant
        if (ch == '\0' || ch == '\n')
            return ERROR;

        c_getc(pc);
        if (ch == sep)
            return closed_type;

        if (ch == '\\') {
            // skip the escaped character; end of input right here is an error
            if (c_getc(pc) == '\0')
                return ERROR;
        }
    }
}
/*
 * Recognise the longest punctuator starting with `c`, which has already
 * been read.
 *
 * Up to four consecutive punctuation characters are collected in
 * pc->token_string; after each one the prefix is looked up in
 * c_punctuators and the longest matching length is remembered.
 *
 * On a match the read position is placed just after the matched text,
 * pc->token_string is NUL-terminated and OPERATOR is returned. Otherwise
 * only `c` stays consumed and OTHER is returned.
 */
static int c_parse_operator(struct c_parse_context* pc, int c) {
    enum { MAX_PUNCT_LEN = 4 };
    const char* resume[MAX_PUNCT_LEN]; // read position after each collected char
    size_t matched = 0;                // length of the longest punctuator seen
    size_t count = 0;
    int ch = c;

    do {
        pc->token_string[count] = (char)ch;
        resume[count] = pc->p;
        count++;

        if (c_find_word(c_punctuators, pc->token_string, (int)count))
            matched = count;

        if (count == MAX_PUNCT_LEN)
            break;

        ch = c_getc(pc);
    } while (ispunct(ch));

    if (matched == 0) {
        pc->p = resume[0];
        return OTHER;
    }

    pc->p = resume[matched - 1];
    pc->token_string[matched] = '\0';
    return OPERATOR;
}
/*
 * Consume a "//" comment up to, but not including, the end of the line or
 * the end of the input. Always returns COMMENT.
 */
static int c_parse_comment1(struct c_parse_context* pc) {
    for (;;) {
        const int next = c_peekc(pc);

        if (next == '\0' || next == '\n')
            break;
        c_getc(pc);
    }
    return COMMENT;
}
/*
 * Consume a block comment whose opening delimiter has already been read.
 * Returns COMMENT once the closing delimiter has been consumed, or ERROR
 * when the input ends first.
 */
static int c_parse_comment2(struct c_parse_context* pc) {
    for (;;) {
        const int ch = c_peekc(pc);

        // unterminated comment
        if (ch == '\0')
            return ERROR;

        c_getc(pc);
        if (ch != '*')
            continue;

        if (c_peekc(pc) == '/') {
            c_getc(pc);
            return COMMENT;
        }
    }
}
/*
 * Consume an identifier whose first character `c` has already been read,
 * and classify it.
 *
 * The identifier text is copied to pc->token_string; names longer than the
 * buffer are consumed completely but stored truncated.
 *
 * Returns, in order of precedence:
 *   CTYPE      - the name satisfies c_is_type()
 *   KEYWORD    - the name satisfies c_is_keyword()
 *   FUNCALL    - the name is followed by '(', optionally after blanks
 *   IDENTIFIER - anything else
 */
static int c_parse_identifier(struct c_parse_context* pc, int c) {
    size_t len = 0;

    pc->token_string[len++] = (char)c;
    while (c_isalnum_(c = c_peekc(pc))) {
        if (len < sizeof(pc->token_string) - 1)
            pc->token_string[len++] = (char)c;
        c_getc(pc);
    }
    pc->token_string[len] = '\0';

    if (c_is_type(pc->token_string, (int)len))
        return CTYPE;
    if (c_is_keyword(pc->token_string, (int)len))
        return KEYWORD;

    if (isblank(c)) {
        /* Look past the whole run of blanks, not just one, so that
         * "name  (" is still seen as a call. The blanks are not part of
         * this token, so the read position is restored afterwards. */
        const char* const resume = pc->p;
        do {
            c_getc(pc);
            c = c_peekc(pc);
        } while (isblank(c));
        pc->p = resume;
    }
    if (c == '(')
        return FUNCALL;
    return IDENTIFIER;
}
/*
 * Read the next token from pc and return its category.
 *
 * On return pc->token_start points at the first character of the token and
 * pc->p just past its last character. pc->at_bol and pc->in_preprocess are
 * updated: a newline sets at_bol and clears in_preprocess; any non-blank
 * token clears at_bol; a '#' seen while at_bol is set starts a preprocessor
 * line.
 */
enum c_token_type c_get_token(struct c_parse_context* pc) {
    pc->token_start = pc->p;
    const int c = c_getc(pc);

    if (isspace(c)) {
        if (c != '\n') {
            // swallow the rest of the run of blanks
            while (memchr(" \t\f\v", c_peekc(pc), 4) != NULL)
                c_getc(pc);
            return WHITESPACE;
        }
        pc->at_bol = 1;
        pc->in_preprocess = 0;
        return NEWLINE;
    }

    // The first non-blank token of a line decides whether it is a directive.
    const int first_on_line = pc->at_bol;
    pc->at_bol = 0;
    if (first_on_line && c == '#') {
        pc->in_preprocess = 1;
        return PREPROCESSOR;
    }

    if (c == '\0')
        return END;

    if (c == '/') {
        const int next = c_peekc(pc);
        if (next == '/') {
            c_getc(pc);
            return c_parse_comment1(pc);
        }
        if (next == '*') {
            c_getc(pc);
            return c_parse_comment2(pc);
        }
    } else if (c == '.') {
        // ".5" is a number, a lone '.' is an operator
        if (isdigit(c_peekc(pc)))
            return c_parse_number(pc, c);
    } else if (c >= '0' && c <= '9') {
        return c_parse_number(pc, c);
    } else if (c == '\'' || c == '\"') {
        return c_parse_string(pc, c);
    } else if (c == 'L' || c == 'U') {
        // L"..." / U"..." and the character constant equivalents
        const int quote = c_peekc(pc);
        if (quote == '\'' || quote == '\"')
            return c_parse_string(pc, c_getc(pc));
    } else if (c == 'u') {
        // u8"..." / u8'...' first, then u"..." / u'...'
        const int c1 = c_peekc(pc);
        if (c1 == '8') {
            const int c2 = c_peekc2(pc);
            if (c2 == '\'' || c2 == '\"') {
                c_getc(pc);
                return c_parse_string(pc, c_getc(pc));
            }
        }
        if (c1 == '\'' || c1 == '\"')
            return c_parse_string(pc, c_getc(pc));
    }

    // XXX: should handle UTF-8 and universal-character-name here
    if (c_isalnum_(c))
        return c_parse_identifier(pc, c);
    return c_parse_operator(pc, c);
}
// C token colorizer
// ANSI colors
// Each macro expands to an escape sequence that c_colorize() writes to
// stdout verbatim.
#define RESET "\033[0m"
#define BLACK "\033[30m"
#define RED "\033[31m"
#define GREEN "\033[32m"
#define YELLOW "\033[33m"
#define BLUE "\033[34m"
#define MAGENTA "\033[35m"
#define CYAN "\033[36m"
#define WHITE "\033[37m"
#define GREY "\033[90m"
#define BRIGHT_RED "\033[91m"
#define BRIGHT_GREEN "\033[92m"
#define BRIGHT_YELLOW "\033[93m"
#define BRIGHT_BLUE "\033[94m"
#define BRIGHT_MAGENTA "\033[95m"
#define BRIGHT_CYAN "\033[96m"
#define BRIGHT_WHITE "\033[97m"
// Color used for tokens without a dedicated color (blanks, plain
// identifiers and operators).
#define DEFAULT BRIGHT_GREEN
// Escape sequence for each token category, indexed by enum c_token_type.
const char* const c_colors[] = {
[END] = RESET,
[WHITESPACE] = DEFAULT,
[NEWLINE] = RESET,
[COMMENT] = WHITE,
[PREPROCESSOR] = CYAN,
[KEYWORD] = BRIGHT_WHITE,
[IDENTIFIER] = DEFAULT,
[STRING] = BRIGHT_CYAN,
[CHARCONST] = BRIGHT_CYAN,
[NUMBER] = GREEN,
[OPERATOR] = DEFAULT,
[CTYPE] = BRIGHT_MAGENTA,
[FUNCALL] = BRIGHT_YELLOW,
[OTHER] = RED,
[ERROR] = RED,
};
void c_colorize(const char* filename, const char* source) {
struct c_parse_context ctx = {
filename, source, source, NULL, { 0 }, 1, 1, 1, 0,
};
enum c_token_type last_color = END;
for (;;) {
enum c_token_type tok_type = c_get_token(&ctx);
const char* s = ctx.token_start;
int len = ctx.p - s;
enum c_token_type color = tok_type;
if (ctx.in_preprocess && tok_type != COMMENT)
color = PREPROCESSOR;
if (last_color != color) {
if (c_colors[color])
fputs(c_colors[color], stdout);
last_color = color;
}
if (tok_type == END)
break;
printf("%.*s", len, s);
s += len;
}
}
/*
 * Read a whole stream into a newly allocated, NUL-terminated string.
 *
 * filename: name used in error messages; also the path that is opened when
 *           `fp` is NULL.
 * fp:       stream to read from, or NULL to open `filename` for reading.
 *           A stream passed in by the caller is not closed.
 *
 * Returns the contents (an empty string for an empty input), which the
 * caller must free(), or NULL after printing a message to stderr when the
 * file cannot be opened, memory runs out or a read error occurs.
 */
char* load_file(const char* filename, FILE* fp) {
    char buf[4096];
    char* source = NULL;
    size_t len = 0;
    size_t nread;
    FILE* fp_close = NULL;

    if (fp == NULL) {
        fp = fopen(filename, "r");
        if (fp == NULL) {
            fprintf(stderr, "cannot open %s: %s\n", filename, strerror(errno));
            return NULL;
        }
        fp_close = fp;
    }

    /* Start from an empty string so that an empty input is not mistaken
     * for a failure. */
    source = malloc(1);
    if (source == NULL) {
        fprintf(stderr, "out of memory for %s\n", filename);
        goto done;
    }
    source[0] = '\0';

    while ((nread = fread(buf, 1, sizeof buf, fp)) > 0) {
        char* new_buf = realloc(source, len + nread + 1);
        if (new_buf == NULL) {
            fprintf(stderr, "out of memory for %s\n", filename);
            free(source);
            source = NULL;
            goto done;
        }
        source = new_buf;
        memcpy(source + len, buf, nread);
        len += nread;
        source[len] = '\0';
    }

    /* fread() returns 0 both at end of file and on error. */
    if (ferror(fp)) {
        fprintf(stderr, "error reading %s\n", filename);
        free(source);
        source = NULL;
    }

done:
    if (fp_close)
        fclose(fp_close);
    return source;
}
/*
 * Colorize every file named on the command line, in order, or standard
 * input when no file is given. Inputs that cannot be loaded are skipped.
 * Always returns 0.
 */
int main(int argc, char* argv[]) {
    if (argc <= 1) {
        char* source = load_file("<stdin>", stdin);
        if (source != NULL) {
            c_colorize("<stdin>", source);
            free(source);
        }
        return 0;
    }

    for (int i = 1; i < argc; i++) {
        const char* name = argv[i];
        char* source = load_file(name, NULL);
        if (source == NULL)
            continue;
        c_colorize(name, source);
        free(source);
    }
    return 0;
}
@8dcc
Copy link
Author

8dcc commented Nov 18, 2023

Revision number: 5

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment