Skip to content

Instantly share code, notes, and snippets.

@Silva97
Last active June 23, 2022 12:56
Show Gist options
  • Save Silva97/3b967add0dd523e30567892fbb50428e to your computer and use it in GitHub Desktop.
Save Silva97/3b967add0dd523e30567892fbb50428e to your computer and use it in GitHub Desktop.
Example of lexical analysis in C language.
/**
* Example by Luiz Felipe (Silva97)
*
* It's just an example. Don't consider it final code, and DON'T write
* all your code in a single module, please.
*
* Tip: Use a struct to represent translation units instead of using only a
* FILE pointer ;)
*/
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/**
* Kinds of token produced by lexer().
*
* The values are also used as indices into the tk_typename table, so they
* must stay small, contiguous and start at zero.
*/
typedef enum
{
TK_INTEGER = 0, // run of decimal digits
TK_OPERATOR, // '+' or '-'
} token_type_t;
/**
* One node of the doubly linked token list built by lexer().
*/
typedef struct token_s
{
struct token_s *next; // following token, NULL on the last node
struct token_s *prev; // preceding token, NULL on the first node
char *translate_unit; // filename pointer handed to lexer(); stored as-is, not copied
unsigned int line; // value of the lexer's line counter (starts at 1) when the token was created
unsigned int column; // value of the lexer's column counter when the token was created
token_type_t type; // kind of token
char *lexeme; // token text: heap buffer for integers, string literal for operators
} token_t;
// Forward declarations: main() uses these before their definitions appear below.
void show_tokens(token_t *tokens);
token_t *lexer(char *filename, FILE *tu);
char *parse_literal_int(int first_digit, FILE *tu);
/**
 * Program entry point: tokenizes the file named by argv[1] and prints
 * every token found.
 *
 * Returns EXIT_FAILURE when no filename is given, when the file cannot be
 * opened, or when lexer() returns NULL; EXIT_SUCCESS otherwise.
 */
int main(int argc, char **argv)
{
    if (argc < 2)
    {
        fputs("Usage: ./lextest <filename>\n", stderr);
        return EXIT_FAILURE;
    }

    FILE *translate_unit = fopen(argv[1], "r");
    if (!translate_unit)
    {
        perror("Unable to open file.");
        return EXIT_FAILURE;
    }

    token_t *tokens = lexer(argv[1], translate_unit);

    // The stream is only needed while lexing; close it on every path
    // instead of leaving it open until process exit.
    fclose(translate_unit);

    if (!tokens)
    {
        return EXIT_FAILURE;
    }

    show_tokens(tokens);

    // The token list is left for the OS to reclaim at exit.
    return EXIT_SUCCESS;
}
// Printable name of each token kind, indexed by the token_type_t constant.
const char *tk_typename[] = {
[TK_INTEGER] = "integer",
[TK_OPERATOR] = "operator",
};
/**
 * Print one line per token to stdout, following the `next` links from
 * `tokens` until a NULL link is reached. Each line shows the token's
 * lexeme and the printable name of its type.
 */
void show_tokens(token_t *tokens)
{
    for (token_t *tk = tokens; tk != NULL; tk = tk->next)
    {
        const char *kind = tk_typename[tk->type];

        printf("- lexeme: `%s' | type: %s\n", tk->lexeme, kind);
    }
}
// -----------------------
/**
 * Assign the pending token's kind and text in one step.
 *
 * Expects variables named `type` and `lexeme` to be in scope at the call
 * site. The body is wrapped in do { } while (0) so the expansion is a
 * single statement: it stays correct under an unbraced if/else and takes
 * the usual trailing semicolon.
 */
#define SET_TOKEN(type_value, lexeme_name) \
    do                                     \
    {                                      \
        type = (type_value);               \
        lexeme = (lexeme_name);            \
    } while (0)
/**
 * Release a token list built by lexer().
 *
 * Only TK_INTEGER lexemes are heap buffers (returned by
 * parse_literal_int()); operator lexemes are string literals and must
 * not be passed to free().
 */
static void lexer_free_tokens(token_t *first)
{
    while (first)
    {
        token_t *next = first->next;

        if (first->type == TK_INTEGER)
        {
            free(first->lexeme);
        }
        free(first);
        first = next;
    }
}

/**
 * Split the contents of `tu` into a doubly linked list of tokens.
 *
 * Recognized input: whitespace (skipped), the operators '+' and '-', and
 * runs of decimal digits.
 *
 * @param filename  Name reported in diagnostics; the pointer itself is
 *                  stored in every token, so it must outlive the list.
 * @param tu        Stream to read from; it is not closed here.
 * @return First token of the list, or NULL when the input holds no token
 *         at all, when an invalid character is met, or when memory runs
 *         out. On the two error cases a message is written to stderr and
 *         every token built so far is freed.
 */
token_t *lexer(char *filename, FILE *tu)
{
    token_type_t type;
    char *lexeme;
    int input;
    unsigned int line = 1;
    unsigned int column = 1;
    token_t *first_token = NULL;
    token_t *current_token = NULL;

    while ((input = fgetc(tu)) != EOF)
    {
        // Skip whitespace while keeping the position counters up to date.
        // CAUTION: only '\n' is treated as a line break, which is not true
        // for every text format.
        while (isspace(input))
        {
            if (input == '\n')
            {
                line++;
                column = 1;
            }
            else
            {
                column++;
            }
            input = fgetc(tu);
        }
        if (input == EOF)
        {
            break;
        }

        switch (input)
        {
        case '+':
            SET_TOKEN(TK_OPERATOR, "+");
            break;
        case '-':
            SET_TOKEN(TK_OPERATOR, "-");
            break;
        default:
            // isdigit() replaces the non-standard `case '0' ... '9'` range.
            if (isdigit(input))
            {
                SET_TOKEN(TK_INTEGER, parse_literal_int(input, tu));
                break;
            }
            fprintf(stderr, "Syntactic error: Character '%c' on %s:%u:%u is invalid.\n", input, filename, line, column);
            lexer_free_tokens(first_token);
            return NULL;
        }

        token_t *new_token = malloc(sizeof *new_token);
        if (!new_token)
        {
            fputs("Out of memory while allocating a token.\n", stderr);
            if (type == TK_INTEGER)
            {
                free(lexeme);
            }
            lexer_free_tokens(first_token);
            return NULL;
        }

        new_token->type = type;
        new_token->lexeme = lexeme;
        new_token->translate_unit = filename;
        new_token->line = line;
        new_token->column = column; // column of the token's first character
        new_token->next = NULL;
        new_token->prev = current_token;

        if (first_token)
        {
            current_token->next = new_token;
        }
        else
        {
            first_token = new_token;
        }
        current_token = new_token;

        // Step past the characters this token consumed so that the next
        // token (or diagnostic) reports the right column.
        column += (unsigned int)strlen(lexeme);
    }

    return first_token;
}
// Maximum number of digits accepted in one integer literal.
#define MAX_DIGITS 16

/**
 * Read the remainder of a decimal integer literal from `tu`.
 *
 * @param first_digit  Digit the caller has already consumed from `tu`.
 * @param tu           Stream positioned right after `first_digit`.
 * @return Newly allocated, NUL-terminated string holding `first_digit`
 *         followed by every digit read; the caller owns it and releases
 *         it with free().
 *
 * The first non-digit character is pushed back onto the stream. If the
 * literal is longer than MAX_DIGITS, or the buffer cannot be allocated,
 * a message is written to stderr and the process exits with EXIT_FAILURE.
 */
char *parse_literal_int(int first_digit, FILE *tu)
{
    char *digits = malloc(MAX_DIGITS + 1);
    if (!digits)
    {
        fputs("Out of memory while reading a literal number.\n", stderr);
        exit(EXIT_FAILURE);
    }

    size_t length = 0;
    int input;

    digits[length++] = (char)first_digit;

    while (isdigit(input = fgetc(tu)))
    {
        // The buffer holds MAX_DIGITS digits plus the terminator, so stop
        // before writing digit number MAX_DIGITS + 1.
        if (length >= MAX_DIGITS)
        {
            fputs("Literal number exceeded maximum size.\n", stderr);
            exit(EXIT_FAILURE);
        }
        digits[length++] = (char)input;
    }

    // Give the character that ended the literal back to the caller.
    if (input != EOF)
    {
        ungetc(input, tu);
    }

    digits[length] = '\0';
    return digits;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment