Blog 2020/1/10
<- previous | index | next ->
Let's learn how to write a Lisp interpreter in C!
In part 5, we improve our string implementation by adding (basic) support for escaped characters.
We need to update fbuff_get_token_str()
to correctly detect the end of the string
despite any \
characters at the end of the string.
If there is an even number of \
before the trailing "
,
then we've reached the end of the string.
cursor++;
}
+ unsigned int backslash_count = 0;
while (true) {
size_t len = cursor - buffp;
@@ -188,7 +196,7 @@
return err;
/* this is the end of the string. */
- } else if (ch == '"') {
+ } else if (ch == '"' && is_even(backslash_count)) {
*cursor = ch;
cursor++;
*cursor = '\0';
@@ -209,6 +217,14 @@
/* this is a regular char. */
} else {
+ /* track the number of consecutive backslashes so we can
+ disambiguate the closing quote. */
+ if (ch == '\\') {
+ backslash_count++;
+ } else {
+ backslash_count = 0;
+ }
+
*cursor = ch;
cursor++;
}
/* Reports whether u is divisible by two. */
static bool is_even(unsigned int u) {
    /* an even number leaves no remainder when divided by two. */
    return (u % 2u) == 0u;
}
Rather than naively calling strncpy()
,
we update parse_string()
to process character-by-character
and handle \
as a special case.
We also need to implement a way to detect escaped characters and unescape them.
return err;
}
- /* don't copy the closing quote. */
- size_t dst_len = dst_size - 1;
- /* don't copy the opening quote. */
- const char* start = buffp + 1;
- strncpy(dst, start, dst_len);
- *(dst + dst_len) = '\0';
+ /* skip the opening quote. */
+ const char* src_first = buffp + 1;
+ /* skip the closing quote. */
+ const char* src_last = buffp + src_len - 2;
+
+ const char* src_cursor = src_first;
+ char* dst_cursor = dst;
+
+ while (src_cursor <= src_last) {
+ /* this is possibly an escape sequence. */
+ if (*src_cursor == '\\') {
+ src_cursor++;
+
+ /* the last char of the string is a backslash, which is invalid. */
+ if (src_cursor > src_last) {
+ free(dst);
+ return E_parse_string__invalid_string_3;
+
+ /* this is an escape sequence. */
+ } else if (is_escapechar(*src_cursor)) {
+ *dst_cursor = unescape_char(*src_cursor);
+ src_cursor++;
+ dst_cursor++;
+ continue;
+
+ /* this is an invalid escape sequence. */
+ } else {
+ free(dst);
+ return E_parse_string__invalid_string_4;
+ }
+
+ /* this is just a regular char. */
+ } else {
+ *dst_cursor = *src_cursor;
+ src_cursor++;
+ dst_cursor++;
+ continue;
+ }
+ }
+ *dst_cursor = '\0';
*spp = dst;
return 0;
}
/* Is ch in the list of escapable chars?
   I.e. is ch one of the characters that may follow a backslash
   (a b e f n r t v ? ' " \).
   The NUL byte is never considered escapable. */
static bool is_escapechar(char ch) {
    /* strchr() considers the terminating NUL to be part of the string,
       so searching for '\0' would "succeed".  Reject it up front. */
    if (ch == '\0') {
        return false;
    }
    return strchr("abefnrtv?'\"\\", (int)ch) != NULL;
}
/* Returns the "unescaped" char corresponding to the given escape char.
   E.g. if esc is 'n', a newline character is returned.
   Asserts false if esc is not a valid escape char.
   If assertions are compiled out (NDEBUG), an invalid esc is returned
   unchanged so that the function always returns a defined value. */
static char unescape_char(char esc) {
    switch (esc) {
    case 'a':  return '\a';
    case 'b':  return '\b';
    /* ESC (0x1B).  Spelled in octal because '\e' is a compiler
       extension rather than an ISO C escape sequence. */
    case 'e':  return '\033';
    case 'f':  return '\f';
    case 'n':  return '\n';
    case 'r':  return '\r';
    case 't':  return '\t';
    case 'v':  return '\v';
    case '\\': return '\\';
    case '\'': return '\'';
    case '"':  return '"';
    case '?':  return '?';
    default:
        assert(false);
        /* only reached when assert() is disabled. */
        return esc;
    }
}
We updated the reader to unescape escaped characters. Conversely, we update the printer to escape them again, so that a string is printed in the same form in which it was entered.
/* Prints the CString in csp into fp.
Returns 0 or errno. */
static int print_cstring(CString* csp, FILE* fp) {
- int err = fprintf(fp, "CString: \"%s\"", csp->valuep);
+ char* esc;
+ escape_str(csp->valuep, &esc);
+ fprintf(fp, "CString: \"%s\"", esc);
return 0;
}
#include <errno.h> /* ENOMEM */

/* Escapes srcp into a malloc'ed dstpp.
   Every char of srcp for which is_unescaped() is true is written as a
   backslash followed by escape_char() of that char; every other char is
   copied as-is.  The result is always NUL-terminated.
   On success, *dstpp points to the new string and the caller must free() it.
   On allocation failure, *dstpp is set to NULL.
   Returns 0 or errno. */
static int escape_str(char* srcp, char** dstpp) {
    size_t src_len = strlen(srcp);
    /* dst will be worst-case twice as long (every byte becomes two bytes),
       plus the terminator, so start there, then shrink to fit at the end. */
    size_t dst_size = src_len * 2 + 1;
    char* dstp = malloc(dst_size);
    if (dstp == NULL) {
        *dstpp = NULL;
        return ENOMEM;
    }

    char* dst_cursor = dstp;
    /* index-based loop: also correct for the empty string, where a
       "last char" pointer would point before the start of srcp. */
    for (size_t i = 0; i < src_len; i++) {
        char ch = srcp[i];
        if (is_unescaped(ch)) {
            *dst_cursor = '\\';
            dst_cursor++;
            *dst_cursor = escape_char(ch);
        } else {
            *dst_cursor = ch;
        }
        dst_cursor++;
    }
    *dst_cursor = '\0';

    /* shrink-to-fit.  The length is measured from the bytes actually
       written, because an escaped char occupies two bytes in dst. */
    size_t newdst_size = (size_t)(dst_cursor - dstp) + 1;
    char* newdstp = realloc(dstp, newdst_size);
    if (newdstp != NULL) {
        dstp = newdstp;
    }
    /* if realloc() failed, the original (larger) buffer is still valid. */

    *dstpp = dstp;
    return 0;
}
/* Is ch an unescaped char?
   I.e. is ch one of the characters that must be written as a backslash
   escape: \a \b ESC \f \n \r \t \v, backslash, or double quote.
   The NUL byte is never reported as unescaped. */
bool is_unescaped(char ch) {
    /* strchr() considers the terminating NUL to be part of the string,
       so searching for '\0' would "succeed".  Reject it up front. */
    if (ch == '\0') {
        return false;
    }
    /* "\033" is ESC; spelled in octal because "\e" is a compiler
       extension rather than an ISO C escape sequence. */
    return strchr("\a\b\033\f\n\r\t\v\\\"", (int)ch) != NULL;
}
/* Returns the escaped version of unesc.
   For example, if unesc is a newline, 'n' is returned.
   Asserts false if unesc is not a valid escape char.
   If assertions are compiled out (NDEBUG), an invalid unesc is returned
   unchanged so that the function always returns a defined value. */
static char escape_char(char unesc) {
    switch (unesc) {
    case '\a':   return 'a';
    case '\b':   return 'b';
    /* ESC (0x1B).  Spelled in octal because '\e' is a compiler
       extension rather than an ISO C escape sequence. */
    case '\033': return 'e';
    case '\f':   return 'f';
    case '\n':   return 'n';
    case '\r':   return 'r';
    case '\t':   return 't';
    case '\v':   return 'v';
    case '\\':   return '\\';
    case '"':    return '"';
    default:
        assert(false);
        /* only reached when assert() is disabled. */
        return unesc;
    }
}
Let's revisit two examples from last time:
$ ./lisp
> "I said \"Hello!\" to the baker."
CString: "I said \"Hello!\" to the baker."
>
$ ./lisp
> "foo
bar"
CString: "foo\nbar"
>
Jackpot!
In part 6 we will implement support for lists!