-
-
Save FROGGS/923b97f208ddb4fef181 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/src/io/syncfile.c b/src/io/syncfile.c | |
index d951a73..6342dae 100644 | |
--- a/src/io/syncfile.c | |
+++ b/src/io/syncfile.c | |
@@ -35,8 +35,8 @@ typedef struct { | |
/* Decode stream, for turning bytes from disk into strings. */ | |
MVMDecodeStream *ds; | |
- /* Current separator codepoint. */ | |
- MVMGrapheme32 sep; | |
+ /* Current separator(s). */ | |
+ MVMString **sep; | |
} MVMIOFileData; | |
/* Closes the file. */ | |
@@ -90,8 +90,10 @@ static MVMint64 tell(MVMThreadContext *tc, MVMOSHandle *h) { | |
/* Set the line separator. */ | |
static void set_separator(MVMThreadContext *tc, MVMOSHandle *h, MVMString *sep) { | |
MVMIOFileData *data = (MVMIOFileData *)h->body.data; | |
- data->sep = (MVMGrapheme32)MVM_string_get_grapheme_at(tc, sep, | |
- MVM_string_graphs(tc, sep) - 1); | |
+ data->sep = MVM_malloc(sizeof(MVMString *) * 3); | |
+ MVM_ASSIGN_REF(tc, &(h->common.header), data->sep[0], sep); | |
+ //~ data->sep[0] = sep; // XXX duplicate | |
+ data->sep[1] = NULL; | |
} | |
/* Read a bunch of bytes into the current decode stream. */ | |
diff --git a/src/io/syncstream.c b/src/io/syncstream.c | |
index 3070105..9be4add 100644 | |
--- a/src/io/syncstream.c | |
+++ b/src/io/syncstream.c | |
@@ -39,10 +39,11 @@ MVMint64 MVM_io_syncstream_tell(MVMThreadContext *tc, MVMOSHandle *h) { | |
/* Set the line separator. */ | |
void MVM_io_syncstream_set_separator(MVMThreadContext *tc, MVMOSHandle *h, MVMString *sep) { | |
- /* For now, take last character. */ | |
MVMIOSyncStreamData *data = (MVMIOSyncStreamData *)h->body.data; | |
- data->sep = (MVMGrapheme32)MVM_string_get_grapheme_at(tc, sep, | |
- MVM_string_graphs(tc, sep) - 1); | |
+ data->sep = MVM_malloc(sizeof(MVMString *) * 3); | |
+ MVM_ASSIGN_REF(tc, &(h->common.header), data->sep[0], sep); | |
+ //~ data->sep[0] = sep; | |
+ data->sep[1] = NULL; | |
} | |
/* Read a bunch of bytes into the current decode stream. Returns true if we | |
@@ -304,7 +305,7 @@ MVMObject * MVM_io_syncstream_from_uvstream(MVMThreadContext *tc, uv_stream_t *h | |
MVMIOSyncStreamData * const data = MVM_calloc(1, sizeof(MVMIOSyncStreamData)); | |
data->handle = handle; | |
data->encoding = MVM_encoding_type_utf8; | |
- data->sep = '\n'; | |
+ //~ data->sep = '\n'; | |
result->body.ops = &op_table; | |
result->body.data = data; | |
return (MVMObject *)result; | |
diff --git a/src/io/syncstream.h b/src/io/syncstream.h | |
index 99d898c..bd02a87 100644 | |
--- a/src/io/syncstream.h | |
+++ b/src/io/syncstream.h | |
@@ -18,8 +18,8 @@ struct MVMIOSyncStreamData { | |
/* Total bytes we've written. */ | |
MVMint64 total_bytes_written; | |
- /* Current separator codepoint. */ | |
- MVMGrapheme32 sep; | |
+ /* Current separator(s). */ | |
+ MVMString **sep; | |
}; | |
void MVM_io_syncstream_set_encoding(MVMThreadContext *tc, MVMOSHandle *h, MVMint64 encoding); | |
diff --git a/src/strings/ascii.c b/src/strings/ascii.c | |
index 5ef3a8f..9545f27 100644 | |
--- a/src/strings/ascii.c | |
+++ b/src/strings/ascii.c | |
@@ -29,10 +29,30 @@ MVMString * MVM_string_ascii_decode_nt(MVMThreadContext *tc, MVMObject *result_t | |
return MVM_string_ascii_decode(tc, result_type, ascii, strlen(ascii)); | |
} | |
+/* This function is used to determine if it is a safe point to stop requesting more data from a stream because | |
+ * we already found a complete separator. */ | |
+static MVMint64 MVM_string_decodestream_find_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 codepoint, MVMString **seplist) { | |
+ //~ ds->partial_sep = "foo"; | |
+ //~ printf("%d\n", strlen(ds->partial_sep)); | |
+ //~ char *sep_remainder_list; | |
+ | |
+ // loop over every separator in list | |
+ unsigned int found_sep = 0; | |
+ | |
+ if (!seplist) | |
+ return 0; | |
+ | |
+ /* Return success when we find the first complete separator, while keeping | |
+ * track of trailing bytes of separators that match what we found so far. */ | |
+ // XXX | |
+ | |
+ return 0; | |
+} | |
+ | |
/* Decodes using a decodestream. Decodes as far as it can with the input | |
* buffers, or until a stopper is reached. */ | |
void MVM_string_ascii_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, | |
- MVMint32 *stopper_chars, MVMint32 *stopper_sep) { | |
+ MVMint32 *stopper_chars, MVMString **stopper_sep) { | |
MVMint32 count = 0, total = 0; | |
MVMint32 bufsize; | |
MVMGrapheme32 *buffer; | |
@@ -77,7 +97,11 @@ void MVM_string_ascii_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, | |
total++; | |
if (stopper_chars && *stopper_chars == total) | |
goto done; | |
- if (stopper_sep && *stopper_sep == codepoint) | |
+ //~ if (stopper_sep) { // XXX if our stoppers are \r\n \n \r, and we hit \r, store \n as to-be-removed | |
+ //~ printf("%s:%d old=%d\n", __FILE__, __LINE__, *stopper_sep == codepoint ? 1 : 0); | |
+ //~ printf("%s:%d find=%d\n", __FILE__, __LINE__, MVM_string_decodestream_find_sep(tc, ds, codepoint, stopper_sep)); | |
+ //~ } | |
+ if (stopper_sep && MVM_string_decodestream_find_sep(tc, ds, codepoint, stopper_sep)) | |
goto done; | |
} | |
cur_bytes = cur_bytes->next; | |
diff --git a/src/strings/ascii.h b/src/strings/ascii.h | |
index 9d3bd81..2d49e6f 100644 | |
--- a/src/strings/ascii.h | |
+++ b/src/strings/ascii.h | |
@@ -1,6 +1,6 @@ | |
MVM_PUBLIC MVMString * MVM_string_ascii_decode(MVMThreadContext *tc, MVMObject *result_type, const MVMint8 *ascii, size_t bytes); | |
MVM_PUBLIC MVMString * MVM_string_ascii_decode_nt(MVMThreadContext *tc, MVMObject *result_type, const char *ascii); | |
-MVM_PUBLIC void MVM_string_ascii_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMint32 *stopper_sep); | |
+MVM_PUBLIC void MVM_string_ascii_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMString **stopper_sep); | |
MVM_PUBLIC MVMuint8 * MVM_string_ascii_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length); | |
MVM_PUBLIC MVMuint8 * MVM_string_ascii_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size); | |
MVMuint8 * MVM_string_ascii_encode_any(MVMThreadContext *tc, MVMString *str); | |
diff --git a/src/strings/decode_stream.c b/src/strings/decode_stream.c | |
index 2d97630..6f15092 100644 | |
--- a/src/strings/decode_stream.c | |
+++ b/src/strings/decode_stream.c | |
@@ -72,7 +72,7 @@ void MVM_string_decodestream_discard_to(MVMThreadContext *tc, MVMDecodeStream *d | |
} | |
/* Does a decode run, selected by encoding. */ | |
-static void run_decode(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMint32 *stopper_sep) { | |
+static void run_decode(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMString **stopper_sep) { | |
switch (ds->encoding) { | |
case MVM_encoding_type_utf8: | |
MVM_string_utf8_decodestream(tc, ds, stopper_chars, stopper_sep); | |
@@ -161,30 +161,53 @@ MVMString * MVM_string_decodestream_get_chars(MVMThreadContext *tc, MVMDecodeStr | |
/* Gets characters up until the specified string is encountered. If we do | |
* not encounter it, returns NULL. This may mean more input buffers are needed | |
* or that we reached the end of the stream. */ | |
-MVMint32 find_separator(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 sep) { | |
+MVMint32 find_separator(MVMThreadContext *tc, MVMDecodeStream *ds, MVMString **stopper_sep) { | |
MVMint32 sep_loc = 0; | |
MVMDecodeStreamChars *cur_chars = ds->chars_head; | |
+ | |
+ if (!stopper_sep) | |
+ return 0; | |
+ | |
while (cur_chars) { | |
MVMint32 start = cur_chars == ds->chars_head ? ds->chars_head_pos : 0; | |
MVMint32 i = 0; | |
for (i = start; i < cur_chars->length; i++) { | |
+ MVMString **seplist = stopper_sep; | |
+ while (*seplist) { | |
+ MVMString *sep = *seplist; | |
+ unsigned int ok = 1; | |
+ MVMint64 end = MVM_string_graphs(tc, sep); | |
+ MVMint64 pos; | |
+ MVMGraphemeIter gi; | |
+ | |
+ MVM_string_gi_init(tc, &gi, sep); | |
+ MVM_string_gi_move_to(tc, &gi, 0); | |
+ /* Return success when we find the first complete separator. */ | |
+ for (pos = 0; ok && pos < end; pos++) { | |
+ MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); | |
+ ok = g == cur_chars->chars[i + pos] ? 1 : 0; | |
+ } | |
+ | |
+ if (ok) | |
+ return sep_loc + pos; | |
+ | |
+ seplist++; | |
+ } | |
sep_loc++; | |
- if (cur_chars->chars[i] == sep) | |
- return sep_loc; | |
} | |
cur_chars = cur_chars->next; | |
} | |
return 0; | |
} | |
-MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 sep) { | |
+MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMString **stopper_sep) { | |
MVMint32 sep_loc; | |
/* Look for separator, trying more decoding if it fails. We get the place | |
* just beyond the separator, so can use take_chars to get what's need. */ | |
- sep_loc = find_separator(tc, ds, sep); | |
+ sep_loc = find_separator(tc, ds, stopper_sep); | |
if (!sep_loc) { | |
- run_decode(tc, ds, NULL, &sep); | |
- sep_loc = find_separator(tc, ds, sep); | |
+ run_decode(tc, ds, NULL, stopper_sep); | |
+ sep_loc = find_separator(tc, ds, stopper_sep); | |
} | |
if (sep_loc) | |
return take_chars(tc, ds, sep_loc); | |
diff --git a/src/strings/decode_stream.h b/src/strings/decode_stream.h | |
index 70960e3..918d26f 100644 | |
--- a/src/strings/decode_stream.h | |
+++ b/src/strings/decode_stream.h | |
@@ -19,6 +19,10 @@ struct MVMDecodeStream { | |
/* The encoding we're using. */ | |
MVMint32 encoding; | |
+ | |
+ MVMGrapheme32 **partial_sep; | |
+ | |
+ MVMGrapheme32 **sep_remainder_list; | |
}; | |
/* A single bunch of bytes added to a decode stream, with a link to the next | |
@@ -41,7 +45,7 @@ void MVM_string_decodestream_add_bytes(MVMThreadContext *tc, MVMDecodeStream *ds | |
void MVM_string_decodestream_add_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 *chars, MVMint32 length); | |
void MVM_string_decodestream_discard_to(MVMThreadContext *tc, MVMDecodeStream *ds, MVMDecodeStreamBytes *bytes, MVMint32 pos); | |
MVMString * MVM_string_decodestream_get_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 chars); | |
-MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 sep); | |
+MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMString **stopper_sep); | |
MVMString * MVM_string_decodestream_get_all(MVMThreadContext *tc, MVMDecodeStream *ds); | |
MVMint64 MVM_string_decodestream_have_bytes(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 bytes); | |
MVMint64 MVM_string_decodestream_bytes_to_buf(MVMThreadContext *tc, MVMDecodeStream *ds, char **buf, MVMint32 bytes); | |
diff --git a/src/strings/latin1.c b/src/strings/latin1.c | |
index 3b3a09e..d5a968f 100644 | |
--- a/src/strings/latin1.c | |
+++ b/src/strings/latin1.c | |
@@ -21,7 +21,7 @@ MVMString * MVM_string_latin1_decode(MVMThreadContext *tc, MVMObject *result_typ | |
/* Decodes using a decodestream. Decodes as far as it can with the input | |
* buffers, or until a stopper is reached. */ | |
void MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, | |
- MVMint32 *stopper_chars, MVMint32 *stopper_sep) { | |
+ MVMint32 *stopper_chars, MVMString **stopper_sep) { | |
MVMint32 count = 0, total = 0; | |
MVMint32 bufsize; | |
MVMGrapheme32 *buffer; | |
@@ -63,8 +63,8 @@ void MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, | |
total++; | |
if (stopper_chars && *stopper_chars == total) | |
goto done; | |
- if (stopper_sep && *stopper_sep == codepoint) | |
- goto done; | |
+ //~ if (stopper_sep && *stopper_sep == codepoint) // XXX | |
+ //~ goto done; | |
} | |
cur_bytes = cur_bytes->next; | |
} | |
diff --git a/src/strings/latin1.h b/src/strings/latin1.h | |
index 91aa4e0..069e97e 100644 | |
--- a/src/strings/latin1.h | |
+++ b/src/strings/latin1.h | |
@@ -1,3 +1,3 @@ | |
MVMString * MVM_string_latin1_decode(MVMThreadContext *tc, MVMObject *result_type, MVMuint8 *latin1, size_t bytes); | |
-MVM_PUBLIC void MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMint32 *stopper_sep); | |
+MVM_PUBLIC void MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMString **stopper_sep); | |
MVMuint8 * MVM_string_latin1_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length); | |
diff --git a/src/strings/ops.c b/src/strings/ops.c | |
index be0e701..9be3e96 100644 | |
--- a/src/strings/ops.c | |
+++ b/src/strings/ops.c | |
@@ -24,6 +24,7 @@ static void check_strand_sanity(MVMThreadContext *tc, MVMString *s) { | |
/* Allocates strand storage. */ | |
static MVMStringStrand * allocate_strands(MVMThreadContext *tc, MVMuint16 num_strands) { | |
+ //~ printf("%d * %d\n", (int)num_strands, (int)sizeof(MVMStringStrand)); | |
return MVM_malloc(num_strands * sizeof(MVMStringStrand)); | |
} | |
diff --git a/src/strings/utf8.c b/src/strings/utf8.c | |
index 7a0063d..fe9de78 100644 | |
--- a/src/strings/utf8.c | |
+++ b/src/strings/utf8.c | |
@@ -267,7 +267,7 @@ MVMString * MVM_string_utf8_decode(MVMThreadContext *tc, MVMObject *result_type, | |
/* Decodes using a decodestream. Decodes as far as it can with the input | |
* buffers, or until a stopper is reached. */ | |
void MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, | |
- MVMint32 *stopper_chars, MVMint32 *stopper_sep) { | |
+ MVMint32 *stopper_chars, MVMString **stopper_sep) { | |
MVMint32 count = 0, total = 0, stopped = 0; | |
MVMint32 state = 0; | |
MVMCodepoint codepoint = 0; | |
@@ -313,8 +313,8 @@ void MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, | |
total++; | |
if (stopper_chars && *stopper_chars == total) | |
goto done; | |
- if (stopper_sep && *stopper_sep == codepoint) | |
- goto done; | |
+ //~ if (stopper_sep && *stopper_sep == codepoint) // XXX | |
+ //~ goto done; | |
break; | |
case UTF8_REJECT: | |
MVM_exception_throw_adhoc(tc, "Malformed UTF-8"); | |
diff --git a/src/strings/utf8.h b/src/strings/utf8.h | |
index fce37c6..307176a 100644 | |
--- a/src/strings/utf8.h | |
+++ b/src/strings/utf8.h | |
@@ -1,5 +1,5 @@ | |
MVM_PUBLIC MVMString * MVM_string_utf8_decode(MVMThreadContext *tc, MVMObject *result_type, const MVMuint8 *utf8, size_t bytes); | |
-MVM_PUBLIC void MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMint32 *stopper_sep); | |
+MVM_PUBLIC void MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMString **stopper_sep); | |
MVM_PUBLIC MVMuint8 * MVM_string_utf8_encode_substr(MVMThreadContext *tc, | |
MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length); | |
MVM_PUBLIC MVMuint8 * MVM_string_utf8_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment