Skip to content

Instantly share code, notes, and snippets.

@FROGGS
Created October 15, 2014 10:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save FROGGS/923b97f208ddb4fef181 to your computer and use it in GitHub Desktop.
Save FROGGS/923b97f208ddb4fef181 to your computer and use it in GitHub Desktop.
diff --git a/src/io/syncfile.c b/src/io/syncfile.c
index d951a73..6342dae 100644
--- a/src/io/syncfile.c
+++ b/src/io/syncfile.c
@@ -35,8 +35,8 @@ typedef struct {
/* Decode stream, for turning bytes from disk into strings. */
MVMDecodeStream *ds;
- /* Current separator codepoint. */
- MVMGrapheme32 sep;
+ /* Current separator(s). */
+ MVMString **sep;
} MVMIOFileData;
/* Closes the file. */
@@ -90,8 +90,10 @@ static MVMint64 tell(MVMThreadContext *tc, MVMOSHandle *h) {
/* Set the line separator. */
static void set_separator(MVMThreadContext *tc, MVMOSHandle *h, MVMString *sep) {
MVMIOFileData *data = (MVMIOFileData *)h->body.data;
- data->sep = (MVMGrapheme32)MVM_string_get_grapheme_at(tc, sep,
- MVM_string_graphs(tc, sep) - 1);
+ data->sep = MVM_malloc(sizeof(MVMString *) * 3);
+ MVM_ASSIGN_REF(tc, &(h->common.header), data->sep[0], sep);
+ /* NOTE(review): three slots are allocated but only sep[0] and the NULL terminator in sep[1] are written; a previously set data->sep array is not freed here. */
+ data->sep[1] = NULL;
}
/* Read a bunch of bytes into the current decode stream. */
diff --git a/src/io/syncstream.c b/src/io/syncstream.c
index 3070105..9be4add 100644
--- a/src/io/syncstream.c
+++ b/src/io/syncstream.c
@@ -39,10 +39,11 @@ MVMint64 MVM_io_syncstream_tell(MVMThreadContext *tc, MVMOSHandle *h) {
/* Set the line separator. */
void MVM_io_syncstream_set_separator(MVMThreadContext *tc, MVMOSHandle *h, MVMString *sep) {
- /* For now, take last character. */
MVMIOSyncStreamData *data = (MVMIOSyncStreamData *)h->body.data;
- data->sep = (MVMGrapheme32)MVM_string_get_grapheme_at(tc, sep,
- MVM_string_graphs(tc, sep) - 1);
+ data->sep = MVM_malloc(sizeof(MVMString *) * 3);
+ MVM_ASSIGN_REF(tc, &(h->common.header), data->sep[0], sep);
+ /* NOTE(review): three slots are allocated but only sep[0] and the NULL terminator in sep[1] are written; a previously set data->sep array is not freed here. */
+ data->sep[1] = NULL;
}
/* Read a bunch of bytes into the current decode stream. Returns true if we
@@ -304,7 +305,7 @@ MVMObject * MVM_io_syncstream_from_uvstream(MVMThreadContext *tc, uv_stream_t *h
MVMIOSyncStreamData * const data = MVM_calloc(1, sizeof(MVMIOSyncStreamData));
data->handle = handle;
data->encoding = MVM_encoding_type_utf8;
- data->sep = '\n';
+ //~ data->sep = '\n';
result->body.ops = &op_table;
result->body.data = data;
return (MVMObject *)result;
diff --git a/src/io/syncstream.h b/src/io/syncstream.h
index 99d898c..bd02a87 100644
--- a/src/io/syncstream.h
+++ b/src/io/syncstream.h
@@ -18,8 +18,8 @@ struct MVMIOSyncStreamData {
/* Total bytes we've written. */
MVMint64 total_bytes_written;
- /* Current separator codepoint. */
- MVMGrapheme32 sep;
+ /* Current separator(s). */
+ MVMString **sep;
};
void MVM_io_syncstream_set_encoding(MVMThreadContext *tc, MVMOSHandle *h, MVMint64 encoding);
diff --git a/src/strings/ascii.c b/src/strings/ascii.c
index 5ef3a8f..9545f27 100644
--- a/src/strings/ascii.c
+++ b/src/strings/ascii.c
@@ -29,10 +29,30 @@ MVMString * MVM_string_ascii_decode_nt(MVMThreadContext *tc, MVMObject *result_t
return MVM_string_ascii_decode(tc, result_type, ascii, strlen(ascii));
}
+/* Intended to tell a decoder that a complete separator from seplist has just been seen, so it is
+ * safe to stop requesting more data from the stream. Currently a stub: it returns 0 on every path. */
+static MVMint64 MVM_string_decodestream_find_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 codepoint, MVMString **seplist) {
+ /* NOTE(review): tc, ds and codepoint are not read yet. The partial_sep and
+ * sep_remainder_list fields added to MVMDecodeStream are presumably meant to
+ * hold the partial-match state used here -- confirm. */
+
+ /* TODO: loop over every separator in the list; found_sep is not used yet. */
+ unsigned int found_sep = 0;
+
+ if (!seplist)
+ return 0;
+
+ /* TODO: return success when the first complete separator is found, while keeping
+ * track of trailing bytes of separators that match what we found so far. */
+ /* Not implemented yet, so no separator is ever reported to the caller. */
+
+ return 0;
+}
+
/* Decodes using a decodestream. Decodes as far as it can with the input
* buffers, or until a stopper is reached. */
void MVM_string_ascii_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
- MVMint32 *stopper_chars, MVMint32 *stopper_sep) {
+ MVMint32 *stopper_chars, MVMString **stopper_sep) {
MVMint32 count = 0, total = 0;
MVMint32 bufsize;
MVMGrapheme32 *buffer;
@@ -77,7 +97,11 @@ void MVM_string_ascii_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
total++;
if (stopper_chars && *stopper_chars == total)
goto done;
- if (stopper_sep && *stopper_sep == codepoint)
+ //~ if (stopper_sep) { // XXX if our stoppers are \r\n \n \r, and we hit \r, store \n as to-be-removed
+ //~ printf("%s:%d old=%d\n", __FILE__, __LINE__, *stopper_sep == codepoint ? 1 : 0);
+ //~ printf("%s:%d find=%d\n", __FILE__, __LINE__, MVM_string_decodestream_find_sep(tc, ds, codepoint, stopper_sep));
+ //~ }
+ if (stopper_sep && MVM_string_decodestream_find_sep(tc, ds, codepoint, stopper_sep))
goto done;
}
cur_bytes = cur_bytes->next;
diff --git a/src/strings/ascii.h b/src/strings/ascii.h
index 9d3bd81..2d49e6f 100644
--- a/src/strings/ascii.h
+++ b/src/strings/ascii.h
@@ -1,6 +1,6 @@
MVM_PUBLIC MVMString * MVM_string_ascii_decode(MVMThreadContext *tc, MVMObject *result_type, const MVMint8 *ascii, size_t bytes);
MVM_PUBLIC MVMString * MVM_string_ascii_decode_nt(MVMThreadContext *tc, MVMObject *result_type, const char *ascii);
-MVM_PUBLIC void MVM_string_ascii_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMint32 *stopper_sep);
+MVM_PUBLIC void MVM_string_ascii_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMString **stopper_sep);
MVM_PUBLIC MVMuint8 * MVM_string_ascii_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length);
MVM_PUBLIC MVMuint8 * MVM_string_ascii_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size);
MVMuint8 * MVM_string_ascii_encode_any(MVMThreadContext *tc, MVMString *str);
diff --git a/src/strings/decode_stream.c b/src/strings/decode_stream.c
index 2d97630..6f15092 100644
--- a/src/strings/decode_stream.c
+++ b/src/strings/decode_stream.c
@@ -72,7 +72,7 @@ void MVM_string_decodestream_discard_to(MVMThreadContext *tc, MVMDecodeStream *d
}
/* Does a decode run, selected by encoding. */
-static void run_decode(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMint32 *stopper_sep) {
+static void run_decode(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMString **stopper_sep) {
switch (ds->encoding) {
case MVM_encoding_type_utf8:
MVM_string_utf8_decodestream(tc, ds, stopper_chars, stopper_sep);
@@ -161,30 +161,53 @@ MVMString * MVM_string_decodestream_get_chars(MVMThreadContext *tc, MVMDecodeStr
/* Gets characters up until the specified string is encountered. If we do
* not encounter it, returns NULL. This may mean more input buffers are needed
* or that we reached the end of the stream. */
-MVMint32 find_separator(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 sep) {
+MVMint32 find_separator(MVMThreadContext *tc, MVMDecodeStream *ds, MVMString **stopper_sep) {
MVMint32 sep_loc = 0;
MVMDecodeStreamChars *cur_chars = ds->chars_head;
+ /* Returns the count of decoded chars from the read position through the end of the first separator matched, or 0 if none (or no separator list). */
+ if (!stopper_sep)
+ return 0;
+
while (cur_chars) {
MVMint32 start = cur_chars == ds->chars_head ? ds->chars_head_pos : 0;
MVMint32 i = 0;
for (i = start; i < cur_chars->length; i++) {
+ MVMString **seplist = stopper_sep;
+ while (*seplist) {
+ MVMString *sep = *seplist;
+ unsigned int ok = 1;
+ MVMint64 end = MVM_string_graphs(tc, sep);
+ MVMint64 pos;
+ MVMGraphemeIter gi;
+
+ MVM_string_gi_init(tc, &gi, sep);
+ MVM_string_gi_move_to(tc, &gi, 0);
+ /* Compare grapheme by grapheme; a separator that would run past the end of this chunk counts as a non-match, so chars[] is never read at or beyond length. */
+ for (pos = 0; ok && pos < end; pos++) {
+ MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
+ ok = i + pos < cur_chars->length && g == cur_chars->chars[i + pos];
+ }
+
+ if (ok)
+ return sep_loc + pos;
+
+ seplist++;
+ }
sep_loc++;
- if (cur_chars->chars[i] == sep)
- return sep_loc;
}
cur_chars = cur_chars->next;
}
return 0;
}
-MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 sep) {
+MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMString **stopper_sep) {
MVMint32 sep_loc;
/* Look for separator, trying more decoding if it fails. We get the place
* just beyond the separator, so can use take_chars to get what's need. */
- sep_loc = find_separator(tc, ds, sep);
+ sep_loc = find_separator(tc, ds, stopper_sep);
if (!sep_loc) {
- run_decode(tc, ds, NULL, &sep);
- sep_loc = find_separator(tc, ds, sep);
+ run_decode(tc, ds, NULL, stopper_sep);
+ sep_loc = find_separator(tc, ds, stopper_sep);
}
if (sep_loc)
return take_chars(tc, ds, sep_loc);
diff --git a/src/strings/decode_stream.h b/src/strings/decode_stream.h
index 70960e3..918d26f 100644
--- a/src/strings/decode_stream.h
+++ b/src/strings/decode_stream.h
@@ -19,6 +19,10 @@ struct MVMDecodeStream {
/* The encoding we're using. */
MVMint32 encoding;
+
+ MVMGrapheme32 **partial_sep;
+
+ MVMGrapheme32 **sep_remainder_list;
};
/* A single bunch of bytes added to a decode stream, with a link to the next
@@ -41,7 +45,7 @@ void MVM_string_decodestream_add_bytes(MVMThreadContext *tc, MVMDecodeStream *ds
void MVM_string_decodestream_add_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 *chars, MVMint32 length);
void MVM_string_decodestream_discard_to(MVMThreadContext *tc, MVMDecodeStream *ds, MVMDecodeStreamBytes *bytes, MVMint32 pos);
MVMString * MVM_string_decodestream_get_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 chars);
-MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 sep);
+MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMString **stopper_sep);
MVMString * MVM_string_decodestream_get_all(MVMThreadContext *tc, MVMDecodeStream *ds);
MVMint64 MVM_string_decodestream_have_bytes(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 bytes);
MVMint64 MVM_string_decodestream_bytes_to_buf(MVMThreadContext *tc, MVMDecodeStream *ds, char **buf, MVMint32 bytes);
diff --git a/src/strings/latin1.c b/src/strings/latin1.c
index 3b3a09e..d5a968f 100644
--- a/src/strings/latin1.c
+++ b/src/strings/latin1.c
@@ -21,7 +21,7 @@ MVMString * MVM_string_latin1_decode(MVMThreadContext *tc, MVMObject *result_typ
/* Decodes using a decodestream. Decodes as far as it can with the input
* buffers, or until a stopper is reached. */
void MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
- MVMint32 *stopper_chars, MVMint32 *stopper_sep) {
+ MVMint32 *stopper_chars, MVMString **stopper_sep) {
MVMint32 count = 0, total = 0;
MVMint32 bufsize;
MVMGrapheme32 *buffer;
@@ -63,8 +63,8 @@ void MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
total++;
if (stopper_chars && *stopper_chars == total)
goto done;
- if (stopper_sep && *stopper_sep == codepoint)
- goto done;
+ //~ if (stopper_sep && *stopper_sep == codepoint) // XXX
+ //~ goto done;
}
cur_bytes = cur_bytes->next;
}
diff --git a/src/strings/latin1.h b/src/strings/latin1.h
index 91aa4e0..069e97e 100644
--- a/src/strings/latin1.h
+++ b/src/strings/latin1.h
@@ -1,3 +1,3 @@
MVMString * MVM_string_latin1_decode(MVMThreadContext *tc, MVMObject *result_type, MVMuint8 *latin1, size_t bytes);
-MVM_PUBLIC void MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMint32 *stopper_sep);
+MVM_PUBLIC void MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMString **stopper_sep);
MVMuint8 * MVM_string_latin1_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length);
diff --git a/src/strings/ops.c b/src/strings/ops.c
index be0e701..9be3e96 100644
--- a/src/strings/ops.c
+++ b/src/strings/ops.c
@@ -24,6 +24,7 @@ static void check_strand_sanity(MVMThreadContext *tc, MVMString *s) {
/* Allocates strand storage. */
static MVMStringStrand * allocate_strands(MVMThreadContext *tc, MVMuint16 num_strands) {
+ //~ printf("%d * %d\n", (int)num_strands, (int)sizeof(MVMStringStrand));
return MVM_malloc(num_strands * sizeof(MVMStringStrand));
}
diff --git a/src/strings/utf8.c b/src/strings/utf8.c
index 7a0063d..fe9de78 100644
--- a/src/strings/utf8.c
+++ b/src/strings/utf8.c
@@ -267,7 +267,7 @@ MVMString * MVM_string_utf8_decode(MVMThreadContext *tc, MVMObject *result_type,
/* Decodes using a decodestream. Decodes as far as it can with the input
* buffers, or until a stopper is reached. */
void MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
- MVMint32 *stopper_chars, MVMint32 *stopper_sep) {
+ MVMint32 *stopper_chars, MVMString **stopper_sep) {
MVMint32 count = 0, total = 0, stopped = 0;
MVMint32 state = 0;
MVMCodepoint codepoint = 0;
@@ -313,8 +313,8 @@ void MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
total++;
if (stopper_chars && *stopper_chars == total)
goto done;
- if (stopper_sep && *stopper_sep == codepoint)
- goto done;
+ //~ if (stopper_sep && *stopper_sep == codepoint) // XXX
+ //~ goto done;
break;
case UTF8_REJECT:
MVM_exception_throw_adhoc(tc, "Malformed UTF-8");
diff --git a/src/strings/utf8.h b/src/strings/utf8.h
index fce37c6..307176a 100644
--- a/src/strings/utf8.h
+++ b/src/strings/utf8.h
@@ -1,5 +1,5 @@
MVM_PUBLIC MVMString * MVM_string_utf8_decode(MVMThreadContext *tc, MVMObject *result_type, const MVMuint8 *utf8, size_t bytes);
-MVM_PUBLIC void MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMint32 *stopper_sep);
+MVM_PUBLIC void MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 *stopper_chars, MVMString **stopper_sep);
MVM_PUBLIC MVMuint8 * MVM_string_utf8_encode_substr(MVMThreadContext *tc,
MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length);
MVM_PUBLIC MVMuint8 * MVM_string_utf8_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment