Last active
November 15, 2022 16:01
-
-
Save Guiorgy/649425c266dc5c6be1144c9bfb71f0dc to your computer and use it in GitHub Desktop.
Search occurrences of specified tokens (character sequences surrounded by specified prefix and postfix delimiters) in a string and replace them with other specified strings faster and with less memory usage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public static class StringExtensionMethods | |
{ | |
/** | |
* Returns a new string in which all occurrences of specified tokens (character sequences surrounded | |
* by the specified prefix and postfix delimiters) in the current string are replaced with other specified strings. | |
* | |
* All tokens must start and end with the same prefix and postfix delimiters. | |
* `toReplace` strings in the batch should not have those delimiters explicitly included. | |
*/ | |
public static string ReplaceBatch(this string origin, char prefix, char postfix, params (string toReplace, string replaceWith)[] batch) | |
{ | |
if (batch.Length == 0) return origin; | |
else if (batch.Length == 1) return origin.Replace(prefix + batch[0].toReplace + postfix, batch[0].replaceWith); | |
ReadOnlySpan<char> tmp = origin; | |
var occurrences = new Queue<(int at, int task)>(); | |
int offset = 0; | |
int resultLength = tmp.Length; | |
int prefixIndex; | |
while ((prefixIndex = tmp.IndexOf(prefix)) != -1) | |
{ | |
(int at, int task) next = (prefixIndex, -1); | |
for (int i = 0; i < batch.Length; i++) | |
{ | |
int postfixIndex = prefixIndex + batch[i].toReplace.Length + 1; | |
if (tmp.Length > postfixIndex | |
&& tmp[postfixIndex] == postfix | |
&& tmp.Slice(prefixIndex + 1, batch[i].toReplace.Length).SequenceEqual(batch[i].toReplace)) | |
{ | |
next.task = i; | |
break; | |
} | |
} | |
if (next.task == -1) | |
{ | |
tmp = tmp.Slice(prefixIndex + 1); | |
offset += prefixIndex + 1; | |
continue; | |
} | |
occurrences.Enqueue((next.at + offset, next.task)); | |
resultLength += batch[next.task].replaceWith.Length - batch[next.task].toReplace.Length - 2; | |
int newStart = next.at + batch[next.task].toReplace.Length + 2; | |
tmp = tmp.Slice(newStart); | |
offset += newStart; | |
} | |
if (occurrences.Count == 0) return origin; | |
return string.Create(resultLength, (batch, occurrences), (chars, state) => | |
{ | |
var replaceTasks = state.batch; | |
var occurrences = state.occurrences; | |
var position = 0; | |
ReadOnlySpan<char> o = origin; | |
int lastStart = 0; | |
while (occurrences.Count != 0) | |
{ | |
(int at, int task) next = occurrences.Dequeue(); | |
o.Slice(lastStart, next.at - lastStart).CopyTo(chars.Slice(position)); | |
replaceTasks[next.task].replaceWith.CopyTo(chars.Slice(position + next.at - lastStart)); | |
position += next.at - lastStart + replaceTasks[next.task].replaceWith.Length; | |
lastStart = next.at + replaceTasks[next.task].toReplace.Length + 2; | |
} | |
o.Slice(lastStart).CopyTo(chars.Slice(position)); | |
}); | |
} | |
/** | |
* Returns a new string in which all occurrences of specified tokens (character sequences prefixed | |
* by the specified prefix delimiter) in the current string are replaced with other specified strings. | |
* | |
* All tokens must start with the same prefix delimiter. | |
* `toReplace` strings in the batch should not have that delimiter explicitly included. | |
*/ | |
public static string ReplaceBatch(this string origin, char prefix, params (string toReplace, string replaceWith)[] batch) | |
{ | |
if (batch.Length == 0) return origin; | |
else if (batch.Length == 1) return origin.Replace(prefix + batch[0].toReplace, batch[0].replaceWith); | |
ReadOnlySpan<char> tmp = origin; | |
var occurrences = new Queue<(int at, int task)>(); | |
int offset = 0; | |
int resultLength = tmp.Length; | |
int prefixIndex; | |
while ((prefixIndex = tmp.IndexOf(prefix)) != -1) | |
{ | |
(int at, int task) next = (prefixIndex, -1); | |
for (int i = 0; i < batch.Length; i++) | |
{ | |
if (tmp.Length > prefixIndex + batch[i].toReplace.Length | |
&& tmp.Slice(prefixIndex + 1, batch[i].toReplace.Length).SequenceEqual(batch[i].toReplace)) | |
{ | |
next.task = i; | |
break; | |
} | |
} | |
if (next.task == -1) | |
{ | |
tmp = tmp.Slice(prefixIndex + 1); | |
offset += prefixIndex + 1; | |
continue; | |
} | |
occurrences.Enqueue((next.at + offset, next.task)); | |
resultLength += batch[next.task].replaceWith.Length - batch[next.task].toReplace.Length - 1; | |
int newStart = next.at + batch[next.task].toReplace.Length + 1; | |
tmp = tmp.Slice(newStart); | |
offset += newStart; | |
} | |
if (occurrences.Count == 0) return origin; | |
return string.Create(resultLength, (batch, occurrences), (chars, state) => | |
{ | |
var replaceTasks = state.batch; | |
var occurrences = state.occurrences; | |
var position = 0; | |
ReadOnlySpan<char> o = origin; | |
int lastStart = 0; | |
while (occurrences.Count != 0) | |
{ | |
(int at, int task) next = occurrences.Dequeue(); | |
o.Slice(lastStart, next.at - lastStart).CopyTo(chars.Slice(position)); | |
replaceTasks[next.task].replaceWith.CopyTo(chars.Slice(position + next.at - lastStart)); | |
position += next.at - lastStart + replaceTasks[next.task].replaceWith.Length; | |
lastStart = next.at + replaceTasks[next.task].toReplace.Length + 1; | |
} | |
o.Slice(lastStart).CopyTo(chars.Slice(position)); | |
}); | |
} | |
/** | |
* Returns a new string in which all occurrences of specified tokens (character sequences surrounded | |
* by the specified prefix and postfix delimiters) in the current string are replaced with other specified strings. | |
* | |
* All tokens must start and end with the same prefix and postfix delimiters. | |
* `toReplace` strings in the batch should not have those delimiters explicitly included. | |
*/ | |
public static string ReplaceBatch(this string origin, char prefix, char postfix, int preallocate, params (string toReplace, string replaceWith)[] batch) | |
{ | |
if (batch.Length == 0) return origin; | |
else if (batch.Length == 1) return origin.Replace(prefix + batch[0].toReplace + postfix, batch[0].replaceWith); | |
if (preallocate < 0) preallocate = origin.Length * 2; | |
int resultLength = -1; | |
string result = string.Create(preallocate, (origin, prefix, postfix, batch), (chars, state) => | |
{ | |
var replaceTasks = state.batch; | |
char prefix = state.prefix; | |
char postfix = state.postfix; | |
var position = 0; | |
ReadOnlySpan<char> tmp = state.origin; | |
int prefixIndex; | |
while ((prefixIndex = tmp.IndexOf(prefix)) != -1) | |
{ | |
bool replaced = false; | |
for (int i = 0; i < replaceTasks.Length; i++) | |
{ | |
int postfixIndex = prefixIndex + replaceTasks[i].toReplace.Length + 1; | |
if (tmp.Length > postfixIndex | |
&& tmp[postfixIndex] == postfix | |
&& tmp.Slice(prefixIndex + 1, replaceTasks[i].toReplace.Length).SequenceEqual(replaceTasks[i].toReplace)) | |
{ | |
if (position + prefixIndex + replaceTasks[i].replaceWith.Length > chars.Length) return; | |
tmp.Slice(0, prefixIndex).CopyTo(chars.Slice(position)); | |
replaceTasks[i].replaceWith.CopyTo(chars.Slice(position + prefixIndex)); | |
position += prefixIndex + replaceTasks[i].replaceWith.Length; | |
tmp = tmp.Slice(postfixIndex + 1); | |
replaced = true; | |
break; | |
} | |
} | |
if (!replaced) | |
{ | |
tmp.Slice(0, prefixIndex + 1).CopyTo(chars.Slice(position)); | |
position += prefixIndex + 1; | |
tmp = tmp.Slice(prefixIndex + 1); | |
} | |
} | |
if (position + tmp.Length <= chars.Length) | |
{ | |
tmp.CopyTo(chars.Slice(position)); | |
resultLength = position + tmp.Length; | |
} | |
}); | |
if (resultLength != -1) | |
return result.Length == resultLength ? result : result.Substring(0, resultLength); | |
return ReplaceBatch(origin, prefix, postfix, batch); | |
} | |
/** | |
* Returns a new string in which all occurrences of specified tokens (character sequences prefixed | |
* by the specified prefix delimiter) in the current string are replaced with other specified strings. | |
* | |
* All tokens must start with the same prefix delimiter. | |
* `toReplace` strings in the batch should not have that delimiter explicitly included. | |
*/ | |
public static string ReplaceBatch(this string origin, char prefix, int preallocate, params (string toReplace, string replaceWith)[] batch) | |
{ | |
if (batch.Length == 0) return origin; | |
else if (batch.Length == 1) return origin.Replace(prefix + batch[0].toReplace, batch[0].replaceWith); | |
if (preallocate < 0) preallocate = origin.Length * 2; | |
int resultLength = -1; | |
string result = string.Create(preallocate, (origin, prefix, batch), (chars, state) => | |
{ | |
var replaceTasks = state.batch; | |
char prefix = state.prefix; | |
var position = 0; | |
ReadOnlySpan<char> tmp = state.origin; | |
int prefixIndex; | |
while ((prefixIndex = tmp.IndexOf(prefix)) != -1) | |
{ | |
bool replaced = false; | |
for (int i = 0; i < replaceTasks.Length; i++) | |
{ | |
if (tmp.Length > prefixIndex + replaceTasks[i].toReplace.Length | |
&& tmp.Slice(prefixIndex + 1, replaceTasks[i].toReplace.Length).SequenceEqual(replaceTasks[i].toReplace)) | |
{ | |
if (position + prefixIndex + replaceTasks[i].replaceWith.Length > chars.Length) return; | |
tmp.Slice(0, prefixIndex).CopyTo(chars.Slice(position)); | |
replaceTasks[i].replaceWith.CopyTo(chars.Slice(position + prefixIndex)); | |
position += prefixIndex + replaceTasks[i].replaceWith.Length; | |
tmp = tmp.Slice(prefixIndex + replaceTasks[i].toReplace.Length + 1); | |
replaced = true; | |
break; | |
} | |
} | |
if (!replaced) | |
{ | |
tmp.Slice(0, prefixIndex + 1).CopyTo(chars.Slice(position)); | |
position += prefixIndex + 1; | |
tmp = tmp.Slice(prefixIndex + 1); | |
} | |
} | |
if (position + tmp.Length <= chars.Length) | |
{ | |
tmp.CopyTo(chars.Slice(position)); | |
resultLength = position + tmp.Length; | |
} | |
}); | |
if (resultLength != -1) | |
return result.Length == resultLength ? result : result.Substring(0, resultLength); | |
return ReplaceBatch(origin, prefix, batch); | |
} | |
/** | |
* Returns a new string in which all occurrences of specified strings in the current | |
* string are replaced with other specified strings. | |
*/ | |
public static string ReplaceBatch(this string origin, params (string toReplace, string replaceWith)[] batch) | |
{ | |
if (batch.Length == 0) return origin; | |
else if (batch.Length == 1) return origin.Replace(batch[0].toReplace, batch[0].replaceWith); | |
char? prefix = batch[0].toReplace[0]; | |
for (int i = 1; i < batch.Length; i++) | |
{ | |
if (batch[i].toReplace[0] != prefix) | |
{ | |
prefix = null; | |
break; | |
} | |
} | |
if (prefix != null) | |
{ | |
var noPrefixBatch = new (string toReplace, string replaceWith)[batch.Length]; | |
for (int i = 0; i < batch.Length; i++) noPrefixBatch[i] = (batch[i].toReplace[1..], batch[i].replaceWith); | |
return ReplaceBatch(origin, (char)prefix, noPrefixBatch); | |
} | |
ReadOnlySpan<char> tmp = origin; | |
var occurrences = new Queue<(int at, int task)>(); | |
int offset = 0; | |
int resultLength = tmp.Length; | |
int taskCount = batch.Length; | |
Span<(int at, int task)> nexts = stackalloc (int at, int task)[taskCount]; | |
(int at, int task) next = (tmp.Length, -1); | |
int tc = taskCount; | |
for (int i = 0, j = 0; j < tc; j++) | |
{ | |
nexts[i] = (tmp.IndexOf(batch[j].toReplace), j); | |
if (nexts[i].at == -1) | |
{ | |
taskCount--; | |
continue; | |
} | |
if (nexts[i].at < next.at) next = nexts[i]; | |
i++; | |
} | |
if (next.task == -1) return origin; | |
while (next.task != -1) | |
{ | |
occurrences.Enqueue((next.at + offset, next.task)); | |
resultLength += batch[next.task].replaceWith.Length - batch[next.task].toReplace.Length; | |
int newStart = next.at + batch[next.task].toReplace.Length; | |
tmp = tmp.Slice(newStart); | |
offset += newStart; | |
next = (tmp.Length, -1); | |
tc = taskCount; | |
for (int i = 0, j = 0; j < tc; j++) | |
{ | |
nexts[j].at -= newStart; | |
if (nexts[j].at < 0) | |
nexts[j].at = tmp.IndexOf(batch[nexts[j].task].toReplace); | |
if (nexts[j].at == -1) | |
{ | |
taskCount--; | |
continue; | |
} | |
if (nexts[j].at < next.at) next = nexts[j]; | |
if (i != j) nexts[i] = nexts[j]; | |
i++; | |
} | |
} | |
return string.Create(resultLength, (origin, batch, occurrences), (chars, state) => | |
{ | |
var replaceTasks = state.batch; | |
var occurrences = state.occurrences; | |
var position = 0; | |
ReadOnlySpan<char> origin = state.origin; | |
int lastStart = 0; | |
while (occurrences.Count != 0) | |
{ | |
(int at, int task) next = occurrences.Dequeue(); | |
origin.Slice(lastStart, next.at - lastStart).CopyTo(chars.Slice(position)); | |
replaceTasks[next.task].replaceWith.CopyTo(chars.Slice(position + next.at - lastStart)); | |
position += next.at - lastStart + replaceTasks[next.task].replaceWith.Length; | |
lastStart = next.at + replaceTasks[next.task].toReplace.Length; | |
} | |
origin.Slice(lastStart).CopyTo(chars.Slice(position)); | |
}); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example benchmarks:
Where
Normal
is defined as:And the test data is:
Note: In the preallocation variant, at the end we need to trim excess null (
\0
) characters. At the moment this is done using thesubstring
method, however this is an extra string allocation that we would ideally want to avoid. My attempts at doing so in-place:Both failed to run with Benchmarks.Net with the GC error:
If anybody has a better idea, do let be know.
Edit:
Added a more general variant that doesn't require delimiters (prefix and postfix). The performance seems to be about the same as the Normal method at best and sometimes quite a bit slower, but it uses far less memory, which might be useful to some. Might be able to improve performance with a proper multi pattern search algorythm though.
Edit2:
Modified the generic method to detect when all patterns start with the same prefix and use the faster method instead.