Skip to content

Instantly share code, notes, and snippets.

@Guiorgy
Last active November 15, 2022 16:01
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Guiorgy/649425c266dc5c6be1144c9bfb71f0dc to your computer and use it in GitHub Desktop.
Save Guiorgy/649425c266dc5c6be1144c9bfb71f0dc to your computer and use it in GitHub Desktop.
Search occurrences of specified tokens (character sequences surrounded by specified prefix and postfix delimiters) in a string and replace them with other specified strings faster and with less memory usage
public static class StringExtensionMethods
{
/**
* Returns a new string in which all occurrences of specified tokens (character sequences surrounded
* by the specified prefix and postfix delimiters) in the current string are replaced with other specified strings.
*
* All tokens must start and end with the same prefix and postfix delimiters.
* `toReplace` strings in the batch should not have those delimiters explicitly included.
*/
public static string ReplaceBatch(this string origin, char prefix, char postfix, params (string toReplace, string replaceWith)[] batch)
{
if (batch.Length == 0) return origin;
else if (batch.Length == 1) return origin.Replace(prefix + batch[0].toReplace + postfix, batch[0].replaceWith);
ReadOnlySpan<char> tmp = origin;
var occurrences = new Queue<(int at, int task)>();
int offset = 0;
int resultLength = tmp.Length;
int prefixIndex;
while ((prefixIndex = tmp.IndexOf(prefix)) != -1)
{
(int at, int task) next = (prefixIndex, -1);
for (int i = 0; i < batch.Length; i++)
{
int postfixIndex = prefixIndex + batch[i].toReplace.Length + 1;
if (tmp.Length > postfixIndex
&& tmp[postfixIndex] == postfix
&& tmp.Slice(prefixIndex + 1, batch[i].toReplace.Length).SequenceEqual(batch[i].toReplace))
{
next.task = i;
break;
}
}
if (next.task == -1)
{
tmp = tmp.Slice(prefixIndex + 1);
offset += prefixIndex + 1;
continue;
}
occurrences.Enqueue((next.at + offset, next.task));
resultLength += batch[next.task].replaceWith.Length - batch[next.task].toReplace.Length - 2;
int newStart = next.at + batch[next.task].toReplace.Length + 2;
tmp = tmp.Slice(newStart);
offset += newStart;
}
if (occurrences.Count == 0) return origin;
return string.Create(resultLength, (batch, occurrences), (chars, state) =>
{
var replaceTasks = state.batch;
var occurrences = state.occurrences;
var position = 0;
ReadOnlySpan<char> o = origin;
int lastStart = 0;
while (occurrences.Count != 0)
{
(int at, int task) next = occurrences.Dequeue();
o.Slice(lastStart, next.at - lastStart).CopyTo(chars.Slice(position));
replaceTasks[next.task].replaceWith.CopyTo(chars.Slice(position + next.at - lastStart));
position += next.at - lastStart + replaceTasks[next.task].replaceWith.Length;
lastStart = next.at + replaceTasks[next.task].toReplace.Length + 2;
}
o.Slice(lastStart).CopyTo(chars.Slice(position));
});
}
/**
* Returns a new string in which all occurrences of specified tokens (character sequences prefixed
* by the specified prefix delimiter) in the current string are replaced with other specified strings.
*
* All tokens must start with the same prefix delimiter.
* `toReplace` strings in the batch should not have that delimiter explicitly included.
*/
public static string ReplaceBatch(this string origin, char prefix, params (string toReplace, string replaceWith)[] batch)
{
if (batch.Length == 0) return origin;
else if (batch.Length == 1) return origin.Replace(prefix + batch[0].toReplace, batch[0].replaceWith);
ReadOnlySpan<char> tmp = origin;
var occurrences = new Queue<(int at, int task)>();
int offset = 0;
int resultLength = tmp.Length;
int prefixIndex;
while ((prefixIndex = tmp.IndexOf(prefix)) != -1)
{
(int at, int task) next = (prefixIndex, -1);
for (int i = 0; i < batch.Length; i++)
{
if (tmp.Length > prefixIndex + batch[i].toReplace.Length
&& tmp.Slice(prefixIndex + 1, batch[i].toReplace.Length).SequenceEqual(batch[i].toReplace))
{
next.task = i;
break;
}
}
if (next.task == -1)
{
tmp = tmp.Slice(prefixIndex + 1);
offset += prefixIndex + 1;
continue;
}
occurrences.Enqueue((next.at + offset, next.task));
resultLength += batch[next.task].replaceWith.Length - batch[next.task].toReplace.Length - 1;
int newStart = next.at + batch[next.task].toReplace.Length + 1;
tmp = tmp.Slice(newStart);
offset += newStart;
}
if (occurrences.Count == 0) return origin;
return string.Create(resultLength, (batch, occurrences), (chars, state) =>
{
var replaceTasks = state.batch;
var occurrences = state.occurrences;
var position = 0;
ReadOnlySpan<char> o = origin;
int lastStart = 0;
while (occurrences.Count != 0)
{
(int at, int task) next = occurrences.Dequeue();
o.Slice(lastStart, next.at - lastStart).CopyTo(chars.Slice(position));
replaceTasks[next.task].replaceWith.CopyTo(chars.Slice(position + next.at - lastStart));
position += next.at - lastStart + replaceTasks[next.task].replaceWith.Length;
lastStart = next.at + replaceTasks[next.task].toReplace.Length + 1;
}
o.Slice(lastStart).CopyTo(chars.Slice(position));
});
}
/**
* Returns a new string in which all occurrences of specified tokens (character sequences surrounded
* by the specified prefix and postfix delimiters) in the current string are replaced with other specified strings.
*
* All tokens must start and end with the same prefix and postfix delimiters.
* `toReplace` strings in the batch should not have those delimiters explicitly included.
*/
public static string ReplaceBatch(this string origin, char prefix, char postfix, int preallocate, params (string toReplace, string replaceWith)[] batch)
{
if (batch.Length == 0) return origin;
else if (batch.Length == 1) return origin.Replace(prefix + batch[0].toReplace + postfix, batch[0].replaceWith);
if (preallocate < 0) preallocate = origin.Length * 2;
int resultLength = -1;
string result = string.Create(preallocate, (origin, prefix, postfix, batch), (chars, state) =>
{
var replaceTasks = state.batch;
char prefix = state.prefix;
char postfix = state.postfix;
var position = 0;
ReadOnlySpan<char> tmp = state.origin;
int prefixIndex;
while ((prefixIndex = tmp.IndexOf(prefix)) != -1)
{
bool replaced = false;
for (int i = 0; i < replaceTasks.Length; i++)
{
int postfixIndex = prefixIndex + replaceTasks[i].toReplace.Length + 1;
if (tmp.Length > postfixIndex
&& tmp[postfixIndex] == postfix
&& tmp.Slice(prefixIndex + 1, replaceTasks[i].toReplace.Length).SequenceEqual(replaceTasks[i].toReplace))
{
if (position + prefixIndex + replaceTasks[i].replaceWith.Length > chars.Length) return;
tmp.Slice(0, prefixIndex).CopyTo(chars.Slice(position));
replaceTasks[i].replaceWith.CopyTo(chars.Slice(position + prefixIndex));
position += prefixIndex + replaceTasks[i].replaceWith.Length;
tmp = tmp.Slice(postfixIndex + 1);
replaced = true;
break;
}
}
if (!replaced)
{
tmp.Slice(0, prefixIndex + 1).CopyTo(chars.Slice(position));
position += prefixIndex + 1;
tmp = tmp.Slice(prefixIndex + 1);
}
}
if (position + tmp.Length <= chars.Length)
{
tmp.CopyTo(chars.Slice(position));
resultLength = position + tmp.Length;
}
});
if (resultLength != -1)
return result.Length == resultLength ? result : result.Substring(0, resultLength);
return ReplaceBatch(origin, prefix, postfix, batch);
}
/**
* Returns a new string in which all occurrences of specified tokens (character sequences prefixed
* by the specified prefix delimiter) in the current string are replaced with other specified strings.
*
* All tokens must start with the same prefix delimiter.
* `toReplace` strings in the batch should not have that delimiter explicitly included.
*/
public static string ReplaceBatch(this string origin, char prefix, int preallocate, params (string toReplace, string replaceWith)[] batch)
{
if (batch.Length == 0) return origin;
else if (batch.Length == 1) return origin.Replace(prefix + batch[0].toReplace, batch[0].replaceWith);
if (preallocate < 0) preallocate = origin.Length * 2;
int resultLength = -1;
string result = string.Create(preallocate, (origin, prefix, batch), (chars, state) =>
{
var replaceTasks = state.batch;
char prefix = state.prefix;
var position = 0;
ReadOnlySpan<char> tmp = state.origin;
int prefixIndex;
while ((prefixIndex = tmp.IndexOf(prefix)) != -1)
{
bool replaced = false;
for (int i = 0; i < replaceTasks.Length; i++)
{
if (tmp.Length > prefixIndex + replaceTasks[i].toReplace.Length
&& tmp.Slice(prefixIndex + 1, replaceTasks[i].toReplace.Length).SequenceEqual(replaceTasks[i].toReplace))
{
if (position + prefixIndex + replaceTasks[i].replaceWith.Length > chars.Length) return;
tmp.Slice(0, prefixIndex).CopyTo(chars.Slice(position));
replaceTasks[i].replaceWith.CopyTo(chars.Slice(position + prefixIndex));
position += prefixIndex + replaceTasks[i].replaceWith.Length;
tmp = tmp.Slice(prefixIndex + replaceTasks[i].toReplace.Length + 1);
replaced = true;
break;
}
}
if (!replaced)
{
tmp.Slice(0, prefixIndex + 1).CopyTo(chars.Slice(position));
position += prefixIndex + 1;
tmp = tmp.Slice(prefixIndex + 1);
}
}
if (position + tmp.Length <= chars.Length)
{
tmp.CopyTo(chars.Slice(position));
resultLength = position + tmp.Length;
}
});
if (resultLength != -1)
return result.Length == resultLength ? result : result.Substring(0, resultLength);
return ReplaceBatch(origin, prefix, batch);
}
/**
* Returns a new string in which all occurrences of specified strings in the current
* string are replaced with other specified strings.
*/
public static string ReplaceBatch(this string origin, params (string toReplace, string replaceWith)[] batch)
{
if (batch.Length == 0) return origin;
else if (batch.Length == 1) return origin.Replace(batch[0].toReplace, batch[0].replaceWith);
char? prefix = batch[0].toReplace[0];
for (int i = 1; i < batch.Length; i++)
{
if (batch[i].toReplace[0] != prefix)
{
prefix = null;
break;
}
}
if (prefix != null)
{
var noPrefixBatch = new (string toReplace, string replaceWith)[batch.Length];
for (int i = 0; i < batch.Length; i++) noPrefixBatch[i] = (batch[i].toReplace[1..], batch[i].replaceWith);
return ReplaceBatch(origin, (char)prefix, noPrefixBatch);
}
ReadOnlySpan<char> tmp = origin;
var occurrences = new Queue<(int at, int task)>();
int offset = 0;
int resultLength = tmp.Length;
int taskCount = batch.Length;
Span<(int at, int task)> nexts = stackalloc (int at, int task)[taskCount];
(int at, int task) next = (tmp.Length, -1);
int tc = taskCount;
for (int i = 0, j = 0; j < tc; j++)
{
nexts[i] = (tmp.IndexOf(batch[j].toReplace), j);
if (nexts[i].at == -1)
{
taskCount--;
continue;
}
if (nexts[i].at < next.at) next = nexts[i];
i++;
}
if (next.task == -1) return origin;
while (next.task != -1)
{
occurrences.Enqueue((next.at + offset, next.task));
resultLength += batch[next.task].replaceWith.Length - batch[next.task].toReplace.Length;
int newStart = next.at + batch[next.task].toReplace.Length;
tmp = tmp.Slice(newStart);
offset += newStart;
next = (tmp.Length, -1);
tc = taskCount;
for (int i = 0, j = 0; j < tc; j++)
{
nexts[j].at -= newStart;
if (nexts[j].at < 0)
nexts[j].at = tmp.IndexOf(batch[nexts[j].task].toReplace);
if (nexts[j].at == -1)
{
taskCount--;
continue;
}
if (nexts[j].at < next.at) next = nexts[j];
if (i != j) nexts[i] = nexts[j];
i++;
}
}
return string.Create(resultLength, (origin, batch, occurrences), (chars, state) =>
{
var replaceTasks = state.batch;
var occurrences = state.occurrences;
var position = 0;
ReadOnlySpan<char> origin = state.origin;
int lastStart = 0;
while (occurrences.Count != 0)
{
(int at, int task) next = occurrences.Dequeue();
origin.Slice(lastStart, next.at - lastStart).CopyTo(chars.Slice(position));
replaceTasks[next.task].replaceWith.CopyTo(chars.Slice(position + next.at - lastStart));
position += next.at - lastStart + replaceTasks[next.task].replaceWith.Length;
lastStart = next.at + replaceTasks[next.task].toReplace.Length;
}
origin.Slice(lastStart).CopyTo(chars.Slice(position));
});
}
}
@Guiorgy
Copy link
Author

Guiorgy commented Nov 11, 2022

Example benchmarks:

Method Mean Ratio Allocated Alloc Ratio
Normal 973.4 ns 1.00 8.02 KB 1.00
ReplaceBatch 320.4 ns 0.33 1.49 KB 0.19
ReplaceBatch (preallocate 2x origin length) 259.7 ns 0.27 2.49 KB 0.31

Where Normal is defined as:

public string Normal(string origin, params (string toReplace, string replaceWith)[] batch)
{
    string result = origin;

    for (int i = 0; i < batch.Length; i++)
    {
        (string toReplace, string replaceWith) = batch[i];
        result = result.Replace(toReplace, replaceWith);
    }

    return result;
}

And the test data is:

var origin = "https://www.example.com/xxxxx?campaign={camp}&adgroup={publisher_id}&install_callback=https%3A%2F%2Fpostback.example.com%3Ftransaction%3D{transaction}&session_callback=https%3A%2F%2Fpostback.example.com%3Ftransaction%3D{aff_sub1}&affsite={aff_site}&clickid={transaction}&adset_id={creative_id}&user_agent={ua}&ip={ip}&language={lang}";

var prefix = '{';
var postfix = '}';

var batch = new (string, string)[]
{
    ("camp", "campiagn_it_banner_size_360"),
    ("publisher_id", "78983"),
    ("missing_token_1", "goV6wIODqztIaGP90JRq"),
    ("transaction", "c1032072-f815-413b-a57c-4a027f681e60"),
    ("aff_sub1", "78bea32a-6ead-4ea0-b9f2-9489ebc43d6a"),
    ("aff_site", "vbvsdgdavhdgdvjs_46_789-p90"),
    ("missing_token_2", "6rfsDEknBDBO4ZDAsgfv"),
    ("creative_id", "360x360"),
    ("ua", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"),
    ("ip", "192.168.1.1"),
    ("lang", "en"),
    ("missing_token_3", "rMuWZSs0PxXxWGtHO3GF")
};

Note: In the preallocation variant, at the end we need to trim excess null (\0) characters. At the moment this is done using the substring method, however this is an extra string allocation that we would ideally want to avoid. My attempts at doing so in-place:

// unsafe
unsafe
{
    fixed (char* p = result)
    {
        int* pi = (int*)p;
        pi[-1] = resultLength;
        // p[value] = '\0'; // should already be zero terminated anyways
    }
}
// ------------ or ------------
// reflection
private static FieldInfo _stringLenField = typeof(string).GetField("_stringLength", BindingFlags.NonPublic | BindingFlags.Instance);

StringExtensionMethods._stringLenField.SetValue(result, resultLength);

Both failed to run with Benchmarks.Net with the GC error:

Fatal error. Internal CLR error. (0x80131506)
   at System.GC.Collect()
   etc...

If anybody has a better idea, do let be know.

Edit:

Added a more general variant that doesn't require delimiters (prefix and postfix). The performance seems to be about the same as the Normal method at best and sometimes quite a bit slower, but it uses far less memory, which might be useful to some. Might be able to improve performance with a proper multi pattern search algorythm though.

Method BatchSize HitRatio Mean Ratio Allocated Alloc Ratio
Normal 5 0 327.00 ns 1.00 - NA
ReplaceBatch (Delimiters) 5 0 97.27 ns 0.30 64 B NA
ReplaceBatch 5 0 349.32 ns 1.07 40 B NA
Normal 5 0.5 609.64 ns 1.00 3296 B 1.00
ReplaceBatch (Delimiters) 5 0.5 212.10 ns 0.35 1280 B 0.39
ReplaceBatch 5 0.5 605.34 ns 0.99 1192 B 0.36
Normal 5 1 853.81 ns 1.00 6840 B 1.00
ReplaceBatch (Delimiters) 5 1 272.13 ns 0.32 1624 B 0.24
ReplaceBatch 5 1 742.01 ns 0.87 1528 B 0.22
Normal 100 0 6,547.74 ns 1.00 - NA
ReplaceBatch (Delimiters) 100 0 1,043.99 ns 0.16 64 B NA
ReplaceBatch 100 0 6,538.11 ns 1.00 40 B NA
Normal 100 0.5 39,037.89 ns 1.00 369776 B 1.00
ReplaceBatch (Delimiters) 100 0.5 4,855.42 ns 0.12 8536 B 0.02
ReplaceBatch 100 0.5 52,686.76 ns 1.35 8512 B 0.02
Normal 100 1 106,602.92 ns 1.00 1420912 B 1.00
ReplaceBatch (Delimiters) 100 1 14,026.49 ns 0.13 16264 B 0.01
ReplaceBatch 100 1 114,512.38 ns 1.08 16184 B 0.01

Edit2:

Modified the generic method to detect when all patterns start with the same prefix and use the faster method instead.

Method BatchSize Existing Mean Ratio Allocated Alloc Ratio
Normal 5 0.5 716.2 ns 1.00 3.2 KB 1.00
New 5 0.5 441.9 ns 0.62 1.58 KB 0.49
Old 5 0.5 676.8 ns 0.95 1.16 KB 0.36
Normal 5 1 958.8 ns 1.00 6.61 KB 1.00
New 5 1 533.7 ns 0.56 1.92 KB 0.29
Old 5 1 837.8 ns 0.87 1.51 KB 0.23
Normal 100 0.5 48,817.1 ns 1.00 363.63 KB 1.00
New 100 0.5 10,555.5 ns 0.22 14.63 KB 0.04
Old 100 0.5 57,665.8 ns 1.18 8.24 KB 0.02
Normal 100 1 142,294.1 ns 1.00 1386.58 KB 1.00
New 100 1 24,668.7 ns 0.19 22.09 KB 0.02
Old 100 1 119,944.4 ns 0.92 15.81 KB 0.01

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment