Skip to content

Instantly share code, notes, and snippets.

@markpapadakis
Last active June 21, 2020 09:41
Show Gist options
  • Save markpapadakis/9e9e156fa8a4f9b97b62 to your computer and use it in GitHub Desktop.
Save markpapadakis/9e9e156fa8a4f9b97b62 to your computer and use it in GitHub Desktop.
From input string, identify all strings within a specific edit distance from it - based on deletions only.
// Compile list of tokens based on deletions, and respecting max edit distance
// Original implementation would concatenate left/right string segments using memcpy() etc, but that didn't seem right
// The idea is, to instead partition S into 1+ segments and build hash from them, no need for memory copies etc.
// This is almost x8 times faster compared to that - but it's still in the 10s of microseconds range.
//
// ( for a 31 characters long string, with max edit distance = 2, it takes 24 microseconds, including the cost for computing the FNV rolling hash )
#include <switch.h>
#include <switch_print.h>
#include <ansifmt.h>
static void Div(strwlen8_t S, uint8_t midsCnt, const uint8_t *const mids, const uint8_t ed, const uint8_t maxEditDistance)
{
uint8_t idx{0}, localMids[4];
char repr[64], *r{repr};
const auto SL = S.len;
uint64_t h;
// copy and sort, fast
switch (midsCnt)
{
case 0:
break;
case 1:
localMids[0] = mids[0];
break;
case 2:
if (mids[0] <= mids[1])
{
localMids[0] = mids[0];
localMids[1] = mids[1];
}
else
{
localMids[0] = mids[1];
localMids[1] = mids[0];
}
break;
case 3:
if (mids[0] <= mids[1])
{
if (mids[0] <= mids[2])
{
localMids[0] = mids[0];
localMids[1] = Min(mids[1], mids[2]);
localMids[2] = Max(mids[1], mids[2]);
}
else
{
localMids[0] = mids[2];
localMids[1] = Min(mids[1], mids[0]);
localMids[2] = Max(mids[1], mids[0]);
}
}
else
{
localMids[0] = mids[1];
localMids[1] = Min(mids[2], mids[0]);
localMids[2] = Max(mids[2], mids[0]);
}
break;
default:
Abort();
}
h = BeginFNVHash64();
for (uint8_t i = 0; i != midsCnt; ++i)
{
const auto sep = localMids[i];
const auto s = S.Substr(idx, sep - idx);
h = FNVHash64(h, (uint8_t *)s.p, s.len);
idx = sep + 1;
}
if (idx < SL)
{
const auto s = S.SuffixFrom(idx);
h = FNVHash64(h, (uint8_t *)s.p, s.len);
}
Consider(h);
if (ed == maxEditDistance)
return;
if (!midsCnt)
{
if (SL > 1)
{
const auto upto = SL - 1;
Div(S.SuffixFrom(1), 0, localMids, ed + 1, maxEditDistance);
for (uint8_t i = 1; i != upto; ++i)
{
localMids[0] = i;
Div(S, 1, localMids, ed + 1, maxEditDistance);
}
Div(S.Prefix(upto), 0, localMids, ed + 1, maxEditDistance);
}
}
else
{
uint8_t i{0}, idx{0};
// Optimization:
// Drop first if they are in sequence
for (i = 0; i != midsCnt && localMids[i] == i; ++i)
continue;
if (i)
{
midsCnt-=i;
switch (i)
{
case 3:
localMids[2] = --localMids[3];
case 2:
localMids[1] = --localMids[2];
case 1:
localMids[0] = --localMids[1];
break;
default:
Abort();
}
S.StripPrefix(i);
i = 0;
}
for (; i < midsCnt; ++i)
{
const auto sep = localMids[i];
while (idx != sep)
{
localMids[midsCnt] = idx;
Div(S, midsCnt + 1, localMids, ed + 1, maxEditDistance);
++idx;
}
idx = sep + 1;
}
}
}
static void Div(strwlen8_t S, const uint8_t maxEditDistance)
{
// We want to exclude the input query from consideration, so we 'll just do that here
uint8_t localMids[4];
const auto before = Timings::Microseconds::Tick();
const auto SL = S.len;
if (SL > 1)
{
const auto upto = SL - 1;
Div(S.SuffixFrom(1), 0, localMids, 1, maxEditDistance);
for (uint8_t i = 1; i != upto; ++i)
{
localMids[0] = i;
Div(S, 1, localMids, 1, maxEditDistance);
}
Div(S.Prefix(upto), 0, localMids, 1, maxEditDistance);
}
SLog(duration_repr(Timings::Microseconds::Since(before)), "\n");
}
int main(int argc, char *argv[])
{
const strwlen8_t in(argv[1]);
Div(in, Min<uint8_t>(2, in.len / 5 + 1));
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment