Created
October 20, 2023 07:20
-
-
Save Kristonitas/0cfdc576ffb5d2b5076e5a2500d50131 to your computer and use it in GitHub Desktop.
Search an input string for longest substring that contains a limited set of unique characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Stores the result match, can be used to store best current match | |
type Match = { | |
// the chars that are in the longest consecutive sequence (char[]) | |
chars: string; | |
// start index in original sequence (size_t) | |
startIndex: number; | |
// length of the matching sequence (size_t) | |
length: number; | |
}; | |
const searchConsecutive = (text: string, numChars: number): Match => { | |
/** | |
* Lets say we are searching for the longest sequence of any 3 characters. | |
* When evaluating a certain position in the sequence, that position can be either the part of the longest sub-sequence (containing only 3 characters), or not. | |
* Assume the longest sequence contains characters "xyz" and we are evaluating the position in the sequence that is a part of the longest sequence. | |
* When we go forwards in the sequence and meet a character thats neither of the "xyz", we know that we are at the end of the sequence. What is unique about that position? | |
* Current position is one of characters [xyz] and the next is not (![xyz]). Similarly if we go backwards in sequence | |
* The idea behind this search algorithm is that only positions where neighboring characters are different is important. | |
* When going forward its important when a unseen-new character appears first, or when a particular character appears for the last time (from before) | |
* | |
* When going from the start, lets index each character by its appearance in the sequence: | |
* 11121221331214... | |
* The moment "4" appears, the longest sequence with characters 123 is done. We then need to evaluate sequence with character 4 and two characters from the set of [123]. | |
* Which one? Going back from the initial appearance of number 4, we see numbers: 412133... Since number 4 is in the sequence and we must evaluate the consecutive sequence | |
* it will be numbers 124. At that point, we need to establish the longest sequence with characters 124, that started at position, right after last appearance of 3, it will be | |
* 1214... | |
* Compare the length of the subsequence containing characters 124 with the length of the previous candidate (containing characters 123), if its longer, promote it to the potential result | |
* At this point, 1112122133 can be forgotten about, and the sequence 1214... can be treated as the initial problem (with the caveat of tracking the longest potential result) | |
* When hitting end of string, return existing longest potential result, if none is established at this point (looking for N+ unique character sequence when original seuqnce | |
* has only N unique chars), the original pattern mathces the question, return it | |
*/ | |
// To evaluate (access) each character only once, keeping last appearances of each character | |
type CharacterAppearanceInfo = { | |
// character for which this info applies (char) | |
char: string; | |
// up until evaluate character in sequence, where was the last time character was used | |
lastAppearance: number; | |
}; | |
let match: Match | undefined; | |
let searchStart = 0; | |
let searchStack: CharacterAppearanceInfo[] = []; | |
for (let i = 0; i < text.length; i++) { | |
// console.log(match); | |
const evaluatedCharacter = text[i]; | |
const infoInStack = searchStack.find((s) => s.char == evaluatedCharacter); | |
// already in search stack, update last position | |
if (infoInStack) { | |
infoInStack.lastAppearance = i; | |
continue; | |
} | |
// search stack not yet saturated character limit | |
if (searchStack.length < numChars) { | |
searchStack.push({ | |
char: evaluatedCharacter, | |
lastAppearance: i, | |
}); | |
continue; | |
} | |
// console.log({ i, searchStack, evaluatedCharacter }); | |
// found new character that is not in stack and stack needs to be updated | |
// oh boy... | |
// convert existing stack into a potential match and compare it | |
const newMatch: Match = { | |
chars: searchStack.map((s) => s.char).join(""), | |
startIndex: searchStart, | |
length: i - searchStart, // tricky to get this right, but since i is already advanced, no need to increment length by + 1 | |
}; | |
// change this to something if you need all candidates | |
// this will return first candidate in sequence | |
if (!match || newMatch.length > match.length) { | |
match = newMatch; | |
} | |
// if we would to reverse the problem at this point, we would be checking where the sub-sequence finds the unexpected char, | |
// it would be where the char with the earliest `lastAppearance` appeared. So if we go forward, that char needs to be dropped | |
searchStack.sort((s) => s.lastAppearance); | |
searchStart = searchStack[0].lastAppearance + 1; | |
searchStack.shift(); | |
searchStack.push({ | |
char: evaluatedCharacter, | |
lastAppearance: i, | |
}); | |
// console.log({ i, searchStack, evaluatedCharacter }); | |
} | |
// need to make a match after evaluting last character | |
// this will also work as a match if sequence contained less characters than the search limit | |
const newMatch: Match = { | |
chars: searchStack.map((s) => s.char).join(""), | |
startIndex: searchStart, | |
length: text.length - searchStart, | |
}; | |
if (!match || newMatch.length > match.length) { | |
match = newMatch; | |
} | |
return match; | |
}; | |
const test = (text: string, numChars: number) => { | |
const match = searchConsecutive(text, numChars); | |
const result = text.substring( | |
match.startIndex, | |
match.startIndex + match.length | |
); | |
const highlightedText = | |
text.substring(0, match.startIndex) + | |
"\x1b[33m" + | |
result + | |
"\x1b[37m" + | |
text.substring(match.startIndex + match.length); | |
console.log( | |
`longest ${numChars} character sequence from ${highlightedText} is ${result}` | |
); | |
}; | |
test("abc", 1); | |
test("abc", 2); | |
test("abc", 3); | |
test("abc", 4); | |
test("aaaabccccddd", 1); | |
test("aaaabccccddd", 2); | |
test("aaaabccccddd", 3); | |
test("aaaabccccddd", 4); | |
test("aaaabccccddd", 5); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment