Skip to content

Instantly share code, notes, and snippets.

@celic
Last active August 4, 2016 20:51
Show Gist options
  • Save celic/bbb0c372782453405ae5a64eda852d31 to your computer and use it in GitHub Desktop.
Save celic/bbb0c372782453405ae5a64eda852d31 to your computer and use it in GitHub Desktop.
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <map>
using namespace std;
void find_substrings(int substr_len, string &text, map<string, vector<int>> &indexes){
// First iteration
if(substr_len == 2){
// Store all 2-tuples that appear in the text
for(int i = 0; i < len-(substr_len-1); i++){
indexes[text.substr(i, substr_len)].push_back(i);
}
// All other iterations, don't just find all n-tuples naively like 2-tuples
// Any (n+1)-tuple must build upon an n-tuple, so just take the existing indexes and build on them
}else{
vector<int> good_indexes;
map<string, vector<int>>::iterator itr;
// Store all the indexes of n-tuple substrings that occur more than once
for(itr = indexes.begin(); itr != indexes.end(); ++itr){
for(int i = 0; i < itr->second.size(); ++i){
good_indexes.push_back(itr->second[i]);
}
}
indexes.clear();
// Extend the n-tuples to (n+1)-tuples and store like before
for(int i = 0; i < good_indexes.size(); i++){
indexes[text.substr(good_indexes[i], substr_len)].push_back(good_indexes[i]);
}
}
}
void erase_substrings(map<string, vector<int>> &indexes){
// Prune the map of any substrings that occur only once
map<string, vector<int>>::iterator itr = indexes.begin();
while(itr != indexes.end()){
if(itr->second.size() < 2){
indexes.erase(itr++);
}else{
++itr;
}
}
}
int len_LRS(string text){
// String is the substring we are looking at, vector stores the indexes those substrings begin at
map<string, vector<int>> indexes;
int substr_len = 2;
// Progressively grow the length of the n-tuples to look for
while(true){
find_substrings(substr_len, indexes);
erase_substrings(indexes);
if(indexes.empty()) break;
substr_len++;
}
// We always advance one further than we need to
return substr_len-1;
}
int main(){
ifstream file;
string text;
string line;
file.open("moby_dick.txt");
while(getline(file, line)) text += line;
file.close();
//text = "abcdeabcdeabc";
cout << len_LRS(text) << endl;
return 0;
}
@celic
Copy link
Author

celic commented Aug 4, 2016

Memory sensitive longest repeated substring algorithm. Rather than use the O(n^2)-memory dynamic programming approach, this takes a slightly slower route to conserve memory allowing you to run the algorithm on all of Moby Dick, for example, in a short amount of time.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment