Last active
November 15, 2021 14:04
-
-
Save JackyYin/1e7474ffd01f89bc94010efa63a77562 to your computer and use it in GitHub Desktop.
A simple algorithm to find and trim long repetitive substring of string
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h> | |
#include <unistd.h> | |
#include <string.h> | |
#include <vector> | |
#include <unordered_set> | |
#include <unordered_map> | |
#include <iostream> | |
#define CHECK_REPEAT_THRESHOLD 1000 | |
#define REPEAT_CHAR_THRESHOLD 200 | |
#define MAX_CHAR_CATE_CNT 10 | |
static void trim_long_repetitive_string(uint8_t *str, size_t *strlen) | |
{ | |
std::vector<int> lastPos(256, -1); | |
std::unordered_set<int> cset; | |
std::unordered_map<int, int> peerIdx; | |
std::vector<std::pair<int, int> > res; | |
int i = 1, j = 0, cnt = 0; | |
lastPos[str[0]] = 0; | |
while (i < *strlen) { | |
if (str[i] == str[j]) { | |
++cnt; | |
cset.insert(str[i]); | |
peerIdx[i] = j; | |
i++; | |
j++; | |
} else { | |
if (cnt > REPEAT_CHAR_THRESHOLD && cset.size() < MAX_CHAR_CATE_CNT) { | |
std::cout << "peerIdx: " << peerIdx[i - cnt] << std::endl; | |
res.push_back(std::make_pair(peerIdx[i - cnt], i - 1)); | |
} | |
cnt = 0; | |
cset.clear(); | |
if (lastPos[str[i]] >= 0) { | |
j = lastPos[str[i]]; | |
} else if (j == 0) { | |
i++; | |
} else { | |
j = 0; | |
} | |
} | |
lastPos[str[i - 1]] = i - 1; | |
} | |
if (cnt > REPEAT_CHAR_THRESHOLD && cset.size() < MAX_CHAR_CATE_CNT) { | |
std::cout << "peerIdx: " << peerIdx[i - cnt] << std::endl; | |
res.push_back(std::make_pair(peerIdx[i - cnt], i - 1)); | |
} | |
/* for (auto &p : res) { */ | |
/* std::cout << "matched len: " << p.second - p.first + 1 << std::endl; */ | |
/* std::cout << "matched text: " << std::endl; */ | |
/* for (int i = p.first; i <= p.second; i++) { */ | |
/* std::cout << str[i]; */ | |
/* } */ | |
/* std::cout << std::endl; */ | |
/* } */ | |
int srcIdx = 0; | |
int dstlen = 0; | |
for (auto &p : res) { | |
if (str + p.first > str + srcIdx) { | |
memcpy(&str[dstlen], &str[srcIdx], p.first - srcIdx); | |
dstlen += (p.first - srcIdx); | |
} | |
srcIdx = p.second + 1; | |
} | |
if (srcIdx < *strlen - 1) { | |
memcpy(&str[dstlen], &str[srcIdx], *strlen - srcIdx); | |
dstlen += (*strlen - srcIdx); | |
} | |
if (dstlen > 0) | |
*strlen = dstlen; | |
/* std::ofstream myfile; */ | |
/* myfile.open ("sample.txt"); */ | |
/* myfile << "sample: " << std::endl; */ | |
/* myfile << "---------------------------------------" << std::endl; */ | |
/* myfile << std::string(str, *strlen); */ | |
/* myfile.close(); */ | |
std::cout << "trimmed len: " << *strlen << std::endl;; | |
std::cout << "trimmed: " << std::endl;; | |
for (int i = 0; i < *strlen; i++) { | |
std::cout << str[i]; | |
} | |
std::cout << std::endl; | |
} | |
int main(int argc, char **argv) { | |
int opt; | |
while ((opt = getopt(argc, argv, "i:f:")) != -1) { | |
switch (opt) { | |
case 'i': | |
{ | |
size_t fbuflen = strlen(optarg); | |
char *tmp = (char *)malloc(fbuflen); | |
memcpy(tmp, optarg, fbuflen); | |
trim_long_repetitive_string((uint8_t *)tmp, &fbuflen); | |
free(tmp); | |
goto SUCCESS; | |
} | |
case 'f': | |
{ | |
FILE *fp; | |
char *fbuf; | |
if (!(fp = fopen(optarg, "rb"))) { | |
goto FAILED; | |
} | |
fseek(fp, 0L, SEEK_END); | |
size_t flen = ftell(fp); | |
fseek(fp, 0L, SEEK_SET); | |
if (!(fbuf = (char *)malloc(flen + 1))) { | |
goto FAILED; | |
} | |
if (fread(fbuf, 1, flen, fp) != flen) { | |
free(fbuf); | |
goto FAILED; | |
} | |
fclose(fp); | |
fbuf[flen] = '\0'; | |
size_t fbuflen = strlen(fbuf); | |
char *tmp = (char *)malloc(fbuflen); | |
memcpy(tmp, fbuf, fbuflen); | |
trim_long_repetitive_string((uint8_t *)tmp, &fbuflen); | |
free(tmp); | |
free(fbuf); | |
goto SUCCESS; | |
} | |
case '?': | |
default: | |
goto ARG_ERR; | |
} | |
} | |
ARG_ERR: | |
printf("argument error!\n"); | |
return -2; | |
FAILED: | |
printf("something went wrong!\n"); | |
return -1; | |
SUCCESS: | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment