Skip to content

Instantly share code, notes, and snippets.

@JackyYin
Last active November 15, 2021 14:04
Show Gist options
  • Save JackyYin/1e7474ffd01f89bc94010efa63a77562 to your computer and use it in GitHub Desktop.
Save JackyYin/1e7474ffd01f89bc94010efa63a77562 to your computer and use it in GitHub Desktop.
A simple algorithm to find and trim long repetitive substring of string
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <unordered_set>
#include <unordered_map>
#include <iostream>
#define CHECK_REPEAT_THRESHOLD 1000
#define REPEAT_CHAR_THRESHOLD 200
#define MAX_CHAR_CATE_CNT 10
static void trim_long_repetitive_string(uint8_t *str, size_t *strlen)
{
std::vector<int> lastPos(256, -1);
std::unordered_set<int> cset;
std::unordered_map<int, int> peerIdx;
std::vector<std::pair<int, int> > res;
int i = 1, j = 0, cnt = 0;
lastPos[str[0]] = 0;
while (i < *strlen) {
if (str[i] == str[j]) {
++cnt;
cset.insert(str[i]);
peerIdx[i] = j;
i++;
j++;
} else {
if (cnt > REPEAT_CHAR_THRESHOLD && cset.size() < MAX_CHAR_CATE_CNT) {
std::cout << "peerIdx: " << peerIdx[i - cnt] << std::endl;
res.push_back(std::make_pair(peerIdx[i - cnt], i - 1));
}
cnt = 0;
cset.clear();
if (lastPos[str[i]] >= 0) {
j = lastPos[str[i]];
} else if (j == 0) {
i++;
} else {
j = 0;
}
}
lastPos[str[i - 1]] = i - 1;
}
if (cnt > REPEAT_CHAR_THRESHOLD && cset.size() < MAX_CHAR_CATE_CNT) {
std::cout << "peerIdx: " << peerIdx[i - cnt] << std::endl;
res.push_back(std::make_pair(peerIdx[i - cnt], i - 1));
}
/* for (auto &p : res) { */
/* std::cout << "matched len: " << p.second - p.first + 1 << std::endl; */
/* std::cout << "matched text: " << std::endl; */
/* for (int i = p.first; i <= p.second; i++) { */
/* std::cout << str[i]; */
/* } */
/* std::cout << std::endl; */
/* } */
int srcIdx = 0;
int dstlen = 0;
for (auto &p : res) {
if (str + p.first > str + srcIdx) {
memcpy(&str[dstlen], &str[srcIdx], p.first - srcIdx);
dstlen += (p.first - srcIdx);
}
srcIdx = p.second + 1;
}
if (srcIdx < *strlen - 1) {
memcpy(&str[dstlen], &str[srcIdx], *strlen - srcIdx);
dstlen += (*strlen - srcIdx);
}
if (dstlen > 0)
*strlen = dstlen;
/* std::ofstream myfile; */
/* myfile.open ("sample.txt"); */
/* myfile << "sample: " << std::endl; */
/* myfile << "---------------------------------------" << std::endl; */
/* myfile << std::string(str, *strlen); */
/* myfile.close(); */
std::cout << "trimmed len: " << *strlen << std::endl;;
std::cout << "trimmed: " << std::endl;;
for (int i = 0; i < *strlen; i++) {
std::cout << str[i];
}
std::cout << std::endl;
}
int main(int argc, char **argv) {
int opt;
while ((opt = getopt(argc, argv, "i:f:")) != -1) {
switch (opt) {
case 'i':
{
size_t fbuflen = strlen(optarg);
char *tmp = (char *)malloc(fbuflen);
memcpy(tmp, optarg, fbuflen);
trim_long_repetitive_string((uint8_t *)tmp, &fbuflen);
free(tmp);
goto SUCCESS;
}
case 'f':
{
FILE *fp;
char *fbuf;
if (!(fp = fopen(optarg, "rb"))) {
goto FAILED;
}
fseek(fp, 0L, SEEK_END);
size_t flen = ftell(fp);
fseek(fp, 0L, SEEK_SET);
if (!(fbuf = (char *)malloc(flen + 1))) {
goto FAILED;
}
if (fread(fbuf, 1, flen, fp) != flen) {
free(fbuf);
goto FAILED;
}
fclose(fp);
fbuf[flen] = '\0';
size_t fbuflen = strlen(fbuf);
char *tmp = (char *)malloc(fbuflen);
memcpy(tmp, fbuf, fbuflen);
trim_long_repetitive_string((uint8_t *)tmp, &fbuflen);
free(tmp);
free(fbuf);
goto SUCCESS;
}
case '?':
default:
goto ARG_ERR;
}
}
ARG_ERR:
printf("argument error!\n");
return -2;
FAILED:
printf("something went wrong!\n");
return -1;
SUCCESS:
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment