-
-
Save etemesi254/b8e328a15baab2280b48a01f3861f4f3 to your computer and use it in GitHub Desktop.
// | |
// Created by caleb on 6/19/22. | |
// | |
#include "esa_matchfinder.h" | |
#include <stddef.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
long long multi_pass_optimal_parse(const char *buffer, int size) { | |
long long total_matches = 0; | |
void *mf = esa_matchfinder_create(size, /*min_match_length*/ 2, | |
/*max_match_length*/ 256); | |
if (mf != NULL && | |
esa_matchfinder_parse(mf, buffer, size) == ESA_MATCHFINDER_NO_ERROR) { | |
for (int pass = 0; pass < 2; pass += 1) { | |
ESA_MATCHFINDER_MATCH matches[ESA_MATCHFINDER_MAX_MATCH_LENGTH]; | |
esa_matchfinder_rewind(mf, /*position*/ 0); | |
ESA_MATCHFINDER_MATCH mt; | |
// int pos = 0; | |
for (int position = 0; position < size;) { | |
mt = esa_matchfinder_find_best_match(mf); | |
if (mt.length != 0) { | |
printf("pos=%d \t length=%d \t offset=%d\n", position, mt.length, | |
mt.offset); | |
} | |
position += 1; | |
position += mt.length; | |
esa_matchfinder_advance(mf, mt.length); | |
// total_matches += esa_matchfinder_find_all_matches(mf, matches) | |
// - matches; | |
} | |
} | |
} | |
esa_matchfinder_destroy(mf); | |
return total_matches; | |
} | |
typedef struct Buf { | |
size_t size; | |
char *buf; | |
} Buf; | |
Buf open(const char *file_name) { | |
char *source = NULL; | |
Buf buf; | |
FILE *fp = fopen(file_name, "r"); | |
if (fp != NULL) { | |
/* Go to the end of the file. */ | |
if (fseek(fp, 0L, SEEK_END) == 0) { | |
/* Get the size of the file. */ | |
long bufsize = ftell(fp); | |
if (bufsize == -1) { /* Error */ | |
} | |
buf.size = bufsize; | |
/* Allocate our buffer to that size. */ | |
source = malloc(sizeof(char) * (bufsize + 1)); | |
/* Go back to the start of the file. */ | |
if (fseek(fp, 0L, SEEK_SET) != 0) { /* Error */ | |
} | |
/* Read the entire file into memory. */ | |
size_t newLen = fread(source, sizeof(char), bufsize, fp); | |
if (ferror(fp) != 0) { | |
fputs("Error reading file", stderr); | |
} else { | |
source[newLen++] = '\0'; /* Just to be safe. */ | |
} | |
} | |
fclose(fp); | |
} | |
buf.buf = source; | |
return buf; | |
} | |
int main() { | |
// Your file HERE. | |
Buf buf = open("/home/caleb/Projects/Data/silesia/xml"); | |
printf("%lld ,", multi_pass_optimal_parse(buf.buf, buf.size)); | |
free(buf.buf); | |
} |
Hi, I do not claim the code is 100% accurate, I hacked it in some few minutes to see some statistics for myself
It's probably buggy/error prone
Why is there `position += 1;' in line 31? Don't you miss one byte every time you search?
It was to avoid redundancies when mt didn't find a match, hence it would be 0, notice I'm maintingin my own position variable independent of the esa_matchfinder_advancements, so there was no way of me knowing they advanced
Also, in line 34 you advance by
mt.length'. How is that take into account that
esa_matchfinder_find_best_match()' advances by 1 byte on it's own?
It is taken into account, by position+=1
, the matchfinder advances by 1, we also advance by 1, so they remain in sync, we loose matches at some positions due to this tho
I try to graps this thing and, therefore, have two questions: