Last active
December 2, 2023 19:34
-
-
Save etemesi254/b8e328a15baab2280b48a01f3861f4f3 to your computer and use it in GitHub Desktop.
ESA match finder with some additional code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//
// Created by caleb on 6/19/22.
//
#include "esa_matchfinder.h"

#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
long long multi_pass_optimal_parse(const char *buffer, int size) { | |
long long total_matches = 0; | |
void *mf = esa_matchfinder_create(size, /*min_match_length*/ 2, | |
/*max_match_length*/ 256); | |
if (mf != NULL && | |
esa_matchfinder_parse(mf, buffer, size) == ESA_MATCHFINDER_NO_ERROR) { | |
for (int pass = 0; pass < 2; pass += 1) { | |
ESA_MATCHFINDER_MATCH matches[ESA_MATCHFINDER_MAX_MATCH_LENGTH]; | |
esa_matchfinder_rewind(mf, /*position*/ 0); | |
ESA_MATCHFINDER_MATCH mt; | |
// int pos = 0; | |
for (int position = 0; position < size;) { | |
mt = esa_matchfinder_find_best_match(mf); | |
if (mt.length != 0) { | |
printf("pos=%d \t length=%d \t offset=%d\n", position, mt.length, | |
mt.offset); | |
} | |
position += 1; | |
position += mt.length; | |
esa_matchfinder_advance(mf, mt.length); | |
// total_matches += esa_matchfinder_find_all_matches(mf, matches) | |
// - matches; | |
} | |
} | |
} | |
esa_matchfinder_destroy(mf); | |
return total_matches; | |
} | |
/**
 * A heap-allocated byte buffer together with its length.
 */
typedef struct Buf {
    size_t size; /* number of bytes stored in buf */
    char *buf;   /* start of the data, or NULL when nothing was loaded */
} Buf;
Buf open(const char *file_name) { | |
char *source = NULL; | |
Buf buf; | |
FILE *fp = fopen(file_name, "r"); | |
if (fp != NULL) { | |
/* Go to the end of the file. */ | |
if (fseek(fp, 0L, SEEK_END) == 0) { | |
/* Get the size of the file. */ | |
long bufsize = ftell(fp); | |
if (bufsize == -1) { /* Error */ | |
} | |
buf.size = bufsize; | |
/* Allocate our buffer to that size. */ | |
source = malloc(sizeof(char) * (bufsize + 1)); | |
/* Go back to the start of the file. */ | |
if (fseek(fp, 0L, SEEK_SET) != 0) { /* Error */ | |
} | |
/* Read the entire file into memory. */ | |
size_t newLen = fread(source, sizeof(char), bufsize, fp); | |
if (ferror(fp) != 0) { | |
fputs("Error reading file", stderr); | |
} else { | |
source[newLen++] = '\0'; /* Just to be safe. */ | |
} | |
} | |
fclose(fp); | |
} | |
buf.buf = source; | |
return buf; | |
} | |
int main() { | |
// Your file HERE. | |
Buf buf = open("/home/caleb/Projects/Data/silesia/xml"); | |
printf("%lld ,", multi_pass_optimal_parse(buf.buf, buf.size)); | |
free(buf.buf); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, I do not claim the code is 100% accurate; I hacked it together in a few minutes to see some statistics for myself.
It's probably buggy/error-prone.
It was to avoid redundancies when mt didn't find a match, in which case its length would be 0. Notice that I'm maintaining my own position variable, independent of the esa_matchfinder's advancement, so there was no way for me to know that it had advanced.
It is taken into account by `position += 1`: the matchfinder advances by 1 and we also advance by 1, so they remain in sync. We lose matches at some positions because of this, though.