-
-
Save easyaspi314/07e667636fde11216dae190f03d362c6 to your computer and use it in GitHub Desktop.
nano regex_hash.c demo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/************************************************************************** | |
* regex_hash.c -- This file is part of GNU nano. * | |
* * | |
* Copyright (C) 2018 Free Software Foundation, Inc. * | |
* Copyright (C) 2018 Benno Schulenberg * | |
* * | |
* GNU nano is free software: you can redistribute it and/or modify * | |
* it under the terms of the GNU General Public License as published * | |
* by the Free Software Foundation, either version 3 of the License, * | |
* or (at your option) any later version. * | |
* * | |
* GNU nano is distributed in the hope that it will be useful, * | |
* but WITHOUT ANY WARRANTY; without even the implied warranty * | |
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * | |
* See the GNU General Public License for more details. * | |
* * | |
* You should have received a copy of the GNU General Public License * | |
* along with this program. If not, see http://www.gnu.org/licenses/. * | |
* * | |
**************************************************************************/ | |
/* Undefine this to remove the debug logging. */ | |
#define DEBUG 1 | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdbool.h> | |
#include <string.h> | |
#include <regex.h> | |
#ifdef DEBUG | |
#include <assert.h> | |
#endif | |
/* Wrap stuff that would be in nano's code to make this standalone. */ | |
/* #include "proto.h" */ | |
#define TRUE true | |
#define FALSE false | |
#define nmalloc malloc | |
#define mallocstrcpy(x, y) strdup((y)) | |
#define statusline(x, y, ...) fprintf(stderr, y, ##__VA_ARGS__), putc('\n', stderr) | |
#define fixbounds(x) (x) | |
#define N_(x) (x) | |
/* A regex_t wrapper. We store nregex_t in a hash map to avoid endless regex | |
* recompilations. */ | |
typedef struct nregex_type { | |
regex_t regex; | |
/* The compiled regex itself. */ | |
char *regex_str; | |
/* The regex string. This is strcmp'd to check for collisions. */ | |
struct nregex_type *next; | |
/* Next in the list, in case we have collisions. */ | |
struct nregex_type *prev; | |
/* Do we need this? */ | |
int refcount; | |
/* A ref counter to tell when we can free this regex_t. Not sure | |
* whether to keep it right now. */ | |
uint32_t hash; | |
/* The actual hash */ | |
int flags; | |
/* The regex flags */ | |
bool initialized; | |
/* Whether the regex was initialized or not */ | |
bool trivial; | |
/* Whether this is a trivial string and we don't have to waste our | |
* time with actual regex. */ | |
} nregex_t; | |
/* Must be a power of 2 for performance! */ | |
#define REGEX_HASHMAP_SIZE 63u | |
static nregex_t *regex_hashmap[REGEX_HASHMAP_SIZE]; | |
/* Finds an nregex_t from the hash map and returns it. | |
* If there is no match, either a new nregex_t is created (create == TRUE), | |
* or a null pointer is returned. */ | |
static nregex_t *find_regex(const char *restrict regex_str, int flags, bool create) | |
{ | |
nregex_t *prev = NULL; | |
nregex_t *nregex; | |
/* FNV-1a hashing algorithm. */ | |
uint32_t hash = 2166136261u; | |
{ | |
const uint8_t *restrict data = (const uint8_t *)regex_str; | |
while (*data) | |
hash = (hash ^ *data++) * 16777619u; | |
} | |
/* Go through the matches. We have to check because FNV-1a is not perfect, | |
* and we also only have so many unique keys. */ | |
nregex = regex_hashmap[hash % REGEX_HASHMAP_SIZE]; | |
for (; nregex != NULL; nregex = nregex->next) { | |
/* Check if we already have an identical regex, and return it if so. */ | |
if (nregex->regex_str != NULL | |
&& nregex->hash == hash | |
&& nregex->flags == flags | |
&& strcmp(nregex->regex_str, regex_str) == 0) { | |
return nregex; | |
} | |
#ifdef DEBUG | |
fprintf(stderr, "\"%s\" (flags %d, hash 0x%x) collides with \"%s\" (flags %d, hash 0x%x)\n", | |
regex_str, flags, hash, nregex->regex_str, nregex->flags, nregex->hash); | |
#endif | |
/* Collision or same regex with different flags. Store nregex as the | |
* last one so we can potentially link it in to the hash list. */ | |
prev = nregex; | |
} | |
#ifdef DEBUG | |
assert(nregex == NULL); | |
#endif | |
/* Create a new nregex_t if it doesn't exist. */ | |
if (create) { | |
nregex = (nregex_t *)nmalloc(sizeof(nregex_t)); | |
nregex->regex_str = mallocstrcpy(NULL, regex_str); | |
if (prev != NULL) { | |
/* Link it with the previous */ | |
prev->next = nregex; | |
nregex->prev = prev; | |
} else { | |
/* Put it at the top of the list */ | |
nregex->prev = NULL; | |
regex_hashmap[hash % REGEX_HASHMAP_SIZE] = nregex; | |
} | |
nregex->next = NULL; | |
nregex->refcount = 0; | |
nregex->hash = hash; | |
nregex->flags = flags; | |
nregex->initialized = FALSE; | |
/* Check for regex metacharacters. We don't bother checking for escapes, | |
* as checking for proper regex...needs regex, and we also have to | |
* modify the string in order to remove double backslashes. */ | |
nregex->trivial = (strpbrk(regex_str, "[](){}.*+?^\\|") == NULL); | |
#ifdef DEBUG | |
if (nregex->trivial) | |
fprintf(stderr, "Creating trivial regex_t: regex_str: \"%s\", hash: 0x%x, flags: %d\n", regex_str, hash, flags); | |
else | |
fprintf(stderr, "Creating regex_t: regex_str: \"%s\", hash: 0x%x, flags: %d\n", regex_str, hash, flags); | |
#endif | |
return nregex; | |
} | |
return NULL; | |
} | |
/* Returns a nregex_t generated from the arguments. Either this will return | |
* a previously generated nregex_t, or it will create a new one. | |
* On an error, the error message is statuslined, and this will return NULL. */ | |
nregex_t *nano_regcomp(const char *restrict regex_str_raw, int flags) | |
{ | |
/* Fix word boundaries first */ | |
const char *regex_str = fixbounds(regex_str_raw); | |
/* Find the regex we need */ | |
nregex_t *restrict nregex = find_regex(regex_str, flags, TRUE); | |
/* Inrease the refcount */ | |
nregex->refcount++; | |
/* Only compile the regex if the string is not trivial and it hasn't | |
* already been compiled. */ | |
if (!nregex->trivial && !nregex->initialized) { | |
int rc = regcomp(&nregex->regex, regex_str, flags); | |
/* Error */ | |
if (rc != 0) { | |
/* 128 bytes should be more than enough. */ | |
char buf[128]; | |
size_t len = regerror(rc, &nregex->regex, buf, 128); | |
statusline(ALERT, N_("Bad regex \"%s\": %s"), regex_str, buf); | |
free(nregex->regex_str); | |
free(nregex); | |
return NULL; | |
} | |
} | |
return nregex; | |
} | |
/* Frees a nregex_t if there are no more references to it. | |
* Returns TRUE if the nregex was actually freed. */ | |
bool nano_regfree(nregex_t *restrict nregex) | |
{ | |
/* We don't do anything if nregex is NULL or if we still have refs. */ | |
if (nregex == NULL || --(nregex->refcount) > 0) | |
return FALSE; | |
#ifdef DEBUG | |
fprintf(stderr, "Freeing regex \"%s\", flags %d, hash 0x%x\n", nregex->regex_str, nregex->flags, nregex->hash); | |
#endif | |
if (!nregex->trivial && nregex->initialized) { | |
regfree(&nregex->regex); | |
} | |
/* We have a parent */ | |
if (nregex->prev != NULL) { | |
nregex->prev->next = nregex->next; | |
if (nregex->next != NULL) { | |
nregex->next->prev = nregex->prev; | |
} | |
} else { | |
regex_hashmap[nregex->hash % REGEX_HASHMAP_SIZE] = nregex->next; | |
if (nregex->next != NULL) { | |
nregex->next->prev = NULL; | |
} | |
} | |
free(nregex->regex_str); | |
free(nregex); | |
return TRUE; | |
} | |
/* Same as regexec, but uses strstr if the regex is trivial. */ | |
int nano_regexec(const nregex_t *restrict nregex, const char *restrict str, | |
size_t nmatch, regmatch_t pmatch[], int eflags) | |
{ | |
/* If we have a simple regex string, we can just use strstr or | |
* strcasestr. */ | |
if (nregex->trivial) { | |
const char *match = (nregex->flags & REG_ICASE) | |
? strcasestr(str, nregex->regex_str) | |
: strstr(str, nregex->regex_str); | |
if (pmatch != NULL) { | |
for (int i = 0; i < nmatch; i++) { | |
pmatch[i].rm_so = (match == NULL) ? -1 : ((regoff_t)match - (regoff_t)str); | |
pmatch[i].rm_eo = (match == NULL) ? -1 : (pmatch[i].rm_so + (regoff_t)strlen(nregex->regex_str)); | |
} | |
} | |
if (match != NULL) { | |
return 0; | |
} else { | |
return REG_NOMATCH; | |
} | |
} else { | |
return regexec(&nregex->regex, str, nmatch, pmatch, eflags); | |
} | |
} | |
int main(void) | |
{ | |
nregex_t *regex = nano_regcomp("foo*", REG_ICASE); | |
nregex_t *regex2 = nano_regcomp("foo*", REG_ICASE); | |
nregex_t *regex3 = nano_regcomp("foo*", REG_ICASE); | |
nregex_t *regex4 = nano_regcomp("foo*", 0); | |
/* A bad regex expression */ | |
nregex_t *regex7; | |
if (regex != regex2 || regex2 != regex3 || regex3 != regex) | |
puts("regex, regex2, and regex3 are not equal. fail"); | |
if (regex3 == regex4) | |
puts("collision detection is failing"); | |
printf("matching \"f\" with \"foo*\": %s\n", nano_regexec(regex, "f", 0, NULL, 0) ? "ok" : "fail"); | |
printf("matching \"FOO\" with \"foo*\" (icase): %s\n", nano_regexec(regex, "FOO", 0, NULL, 0) ? "fail" : "ok"); | |
printf("matching \"FOO\" with \"foo*\": %s\n", nano_regexec(regex4, "FOO", 0, NULL, 0) ? "ok" : "fail"); | |
if (nano_regfree(regex)) | |
puts("freed a regex that has references"); | |
nano_regfree(regex2); | |
nano_regfree(regex3); | |
nano_regfree(regex4); | |
/* Known FNV-1a collisions, and also trivial strings. */ | |
nregex_t *regex5 = nano_regcomp("costarring", 0); | |
nregex_t *regex6 = nano_regcomp("liquid", 0); | |
/* These generate collisions, but should not be equal. */ | |
if (regex5 == regex6) | |
puts("collision detection is failing. "); | |
if (!regex5->trivial || !regex6->trivial) | |
puts("not trivial"); | |
nano_regfree(regex5); | |
puts("the following should fail:"); | |
regex7 = nano_regcomp("[", 0); | |
if (regex7 != NULL) | |
puts("failed, regex7 is not null"); | |
if (nano_regfree(regex7)) | |
puts("freeing a null nregex_t returned true"); | |
printf("matching \"LIQUID\" with \"liquid\": %s\n", nano_regexec(regex6, "LIQUID", 0, NULL, 0) ? "ok" : "fail"); | |
nano_regfree(regex6); | |
/* Adapted from https://www.ibm.com/support/knowledgecenter/en/ssw_ibm_i_72/rtref/regexec.htm */ | |
const char *string = "a very simple simple simple string"; | |
nregex_t *regex8 = nano_regcomp("(sim[a-z]le)", REG_EXTENDED); | |
regmatch_t pmatch[1]; | |
if (regex8 == NULL) { | |
puts("regex8 was null"); | |
} else if (nano_regexec(regex8, string, 1, pmatch, 0) == 0) { | |
printf("matching substring \"%.*s\" is found at position %d to %d. should be \"simple\", range 7-12\n", | |
(int)pmatch[0].rm_eo - (int)pmatch[0].rm_so, &string[pmatch[0].rm_so], | |
(int)pmatch[0].rm_so, (int)pmatch[0].rm_eo - 1); | |
} else { | |
printf("regex8 failed to match"); | |
} | |
nregex_t *regex9 = nano_regcomp("simple", REG_EXTENDED); | |
if (regex9 == NULL || !regex9->trivial) { | |
puts("regex9 failed to compile properly"); | |
} else if (nano_regexec(regex9, string, 1, pmatch, 0) == 0) { | |
printf("matching trivial substring \"%.*s\" is found at position %d to %d. should be \"simple\", range 7-12\n", | |
(int)pmatch[0].rm_eo - (int)pmatch[0].rm_so, &string[pmatch[0].rm_so], | |
(int)pmatch[0].rm_so, (int)pmatch[0].rm_eo - 1); | |
} else { | |
puts("regex9 failed to find a match"); | |
} | |
nano_regfree(regex8); | |
nano_regfree(regex9); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment