Skip to content

Instantly share code, notes, and snippets.

@easyaspi314
Last active October 31, 2018 15:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save easyaspi314/07e667636fde11216dae190f03d362c6 to your computer and use it in GitHub Desktop.
Save easyaspi314/07e667636fde11216dae190f03d362c6 to your computer and use it in GitHub Desktop.
nano regex_hash.c demo
/**************************************************************************
* regex_hash.c -- This file is part of GNU nano. *
* *
* Copyright (C) 2018 Free Software Foundation, Inc. *
* Copyright (C) 2018 Benno Schulenberg *
* *
* GNU nano is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published *
* by the Free Software Foundation, either version 3 of the License, *
* or (at your option) any later version. *
* *
* GNU nano is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty *
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
* See the GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see http://www.gnu.org/licenses/. *
* *
**************************************************************************/
/* Undefine this to remove the debug logging. */
#define DEBUG 1
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <regex.h>
#ifdef DEBUG
#include <assert.h>
#endif
/* Wrap stuff that would be in nano's code to make this standalone. */
/* #include "proto.h" */
#define TRUE true
#define FALSE false
#define nmalloc malloc
#define mallocstrcpy(x, y) strdup((y))
#define statusline(x, y, ...) fprintf(stderr, y, ##__VA_ARGS__), putc('\n', stderr)
#define fixbounds(x) (x)
#define N_(x) (x)
/* A regex_t wrapper. We store nregex_t in a hash map to avoid endless regex
* recompilations. */
typedef struct nregex_type {
regex_t regex;
/* The compiled regex itself. */
char *regex_str;
/* The regex string. This is strcmp'd to check for collisions. */
struct nregex_type *next;
/* Next in the list, in case we have collisions. */
struct nregex_type *prev;
/* Do we need this? */
int refcount;
/* A ref counter to tell when we can free this regex_t. Not sure
* whether to keep it right now. */
uint32_t hash;
/* The actual hash */
int flags;
/* The regex flags */
bool initialized;
/* Whether the regex was initialized or not */
bool trivial;
/* Whether this is a trivial string and we don't have to waste our
* time with actual regex. */
} nregex_t;
/* Must be a power of 2 for performance! */
#define REGEX_HASHMAP_SIZE 63u
static nregex_t *regex_hashmap[REGEX_HASHMAP_SIZE];
/* Finds an nregex_t from the hash map and returns it.
* If there is no match, either a new nregex_t is created (create == TRUE),
* or a null pointer is returned. */
static nregex_t *find_regex(const char *restrict regex_str, int flags, bool create)
{
nregex_t *prev = NULL;
nregex_t *nregex;
/* FNV-1a hashing algorithm. */
uint32_t hash = 2166136261u;
{
const uint8_t *restrict data = (const uint8_t *)regex_str;
while (*data)
hash = (hash ^ *data++) * 16777619u;
}
/* Go through the matches. We have to check because FNV-1a is not perfect,
* and we also only have so many unique keys. */
nregex = regex_hashmap[hash % REGEX_HASHMAP_SIZE];
for (; nregex != NULL; nregex = nregex->next) {
/* Check if we already have an identical regex, and return it if so. */
if (nregex->regex_str != NULL
&& nregex->hash == hash
&& nregex->flags == flags
&& strcmp(nregex->regex_str, regex_str) == 0) {
return nregex;
}
#ifdef DEBUG
fprintf(stderr, "\"%s\" (flags %d, hash 0x%x) collides with \"%s\" (flags %d, hash 0x%x)\n",
regex_str, flags, hash, nregex->regex_str, nregex->flags, nregex->hash);
#endif
/* Collision or same regex with different flags. Store nregex as the
* last one so we can potentially link it in to the hash list. */
prev = nregex;
}
#ifdef DEBUG
assert(nregex == NULL);
#endif
/* Create a new nregex_t if it doesn't exist. */
if (create) {
nregex = (nregex_t *)nmalloc(sizeof(nregex_t));
nregex->regex_str = mallocstrcpy(NULL, regex_str);
if (prev != NULL) {
/* Link it with the previous */
prev->next = nregex;
nregex->prev = prev;
} else {
/* Put it at the top of the list */
nregex->prev = NULL;
regex_hashmap[hash % REGEX_HASHMAP_SIZE] = nregex;
}
nregex->next = NULL;
nregex->refcount = 0;
nregex->hash = hash;
nregex->flags = flags;
nregex->initialized = FALSE;
/* Check for regex metacharacters. We don't bother checking for escapes,
* as checking for proper regex...needs regex, and we also have to
* modify the string in order to remove double backslashes. */
nregex->trivial = (strpbrk(regex_str, "[](){}.*+?^\\|") == NULL);
#ifdef DEBUG
if (nregex->trivial)
fprintf(stderr, "Creating trivial regex_t: regex_str: \"%s\", hash: 0x%x, flags: %d\n", regex_str, hash, flags);
else
fprintf(stderr, "Creating regex_t: regex_str: \"%s\", hash: 0x%x, flags: %d\n", regex_str, hash, flags);
#endif
return nregex;
}
return NULL;
}
/* Returns a nregex_t generated from the arguments. Either this will return
* a previously generated nregex_t, or it will create a new one.
* On an error, the error message is statuslined, and this will return NULL. */
nregex_t *nano_regcomp(const char *restrict regex_str_raw, int flags)
{
/* Fix word boundaries first */
const char *regex_str = fixbounds(regex_str_raw);
/* Find the regex we need */
nregex_t *restrict nregex = find_regex(regex_str, flags, TRUE);
/* Inrease the refcount */
nregex->refcount++;
/* Only compile the regex if the string is not trivial and it hasn't
* already been compiled. */
if (!nregex->trivial && !nregex->initialized) {
int rc = regcomp(&nregex->regex, regex_str, flags);
/* Error */
if (rc != 0) {
/* 128 bytes should be more than enough. */
char buf[128];
size_t len = regerror(rc, &nregex->regex, buf, 128);
statusline(ALERT, N_("Bad regex \"%s\": %s"), regex_str, buf);
free(nregex->regex_str);
free(nregex);
return NULL;
}
}
return nregex;
}
/* Frees a nregex_t if there are no more references to it.
* Returns TRUE if the nregex was actually freed. */
bool nano_regfree(nregex_t *restrict nregex)
{
/* We don't do anything if nregex is NULL or if we still have refs. */
if (nregex == NULL || --(nregex->refcount) > 0)
return FALSE;
#ifdef DEBUG
fprintf(stderr, "Freeing regex \"%s\", flags %d, hash 0x%x\n", nregex->regex_str, nregex->flags, nregex->hash);
#endif
if (!nregex->trivial && nregex->initialized) {
regfree(&nregex->regex);
}
/* We have a parent */
if (nregex->prev != NULL) {
nregex->prev->next = nregex->next;
if (nregex->next != NULL) {
nregex->next->prev = nregex->prev;
}
} else {
regex_hashmap[nregex->hash % REGEX_HASHMAP_SIZE] = nregex->next;
if (nregex->next != NULL) {
nregex->next->prev = NULL;
}
}
free(nregex->regex_str);
free(nregex);
return TRUE;
}
/* Same as regexec, but uses strstr if the regex is trivial. */
int nano_regexec(const nregex_t *restrict nregex, const char *restrict str,
size_t nmatch, regmatch_t pmatch[], int eflags)
{
/* If we have a simple regex string, we can just use strstr or
* strcasestr. */
if (nregex->trivial) {
const char *match = (nregex->flags & REG_ICASE)
? strcasestr(str, nregex->regex_str)
: strstr(str, nregex->regex_str);
if (pmatch != NULL) {
for (int i = 0; i < nmatch; i++) {
pmatch[i].rm_so = (match == NULL) ? -1 : ((regoff_t)match - (regoff_t)str);
pmatch[i].rm_eo = (match == NULL) ? -1 : (pmatch[i].rm_so + (regoff_t)strlen(nregex->regex_str));
}
}
if (match != NULL) {
return 0;
} else {
return REG_NOMATCH;
}
} else {
return regexec(&nregex->regex, str, nmatch, pmatch, eflags);
}
}
int main(void)
{
nregex_t *regex = nano_regcomp("foo*", REG_ICASE);
nregex_t *regex2 = nano_regcomp("foo*", REG_ICASE);
nregex_t *regex3 = nano_regcomp("foo*", REG_ICASE);
nregex_t *regex4 = nano_regcomp("foo*", 0);
/* A bad regex expression */
nregex_t *regex7;
if (regex != regex2 || regex2 != regex3 || regex3 != regex)
puts("regex, regex2, and regex3 are not equal. fail");
if (regex3 == regex4)
puts("collision detection is failing");
printf("matching \"f\" with \"foo*\": %s\n", nano_regexec(regex, "f", 0, NULL, 0) ? "ok" : "fail");
printf("matching \"FOO\" with \"foo*\" (icase): %s\n", nano_regexec(regex, "FOO", 0, NULL, 0) ? "fail" : "ok");
printf("matching \"FOO\" with \"foo*\": %s\n", nano_regexec(regex4, "FOO", 0, NULL, 0) ? "ok" : "fail");
if (nano_regfree(regex))
puts("freed a regex that has references");
nano_regfree(regex2);
nano_regfree(regex3);
nano_regfree(regex4);
/* Known FNV-1a collisions, and also trivial strings. */
nregex_t *regex5 = nano_regcomp("costarring", 0);
nregex_t *regex6 = nano_regcomp("liquid", 0);
/* These generate collisions, but should not be equal. */
if (regex5 == regex6)
puts("collision detection is failing. ");
if (!regex5->trivial || !regex6->trivial)
puts("not trivial");
nano_regfree(regex5);
puts("the following should fail:");
regex7 = nano_regcomp("[", 0);
if (regex7 != NULL)
puts("failed, regex7 is not null");
if (nano_regfree(regex7))
puts("freeing a null nregex_t returned true");
printf("matching \"LIQUID\" with \"liquid\": %s\n", nano_regexec(regex6, "LIQUID", 0, NULL, 0) ? "ok" : "fail");
nano_regfree(regex6);
/* Adapted from https://www.ibm.com/support/knowledgecenter/en/ssw_ibm_i_72/rtref/regexec.htm */
const char *string = "a very simple simple simple string";
nregex_t *regex8 = nano_regcomp("(sim[a-z]le)", REG_EXTENDED);
regmatch_t pmatch[1];
if (regex8 == NULL) {
puts("regex8 was null");
} else if (nano_regexec(regex8, string, 1, pmatch, 0) == 0) {
printf("matching substring \"%.*s\" is found at position %d to %d. should be \"simple\", range 7-12\n",
(int)pmatch[0].rm_eo - (int)pmatch[0].rm_so, &string[pmatch[0].rm_so],
(int)pmatch[0].rm_so, (int)pmatch[0].rm_eo - 1);
} else {
printf("regex8 failed to match");
}
nregex_t *regex9 = nano_regcomp("simple", REG_EXTENDED);
if (regex9 == NULL || !regex9->trivial) {
puts("regex9 failed to compile properly");
} else if (nano_regexec(regex9, string, 1, pmatch, 0) == 0) {
printf("matching trivial substring \"%.*s\" is found at position %d to %d. should be \"simple\", range 7-12\n",
(int)pmatch[0].rm_eo - (int)pmatch[0].rm_so, &string[pmatch[0].rm_so],
(int)pmatch[0].rm_so, (int)pmatch[0].rm_eo - 1);
} else {
puts("regex9 failed to find a match");
}
nano_regfree(regex8);
nano_regfree(regex9);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment