easyaspi314/regex_hash.c Secret

## regex_hash.c
/**************************************************************************
 *   regex_hash.c  --  This file is part of GNU nano.                     *
 *                                                                        *
 *   Copyright (C) 2018 Free Software Foundation, Inc.                    *
 *   Copyright (C) 2018 Benno Schulenberg                                 *
 *                                                                        *
 *   GNU nano is free software: you can redistribute it and/or modify     *
 *   it under the terms of the GNU General Public License as published    *
 *   by the Free Software Foundation, either version 3 of the License,    *
 *   or (at your option) any later version.                               *
 *                                                                        *
 *   GNU nano is distributed in the hope that it will be useful,          *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty          *
 *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.              *
 *   See the GNU General Public License for more details.                 *
 *                                                                        *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program.  If not, see http://www.gnu.org/licenses/.  *
 *                                                                        *
 **************************************************************************/
/* Undefine this to remove the debug logging. */
#define DEBUG 1

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <regex.h>
#ifdef DEBUG
#include <assert.h>
#endif

/* Wrap stuff that would be in nano's code to make this standalone. */

/* #include "proto.h" */
#define TRUE true
#define FALSE false
#define nmalloc malloc
#define mallocstrcpy(x, y) strdup((y))
#define statusline(x, y, ...) fprintf(stderr, y, ##__VA_ARGS__), putc('\n', stderr)
#define fixbounds(x) (x)
#define N_(x) (x)

/* A regex_t wrapper. We store nregex_t in a hash map to avoid endless regex
 * recompilations. */
typedef struct nregex_type {
	regex_t regex;
		/* The compiled regex itself. */
	char *regex_str;
		/* The regex string. This is strcmp'd to check for collisions. */
	struct nregex_type *next;
		/* Next in the list, in case we have collisions. */
	struct nregex_type *prev;
		/* Do we need this? */
	int refcount;
		/* A ref counter to tell when we can free this regex_t. Not sure
		 * whether to keep it right now. */
	uint32_t hash;
		/* The actual hash */
	int flags;
		/* The regex flags */
	bool initialized;
		/* Whether the regex was initialized or not */
	bool trivial;
		/* Whether this is a trivial string and we don't have to waste our
		 * time with actual regex. */
} nregex_t;

/* Must be a power of 2 for performance! */
#define REGEX_HASHMAP_SIZE 63u
static nregex_t *regex_hashmap[REGEX_HASHMAP_SIZE];

/* Finds an nregex_t from the hash map and returns it.
 * If there is no match, either a new nregex_t is created (create == TRUE),
 * or a null pointer is returned. */
static nregex_t *find_regex(const char *restrict regex_str, int flags, bool create)
{
	nregex_t *prev = NULL;
	nregex_t *nregex;

	/* FNV-1a hashing algorithm. */
	uint32_t hash = 2166136261u;
	{
		const uint8_t *restrict data = (const uint8_t *)regex_str;

		while (*data)
			hash = (hash ^ *data++) * 16777619u;
	}

	/* Go through the matches. We have to check because FNV-1a is not perfect,
	 * and we also only have so many unique keys. */
	nregex = regex_hashmap[hash % REGEX_HASHMAP_SIZE];

	for (; nregex != NULL; nregex = nregex->next) {
		/* Check if we already have an identical regex, and return it if so. */
		if (nregex->regex_str != NULL
			&& nregex->hash == hash
			&& nregex->flags == flags
			&& strcmp(nregex->regex_str, regex_str) == 0) {
			return nregex;
		}
#ifdef DEBUG
		fprintf(stderr, "\"%s\" (flags %d, hash 0x%x) collides with \"%s\" (flags %d, hash 0x%x)\n",
			regex_str, flags, hash, nregex->regex_str, nregex->flags, nregex->hash);
#endif
		/* Collision or same regex with different flags. Store nregex as the
		 * last one so we can potentially link it in to the hash list. */
		prev = nregex;
	}

#ifdef DEBUG
	assert(nregex == NULL);
#endif

	/* Create a new nregex_t if it doesn't exist. */
	if (create) {
		nregex = (nregex_t *)nmalloc(sizeof(nregex_t));
		nregex->regex_str = mallocstrcpy(NULL, regex_str);
		if (prev != NULL) {
			/* Link it with the previous */
			prev->next = nregex;
			nregex->prev = prev;
		} else {
			/* Put it at the top of the list */
			nregex->prev = NULL;
			regex_hashmap[hash % REGEX_HASHMAP_SIZE] = nregex;
		}
		nregex->next = NULL;
		nregex->refcount = 0;
		nregex->hash = hash;
		nregex->flags = flags;
		nregex->initialized = FALSE;

		/* Check for regex metacharacters. We don't bother checking for escapes,
		 * as checking for proper regex...needs regex, and we also have to
		 * modify the string in order to remove double backslashes. */
		nregex->trivial = (strpbrk(regex_str, "[](){}.*+?^\\|") == NULL);
#ifdef DEBUG
		if (nregex->trivial)
			fprintf(stderr, "Creating trivial regex_t: regex_str: \"%s\", hash: 0x%x, flags: %d\n", regex_str, hash, flags);
		else
			fprintf(stderr, "Creating regex_t: regex_str: \"%s\", hash: 0x%x, flags: %d\n", regex_str, hash, flags);
#endif
		return nregex;
	}
	return NULL;
}

/* Returns a nregex_t generated from the arguments. Either this will return
 * a previously generated nregex_t, or it will create a new one.
 * On an error, the error message is statuslined, and this will return NULL. */
nregex_t *nano_regcomp(const char *restrict regex_str_raw, int flags)
{
	/* Fix word boundaries first */
	const char *regex_str = fixbounds(regex_str_raw);
	/* Find the regex we need */
	nregex_t *restrict nregex = find_regex(regex_str, flags, TRUE);

	/* Inrease the refcount */
	nregex->refcount++;

	/* Only compile the regex if the string is not trivial and it hasn't
	 * already been compiled. */
	if (!nregex->trivial && !nregex->initialized) {
		int rc = regcomp(&nregex->regex, regex_str, flags);

		/* Error */
		if (rc != 0) {
			/* 128 bytes should be more than enough. */
			char buf[128];
			size_t len = regerror(rc, &nregex->regex, buf, 128);

			statusline(ALERT, N_("Bad regex \"%s\": %s"), regex_str, buf);

			free(nregex->regex_str);
			free(nregex);

			return NULL;
		}
	}
	return nregex;
}

/* Frees a nregex_t if there are no more references to it.
 * Returns TRUE if the nregex was actually freed. */
bool nano_regfree(nregex_t *restrict nregex)
{
	/* We don't do anything if nregex is NULL or if we still have refs. */
	if (nregex == NULL || --(nregex->refcount) > 0)
		return FALSE;
#ifdef DEBUG
	fprintf(stderr, "Freeing regex \"%s\", flags %d, hash 0x%x\n", nregex->regex_str, nregex->flags, nregex->hash);
#endif
	if (!nregex->trivial && nregex->initialized) {
		regfree(&nregex->regex);
	}
	/* We have a parent */
	if (nregex->prev != NULL) {
		nregex->prev->next = nregex->next;
		if (nregex->next != NULL) {
			nregex->next->prev = nregex->prev;
		}
	} else {
		regex_hashmap[nregex->hash % REGEX_HASHMAP_SIZE] = nregex->next;
		if (nregex->next != NULL) {
			nregex->next->prev = NULL;
		}
	}
	free(nregex->regex_str);
	free(nregex);
	return TRUE;
}

/* Same as regexec, but uses strstr if the regex is trivial. */
int nano_regexec(const nregex_t *restrict nregex, const char *restrict str,
	size_t nmatch, regmatch_t pmatch[], int eflags)
{
	/* If we have a simple regex string, we can just use strstr or
	 * strcasestr. */
	if (nregex->trivial) {
		const char *match = (nregex->flags & REG_ICASE)
			? strcasestr(str, nregex->regex_str)
			: strstr(str, nregex->regex_str);
		if (pmatch != NULL) {
			for (int i = 0; i < nmatch; i++) {
				pmatch[i].rm_so = (match == NULL) ? -1 : ((regoff_t)match - (regoff_t)str);
				pmatch[i].rm_eo = (match == NULL) ? -1 : (pmatch[i].rm_so + (regoff_t)strlen(nregex->regex_str));
			}
		}
		if (match != NULL) {
			return 0;
		} else {
			return REG_NOMATCH;
		}
	} else {
		return regexec(&nregex->regex, str, nmatch, pmatch, eflags);
	}
}
int main(void)
{
	nregex_t *regex = nano_regcomp("foo*", REG_ICASE);
	nregex_t *regex2 = nano_regcomp("foo*", REG_ICASE);
	nregex_t *regex3 = nano_regcomp("foo*", REG_ICASE);
	nregex_t *regex4 = nano_regcomp("foo*", 0);
	/* A bad regex expression */
	nregex_t *regex7;

	if (regex != regex2 || regex2 != regex3 || regex3 != regex)
		puts("regex, regex2, and regex3 are not equal. fail");
	if (regex3 == regex4)
		puts("collision detection is failing");

	printf("matching \"f\" with \"foo*\": %s\n", nano_regexec(regex, "f", 0, NULL, 0) ? "ok" : "fail");
	printf("matching \"FOO\" with \"foo*\" (icase): %s\n", nano_regexec(regex, "FOO", 0, NULL, 0) ? "fail" : "ok");
	printf("matching \"FOO\" with \"foo*\": %s\n", nano_regexec(regex4, "FOO", 0, NULL, 0) ? "ok" : "fail");

	if (nano_regfree(regex))
		puts("freed a regex that has references");
	nano_regfree(regex2);
	nano_regfree(regex3);
	nano_regfree(regex4);


	/* Known FNV-1a collisions, and also trivial strings. */
	nregex_t *regex5 = nano_regcomp("costarring", 0);
	nregex_t *regex6 = nano_regcomp("liquid", 0);
	/* These generate collisions, but should not be equal. */
	if (regex5 == regex6)
		puts("collision detection is failing. ");
	if (!regex5->trivial || !regex6->trivial)
		puts("not trivial");


	nano_regfree(regex5);

	puts("the following should fail:");
	regex7 = nano_regcomp("[", 0);
	if (regex7 != NULL)
		puts("failed, regex7 is not null");

	if (nano_regfree(regex7))
		puts("freeing a null nregex_t returned true");

	printf("matching \"LIQUID\" with \"liquid\": %s\n", nano_regexec(regex6, "LIQUID", 0, NULL, 0) ? "ok" : "fail");
	nano_regfree(regex6);

	/* Adapted from https://www.ibm.com/support/knowledgecenter/en/ssw_ibm_i_72/rtref/regexec.htm */
	const char *string = "a very simple simple simple string";
	nregex_t *regex8 = nano_regcomp("(sim[a-z]le)", REG_EXTENDED);
	regmatch_t pmatch[1];

	if (regex8 == NULL) {
		puts("regex8 was null");
	} else if (nano_regexec(regex8, string, 1, pmatch, 0) == 0) {
		printf("matching substring \"%.*s\" is found at position %d to %d. should be \"simple\", range 7-12\n",
			(int)pmatch[0].rm_eo - (int)pmatch[0].rm_so, &string[pmatch[0].rm_so],
			(int)pmatch[0].rm_so, (int)pmatch[0].rm_eo - 1);
	} else {
		printf("regex8 failed to match");
	}

	nregex_t *regex9 = nano_regcomp("simple", REG_EXTENDED);
	if (regex9 == NULL || !regex9->trivial) {
		puts("regex9 failed to compile properly");
	} else if (nano_regexec(regex9, string, 1, pmatch, 0) == 0) {
		printf("matching trivial substring \"%.*s\" is found at position %d to %d. should be \"simple\", range 7-12\n",
			(int)pmatch[0].rm_eo - (int)pmatch[0].rm_so, &string[pmatch[0].rm_so],
			(int)pmatch[0].rm_so, (int)pmatch[0].rm_eo - 1);
	} else {
		puts("regex9 failed to find a match");
	}

	nano_regfree(regex8);
	nano_regfree(regex9);
	return 0;
}
	/**************************************************************************
	* regex_hash.c -- This file is part of GNU nano. *
	* *
	* Copyright (C) 2018 Free Software Foundation, Inc. *
	* Copyright (C) 2018 Benno Schulenberg *
	* *
	* GNU nano is free software: you can redistribute it and/or modify *
	* it under the terms of the GNU General Public License as published *
	* by the Free Software Foundation, either version 3 of the License, *
	* or (at your option) any later version. *
	* *
	* GNU nano is distributed in the hope that it will be useful, *
	* but WITHOUT ANY WARRANTY; without even the implied warranty *
	* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
	* See the GNU General Public License for more details. *
	* *
	* You should have received a copy of the GNU General Public License *
	* along with this program. If not, see http://www.gnu.org/licenses/. *
	* *
	**************************************************************************/
	/* Undefine this to remove the debug logging. */
	#define DEBUG 1

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdbool.h>
	#include <string.h>
	#include <regex.h>
	#ifdef DEBUG
	#include <assert.h>
	#endif

	/* Wrap stuff that would be in nano's code to make this standalone. */

	/* #include "proto.h" */
	#define TRUE true
	#define FALSE false
	#define nmalloc malloc
	#define mallocstrcpy(x, y) strdup((y))
	#define statusline(x, y, ...) fprintf(stderr, y, ##__VA_ARGS__), putc('\n', stderr)
	#define fixbounds(x) (x)
	#define N_(x) (x)

	/* A regex_t wrapper. We store nregex_t in a hash map to avoid endless regex
	* recompilations. */
	typedef struct nregex_type {
	regex_t regex;
	/* The compiled regex itself. */
	char *regex_str;
	/* The regex string. This is strcmp'd to check for collisions. */
	struct nregex_type *next;
	/* Next in the list, in case we have collisions. */
	struct nregex_type *prev;
	/* Do we need this? */
	int refcount;
	/* A ref counter to tell when we can free this regex_t. Not sure
	* whether to keep it right now. */
	uint32_t hash;
	/* The actual hash */
	int flags;
	/* The regex flags */
	bool initialized;
	/* Whether the regex was initialized or not */
	bool trivial;
	/* Whether this is a trivial string and we don't have to waste our
	* time with actual regex. */
	} nregex_t;

	/* Must be a power of 2 for performance! */
	#define REGEX_HASHMAP_SIZE 63u
	static nregex_t *regex_hashmap[REGEX_HASHMAP_SIZE];

	/* Finds an nregex_t from the hash map and returns it.
	* If there is no match, either a new nregex_t is created (create == TRUE),
	* or a null pointer is returned. */
	static nregex_t find_regex(const char restrict regex_str, int flags, bool create)
	{
	nregex_t *prev = NULL;
	nregex_t *nregex;

	/* FNV-1a hashing algorithm. */
	uint32_t hash = 2166136261u;
	{
	const uint8_t restrict data = (const uint8_t )regex_str;

	while (*data)
	hash = (hash ^ data++) 16777619u;
	}

	/* Go through the matches. We have to check because FNV-1a is not perfect,
	* and we also only have so many unique keys. */
	nregex = regex_hashmap[hash % REGEX_HASHMAP_SIZE];

	for (; nregex != NULL; nregex = nregex->next) {
	/* Check if we already have an identical regex, and return it if so. */
	if (nregex->regex_str != NULL
	&& nregex->hash == hash
	&& nregex->flags == flags
	&& strcmp(nregex->regex_str, regex_str) == 0) {
	return nregex;
	}
	#ifdef DEBUG
	fprintf(stderr, "\"%s\" (flags %d, hash 0x%x) collides with \"%s\" (flags %d, hash 0x%x)\n",
	regex_str, flags, hash, nregex->regex_str, nregex->flags, nregex->hash);
	#endif
	/* Collision or same regex with different flags. Store nregex as the
	* last one so we can potentially link it in to the hash list. */
	prev = nregex;
	}

	#ifdef DEBUG
	assert(nregex == NULL);
	#endif

	/* Create a new nregex_t if it doesn't exist. */
	if (create) {
	nregex = (nregex_t *)nmalloc(sizeof(nregex_t));
	nregex->regex_str = mallocstrcpy(NULL, regex_str);
	if (prev != NULL) {
	/* Link it with the previous */
	prev->next = nregex;
	nregex->prev = prev;
	} else {
	/* Put it at the top of the list */
	nregex->prev = NULL;
	regex_hashmap[hash % REGEX_HASHMAP_SIZE] = nregex;
	}
	nregex->next = NULL;
	nregex->refcount = 0;
	nregex->hash = hash;
	nregex->flags = flags;
	nregex->initialized = FALSE;

	/* Check for regex metacharacters. We don't bother checking for escapes,
	* as checking for proper regex...needs regex, and we also have to
	* modify the string in order to remove double backslashes. */
	nregex->trivial = (strpbrk(regex_str, "[](){}.*+?^\\\|") == NULL);
	#ifdef DEBUG
	if (nregex->trivial)
	fprintf(stderr, "Creating trivial regex_t: regex_str: \"%s\", hash: 0x%x, flags: %d\n", regex_str, hash, flags);
	else
	fprintf(stderr, "Creating regex_t: regex_str: \"%s\", hash: 0x%x, flags: %d\n", regex_str, hash, flags);
	#endif
	return nregex;
	}
	return NULL;
	}

	/* Returns a nregex_t generated from the arguments. Either this will return
	* a previously generated nregex_t, or it will create a new one.
	* On an error, the error message is statuslined, and this will return NULL. */
	nregex_t nano_regcomp(const char restrict regex_str_raw, int flags)
	{
	/* Fix word boundaries first */
	const char *regex_str = fixbounds(regex_str_raw);
	/* Find the regex we need */
	nregex_t *restrict nregex = find_regex(regex_str, flags, TRUE);

	/* Inrease the refcount */
	nregex->refcount++;

	/* Only compile the regex if the string is not trivial and it hasn't
	* already been compiled. */
	if (!nregex->trivial && !nregex->initialized) {
	int rc = regcomp(&nregex->regex, regex_str, flags);

	/* Error */
	if (rc != 0) {
	/* 128 bytes should be more than enough. */
	char buf[128];
	size_t len = regerror(rc, &nregex->regex, buf, 128);

	statusline(ALERT, N_("Bad regex \"%s\": %s"), regex_str, buf);

	free(nregex->regex_str);
	free(nregex);

	return NULL;
	}
	}
	return nregex;
	}

	/* Frees a nregex_t if there are no more references to it.
	* Returns TRUE if the nregex was actually freed. */
	bool nano_regfree(nregex_t *restrict nregex)
	{
	/* We don't do anything if nregex is NULL or if we still have refs. */
	if (nregex == NULL \|\| --(nregex->refcount) > 0)
	return FALSE;
	#ifdef DEBUG
	fprintf(stderr, "Freeing regex \"%s\", flags %d, hash 0x%x\n", nregex->regex_str, nregex->flags, nregex->hash);
	#endif
	if (!nregex->trivial && nregex->initialized) {
	regfree(&nregex->regex);
	}
	/* We have a parent */
	if (nregex->prev != NULL) {
	nregex->prev->next = nregex->next;
	if (nregex->next != NULL) {
	nregex->next->prev = nregex->prev;
	}
	} else {
	regex_hashmap[nregex->hash % REGEX_HASHMAP_SIZE] = nregex->next;
	if (nregex->next != NULL) {
	nregex->next->prev = NULL;
	}
	}
	free(nregex->regex_str);
	free(nregex);
	return TRUE;
	}

	/* Same as regexec, but uses strstr if the regex is trivial. */
	int nano_regexec(const nregex_t restrict nregex, const char restrict str,
	size_t nmatch, regmatch_t pmatch[], int eflags)
	{
	/* If we have a simple regex string, we can just use strstr or
	* strcasestr. */
	if (nregex->trivial) {
	const char *match = (nregex->flags & REG_ICASE)
	? strcasestr(str, nregex->regex_str)
	: strstr(str, nregex->regex_str);
	if (pmatch != NULL) {
	for (int i = 0; i < nmatch; i++) {
	pmatch[i].rm_so = (match == NULL) ? -1 : ((regoff_t)match - (regoff_t)str);
	pmatch[i].rm_eo = (match == NULL) ? -1 : (pmatch[i].rm_so + (regoff_t)strlen(nregex->regex_str));
	}
	}
	if (match != NULL) {
	return 0;
	} else {
	return REG_NOMATCH;
	}
	} else {
	return regexec(&nregex->regex, str, nmatch, pmatch, eflags);
	}
	}
	int main(void)
	{
	nregex_t regex = nano_regcomp("foo", REG_ICASE);
	nregex_t regex2 = nano_regcomp("foo", REG_ICASE);
	nregex_t regex3 = nano_regcomp("foo", REG_ICASE);
	nregex_t regex4 = nano_regcomp("foo", 0);
	/* A bad regex expression */
	nregex_t *regex7;

	if (regex != regex2 \|\| regex2 != regex3 \|\| regex3 != regex)
	puts("regex, regex2, and regex3 are not equal. fail");
	if (regex3 == regex4)
	puts("collision detection is failing");

	printf("matching \"f\" with \"foo*\": %s\n", nano_regexec(regex, "f", 0, NULL, 0) ? "ok" : "fail");
	printf("matching \"FOO\" with \"foo*\" (icase): %s\n", nano_regexec(regex, "FOO", 0, NULL, 0) ? "fail" : "ok");
	printf("matching \"FOO\" with \"foo*\": %s\n", nano_regexec(regex4, "FOO", 0, NULL, 0) ? "ok" : "fail");

	if (nano_regfree(regex))
	puts("freed a regex that has references");
	nano_regfree(regex2);
	nano_regfree(regex3);
	nano_regfree(regex4);


	/* Known FNV-1a collisions, and also trivial strings. */
	nregex_t *regex5 = nano_regcomp("costarring", 0);
	nregex_t *regex6 = nano_regcomp("liquid", 0);
	/* These generate collisions, but should not be equal. */
	if (regex5 == regex6)
	puts("collision detection is failing. ");
	if (!regex5->trivial \|\| !regex6->trivial)
	puts("not trivial");


	nano_regfree(regex5);

	puts("the following should fail:");
	regex7 = nano_regcomp("[", 0);
	if (regex7 != NULL)
	puts("failed, regex7 is not null");

	if (nano_regfree(regex7))
	puts("freeing a null nregex_t returned true");

	printf("matching \"LIQUID\" with \"liquid\": %s\n", nano_regexec(regex6, "LIQUID", 0, NULL, 0) ? "ok" : "fail");
	nano_regfree(regex6);

	/* Adapted from https://www.ibm.com/support/knowledgecenter/en/ssw_ibm_i_72/rtref/regexec.htm */
	const char *string = "a very simple simple simple string";
	nregex_t *regex8 = nano_regcomp("(sim[a-z]le)", REG_EXTENDED);
	regmatch_t pmatch[1];

	if (regex8 == NULL) {
	puts("regex8 was null");
	} else if (nano_regexec(regex8, string, 1, pmatch, 0) == 0) {
	printf("matching substring \"%.*s\" is found at position %d to %d. should be \"simple\", range 7-12\n",
	(int)pmatch[0].rm_eo - (int)pmatch[0].rm_so, &string[pmatch[0].rm_so],
	(int)pmatch[0].rm_so, (int)pmatch[0].rm_eo - 1);
	} else {
	printf("regex8 failed to match");
	}

	nregex_t *regex9 = nano_regcomp("simple", REG_EXTENDED);
	if (regex9 == NULL \|\| !regex9->trivial) {
	puts("regex9 failed to compile properly");
	} else if (nano_regexec(regex9, string, 1, pmatch, 0) == 0) {
	printf("matching trivial substring \"%.*s\" is found at position %d to %d. should be \"simple\", range 7-12\n",
	(int)pmatch[0].rm_eo - (int)pmatch[0].rm_so, &string[pmatch[0].rm_so],
	(int)pmatch[0].rm_so, (int)pmatch[0].rm_eo - 1);
	} else {
	puts("regex9 failed to find a match");
	}

	nano_regfree(regex8);
	nano_regfree(regex9);
	return 0;
	}