Last active
June 12, 2023 21:00
-
-
Save angstyloop/fdd9884b2a4dec8a730e6a697869928c to your computer and use it in GitHub Desktop.
Search for fuzzy matches of a pattern in text with @k or fewer mismatches (substitutions) in C. The search and preprocess functions were taken from the example code from "Fast and Practical Approximate String Matching" by Ricardo A. Baeza-Yates and Chris H. Perleberg. This version is very optimized, and has a fixed maximum pattern length. For a …
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* match_with_mismatches.c | |
* | |
* Search for fuzzy matches of a pattern in text with @k or fewer mismatches | |
* (substitutions). | |
* | |
* COMPILE | |
* | |
* gcc -o match_with_mismatches match_with_mismatches.c | |
* | |
* RUN | |
* | |
* ./match_with_mismatches | |
* | |
* REFS | |
* | |
* The search and preprocess functions were taken from the example code from | |
* "Fast and Practical Approximate String Matching" by Ricardo A. Baeza-Yates | |
* and Chris H. Perleberg. I just modified the code to produce a linked list of | |
* match structs, and added more detailed comments, as well as a simple main() | |
* function that uses the functions. | |
*/ | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <assert.h> | |
/* Size of alpha index and count array. This number should be twice the size | |
* of the alphabet, which is 128 in this case for ASCII (not extended). | |
* The purpose of this extra space is explained later. | |
*/ | |
#define SIZE 256 | |
/* Used to mod 256 quickly instead of with the modulo operator. we can do this | |
* because 256 is a power of two, and modding by a power of two is the same as | |
*bitwise and with the same power of two. Math! | |
*/ | |
#define MOD256 0xff | |
/* The nodes of a linked list. Each Node contains an offset of a mismatched | |
* letter within a match ending at that letter, for a given letter. The offset | |
* counts starting from the last character in the match and increasing to the | |
* left. | |
* | |
* @offset - the distance of the character to the left of the end of pattern, | |
* (i.e., measured by counting to the left from the end of pattern) | |
* | |
* @next - the next Node if it exists, or NULL otherwise. | |
*/ | |
typedef struct Node Node; | |
struct Node { | |
int offset; /* distance of char from start of pattern */ | |
Node *next; /* pointer to next idxnode if it exists */ | |
}; | |
/* The Match object will hold the index @i of the first character of the match | |
* and the number of mismatched characters @k within that match. The Match | |
* objects are also linked list nodes. | |
*/ | |
typedef struct Match Match; | |
struct Match { | |
int i; | |
int k; | |
Match *next; | |
}; | |
Match * | |
Match_new( int i, int k, Match *next ) | |
{ | |
Match *t; | |
t = calloc( 1, sizeof( Match ) ); | |
t->i = i; | |
t->k = k; | |
t->next = next; | |
return t; | |
} | |
Match * | |
Match_last( Match *match ) | |
{ | |
while ( match->next ) | |
match = match->next; | |
return match; | |
} | |
Match * | |
Match_append_new( Match* match, int i, int k, Match *next ) | |
{ | |
Match *s, *t; | |
s = Match_new( i, k, next ); | |
if ( !match ) | |
return s; | |
t = Match_last( match ); | |
t->next = s; | |
return match; | |
} | |
Match * | |
Match_free( Match *match ) | |
{ | |
Match *t; | |
while ( (t = match) ) { | |
match = t->next; | |
free(t); | |
} | |
} | |
void Match_print( Match *match ) | |
{ | |
while ( match ) { | |
printf( "Match in position %d with %d mismatches.\n", | |
match->i, match->k ); | |
match = match->next; | |
} | |
} | |
int Match_length( Match *match ) | |
{ | |
int l = 0; | |
while ( match ) { | |
l++; | |
match = match->next; | |
} | |
return l; | |
} | |
/* An array of Nodes. The first 128 Nodes are the head Nodes of linked lists. | |
* This means there are 128 linked lists in total - one for each character in | |
* our 128-character alphabet. The last 128 Nodes are added to the linked | |
* lists as needed. It is more efficient on average to just allocate space for | |
* 128 Nodes statically - the most we might need - even if fewer Nodes are | |
* used, instead of dynamically allocating nodes as needed. Using a power of | |
* two also lets us quickly access the array in a cyclic manner by modding | |
* using the bitwise & operator. | |
*/ | |
Node alpha[SIZE]; | |
/* This array is used as a cyclic buffer for counts of the number of matching | |
* characters in a given instance of the pattern. The counts are maintained at | |
* the end of each pattern. When a pattern with k or fewer mismatches is found, | |
* it is reported. As the algorithm steps through the count array, it resets the | |
* counts it doesn't need anymore back to m, so they can be reused when the | |
* index in the text exceeds reaches 256 and needs to wrap around. The first m-1 | |
* characters will be initialized to SIZE, so they never have a valid number of | |
* mismatches. Note that by making this optimization we are implicitly limiting | |
* the pattern length to 256 characters. | |
*/ | |
int count[SIZE]; | |
/* This function initializes the @alpha and @count arrays to look like this: | |
* | |
* alpha = [{ offset: -1, next: Node | NULL }] * 256 | |
* | |
* count = [256] * (m - 1) + [m] * (256 - m + 1) | |
* | |
* @alpha will be a list of heads of linked lists. Characters in the pattern | |
* that do not occur or that occur exactly once in the text will have | |
* corresponding linked lists with length one. Characters in the pattern that | |
* occur in the text more than once will have corresponding linked lists with | |
* length greater than one. | |
* | |
* The first m - 1 elements of @count will be initialized to SIZE=256, so that | |
* the count never triggers a match for the first m - 1 characters. Such a match | |
* would be shorter than the pattern, which is not allowed since we are only | |
* considering mismatches (substitutions), not insertions or deletions. | |
* | |
* @p - pattern string | |
* @m - pattern length | |
* @alpha - array of Nodes. See above. | |
* @count - circular buffer for counts of matches | |
*/ | |
void preprocess( char *p, int m, Node alpha[], int count[] ) | |
{ | |
int i, j; | |
Node *node; | |
for ( i = 0; i < SIZE; i++ ) { | |
alpha[i].offset = -1; | |
alpha[i].next = NULL; | |
count[i] = m; | |
} | |
for ( i = 0, j = 128; i < m; i++, p++ ) { | |
count[i] = SIZE; | |
if ( alpha[*p].offset == -1 ) | |
alpha[*p].offset = m - i - 1; | |
else { | |
node = alpha[*p].next; | |
alpha[*p].next = &alpha[j++]; | |
alpha[*p].next->offset = m - i - 1; | |
alpha[*p].next->next = node; | |
} | |
} | |
count[m - 1] = m; | |
} | |
/* Find the position of the first character and number of mismatches of every | |
* fuzzy match in a string @t with @k or fewer mismatches. Uses the array of | |
* Nodes @alpha and the array of counts @count prepared by the preprocess | |
* function. | |
* @t - text string | |
* @n - length of text string | |
* @m - length of the pattern used to create @alpha and @count | |
* @k - maximum number of allowed mismatches | |
* @alpha - array of Nodes. See above. | |
* @count - circular buffer for counts of matches | |
*/ | |
Match * | |
search( char *t, int n, int m, int k, Node alpha[], int count[] ) | |
{ | |
int i, off1; | |
Node *node; | |
Match *match = NULL; | |
/* Walk the text @t, which has length @n. | |
*/ | |
for ( i = 0; i < n; i++ ) { | |
/* If the current character in @t is in pattern, its | |
* corresponding Node in @alpha will have a non-negative offset, | |
* thanks to the workdone by the preprocess function. If so, we | |
* need to decrement the counts in the circular buffer @count | |
* corresponding to the index of the character in the text and | |
* the offsets the Nodes corresponding to those characters, | |
* which the preprocess function prepared. | |
* | |
* Note that we will only ever need m counts at a time, and | |
* we reset them to @m when we are done with them, in case | |
* they are needed when the text wraps 256 characters. | |
*/ | |
if ( (off1 = (node = &alpha[*t++])->offset) >= 0 ) { | |
count[(i + off1) & MOD256]--; | |
for ( node = node->next; node != NULL; node = node->next ) | |
count[(i + node->offset) & MOD256]--; | |
} | |
/* If the count in @count corresponding to the current index in | |
* the text is no greater than @k, the number of mismatches we | |
* allow, then the pattern instance is reported as a fuzzy | |
* match. The position of the first letter in the match is | |
* calculated using the pattern length and the index of the last | |
* character in the match The number of mismatches is calculated | |
* from the number of matches. | |
*/ | |
if ( count[i & MOD256] <= k ) | |
match = Match_append_new( match, i - m + 1, count[i & MOD256], NULL ); | |
/* The count in @count corresponding to the current index in | |
* text is no longer needed, so we reset it to @m until we | |
* need it on the next wraparound. | |
*/ | |
count[i & MOD256] = m; | |
} | |
return match; | |
} | |
/* This is a test harness for the code in this file. | |
*/ | |
int main() | |
{ | |
char *text = "xxxadcxxx", *pattern = "abc"; | |
int n = strlen( text ), m = strlen( pattern ), k; | |
Match *match = NULL, *first = NULL; | |
/* Test Match class | |
*/ | |
printf("\nTesting Match class..\n\n"); | |
assert( !Match_length( match ) ); | |
first = match = Match_new( 0, 0, NULL ); | |
assert( Match_length( match ) == 1 ); | |
match = Match_last( match ); | |
assert( match ); | |
match = Match_append_new( match, 1, 1, NULL); | |
assert( match ); | |
assert( Match_length( match ) == 2 ); | |
match = Match_append_new( match, 2, 2, NULL); | |
assert( match ); | |
assert( Match_length( match ) == 3 ); | |
Match_print( first ); | |
Match_free( first ); | |
printf("\nDone testing Match class.\n\n"); | |
/* Test preprocess and search functions. | |
*/ | |
printf("\nTesting \"preprocess\" and \"search\" functions...\n"); | |
k = 0; | |
printf( "\n...with number of allowed errors k = %d\n", k ); | |
preprocess( pattern, m, alpha, count ); | |
match = search( text, n, m, k, alpha, count ); | |
Match_print( match ); | |
assert( Match_length( match ) == 0 ); | |
Match_free( match ); | |
k = 1; | |
printf( "\n...with number of allowed errors k = %d\n", k ); | |
preprocess( pattern, m, alpha, count ); | |
match = search( text, n, m, k, alpha, count ); | |
Match_print( match ); | |
assert( Match_length( match ) == 1 ); | |
Match_free( match ); | |
k = 2; | |
printf( "\n...with number of allowed errors k = %d\n", k ); | |
preprocess( pattern, m, alpha, count ); | |
match = search( text, n, m, k, alpha, count ); | |
Match_print( match ); | |
assert( Match_length( match ) == 1 ); | |
Match_free( match ); | |
k = 3; | |
printf( "\n...with number of allowed errors k = %d\n", k ); | |
preprocess( pattern, m, alpha, count ); | |
match = search( text, n, m, k, alpha, count ); | |
Match_print( match ); | |
assert( Match_length( match ) == n - m + 1 ); | |
Match_free( match ); | |
printf("\nDone testing \"preprocess\" and \"search\" functions.\n\n"); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Here is the output of the simple compiled example program: