angstyloop/match_with_mismatches.c

## match_with_mismatches.c
/* match_with_mismatches.c
 *
 * Search for fuzzy matches of a pattern in text with @k or fewer mismatches
 * (substitutions).
 *
 * COMPILE
 *
 * gcc -o match_with_mismatches match_with_mismatches.c
 *
 * RUN
 *
 * ./match_with_mismatches
 *
 * REFS
 *
 * The search and preprocess functions were taken from the example code from
 * "Fast and Practical Approximate String Matching" by Ricardo A. Baeza-Yates
 * and Chris H. Perleberg. I just modified the code to produce a linked list of
 * match structs, and added more detailed comments, as well as a simple main()
 * function that uses the functions.
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <assert.h>

/* Size of alpha index and count array. This number should be twice the size
 * of the alphabet, which is 128 in this case for ASCII (not extended).
 * The purpose of this extra space is explained later.
 */
#define SIZE 256

/* Used to mod 256 quickly instead of with the modulo operator. we can do this
 * because 256 is a power of two, and modding by a power of two is the same as
 *bitwise and with the same power of two. Math!
 */
#define MOD256 0xff

/* The nodes of a linked list. Each Node contains an offset of a mismatched
 * letter within a match ending at that letter, for a given letter. The offset
 * counts starting from the last character in the match and increasing to the
 * left.
 *
 * @offset - the distance of the character to the left of the end of pattern,
 * (i.e., measured by counting to the left from the end of pattern)
 *
 * @next - the next Node if it exists, or NULL otherwise.
 */
typedef struct Node Node;
struct Node {
	int offset;	/* distance of char from start of pattern */
	Node *next;	/* pointer to next idxnode if it exists */
};

/* The Match object will hold the index @i of the first character of the match
 * and the number of mismatched characters @k within that match. The Match
 * objects are also linked list nodes.
 */
typedef struct Match Match;
struct Match {
	int i;
	int k;
	Match *next;
};

Match *
Match_new( int i, int k, Match *next )
{
	Match *t;
	t = calloc( 1, sizeof( Match ) );
	t->i = i;
	t->k = k;
	t->next = next;
	return t;
}

Match *
Match_last( Match *match )
{
	while ( match->next )
		match = match->next;
	return match;
}

Match *
Match_append_new( Match* match, int i, int k, Match *next )
{
	Match *s, *t;
	s = Match_new( i, k, next );
	if ( !match )
		return s;
	t = Match_last( match );
	t->next = s;
	return match;
}

Match *
Match_free( Match *match )
{
	Match *t;
	while ( (t = match) ) {
		match = t->next;
		free(t);
	}
}

void Match_print( Match *match )
{
	while ( match ) {
		printf( "Match in position %d with %d mismatches.\n",
			match->i, match->k );
		match = match->next;
	}
}

int Match_length( Match *match )
{
	int l = 0;
	while ( match ) {
		l++;
		match = match->next;
	}
	return l;
}


/* An array of Nodes. The first 128 Nodes are the head Nodes of linked lists.
 * This means there are 128 linked lists in total - one for each character in
 * our 128-character alphabet. The last 128 Nodes are added to the linked
 * lists as needed. It is more efficient on average to just allocate space for
 * 128 Nodes statically - the most we might need - even if fewer Nodes are
 * used, instead of dynamically allocating nodes as needed. Using a power of
 * two also lets us quickly access the array in a cyclic manner by modding
 * using the bitwise & operator.
 */
Node alpha[SIZE];

/* This array is used as a cyclic buffer for counts of the number of matching
 * characters in a given instance of the pattern. The counts are maintained at
 * the end of each pattern. When a pattern with k or fewer mismatches is found,
 * it is reported. As the algorithm steps through the count array, it resets the
 * counts it doesn't need anymore back to m, so they can be reused when the
 * index in the text exceeds reaches 256 and needs to wrap around. The first m-1
 * characters will be initialized to SIZE, so they never have a valid number of
 * mismatches. Note that by making this optimization we are implicitly limiting
 * the pattern length to 256 characters.
 */
int count[SIZE];

/* This function initializes the @alpha and @count arrays to look like this:
 *
 *     alpha = [{ offset: -1, next: Node | NULL }] * 256
 *
 *     count = [256] * (m - 1) + [m] * (256 - m + 1)
 *
 * @alpha will be a list of heads of linked lists. Characters in the pattern
 * that do not occur or that occur exactly once in the text will have
 * corresponding linked lists with length one. Characters in the pattern that
 * occur in the text more than once will have corresponding linked lists with
 * length greater than one.
 *
 * The first m - 1  elements of @count will be initialized to SIZE=256, so that
 * the count never triggers a match for the first m - 1 characters. Such a match
 * would be shorter than the pattern, which is not allowed since we are only
 * considering mismatches (substitutions), not insertions or deletions.
 *
 * @p - pattern string
 * @m - pattern length
 * @alpha - array of Nodes. See above.
 * @count - circular buffer for counts of matches
 */
void preprocess( char *p, int m, Node alpha[], int count[] )
{
	int i, j;
	Node *node;

	for ( i = 0; i < SIZE; i++ ) {
		alpha[i].offset = -1;
		alpha[i].next = NULL;
		count[i] = m;
	}

	for ( i = 0, j = 128; i < m; i++, p++ ) {
		count[i] = SIZE;

		if ( alpha[*p].offset == -1 )
			alpha[*p].offset = m - i - 1;
		else {
			node = alpha[*p].next;
			alpha[*p].next = &alpha[j++];
			alpha[*p].next->offset = m - i - 1;
			alpha[*p].next->next = node;
		}
	}

	count[m - 1] = m;
}

/* Find the position of the first character and number of mismatches of every
 * fuzzy match in a string @t with @k or fewer mismatches. Uses the array of
 * Nodes @alpha and the array of counts @count prepared by the preprocess
 * function.
 * @t - text string
 * @n - length of text string
 * @m - length of the pattern used to create @alpha and @count
 * @k - maximum number of allowed mismatches
 * @alpha - array of Nodes. See above.
 * @count - circular buffer for counts of matches
 */
Match *
search( char *t, int n, int m, int k, Node alpha[], int count[] )
{
	int i, off1;
	Node *node;
	Match *match = NULL;

	/* Walk the text @t, which has length @n.
	 */
	for ( i = 0; i < n; i++ ) {
		/* If the current character in @t is in pattern, its
		 * corresponding Node in @alpha will have a non-negative offset,
		 * thanks to the workdone by the preprocess function. If so, we
		 * need to decrement the counts in the circular buffer @count
		 * corresponding to the index of the character in the text and
		 * the offsets the Nodes corresponding to those characters,
		 * which the preprocess function prepared.
		 *
		 * Note that we will only ever need m counts at a time, and
		 * we reset them to @m when we are done with them, in case
		 * they are needed when the text wraps 256 characters.
		 */
		if ( (off1 = (node = &alpha[*t++])->offset) >= 0 ) {
			count[(i + off1) & MOD256]--;

			for ( node = node->next; node != NULL; node = node->next )
				count[(i + node->offset) & MOD256]--;
		}

		/* If the count in @count corresponding to the current index in
		 * the text is no greater than @k, the number of mismatches we
		 * allow, then the pattern instance is reported as a fuzzy
		 * match. The position of the first letter in the match is
		 * calculated using the pattern length and the index of the last
		 * character in the match The number of mismatches is calculated
		 * from the number of matches.
		 */
		if ( count[i & MOD256] <= k )
			match = Match_append_new( match, i - m + 1, count[i & MOD256], NULL );

		/* The count in @count corresponding to the current index in
		 * text is no longer needed, so we reset it to @m until we
		 * need it on the next wraparound.
		 */
		count[i & MOD256] = m;
	}

	return match;
}

/* This is a test harness for the code in this file.
 */
int main()
{
	char *text = "xxxadcxxx", *pattern = "abc";
	int n  = strlen( text ), m = strlen( pattern ), k;
	Match *match = NULL, *first = NULL;

	/* Test Match class
	 */
	printf("\nTesting Match class..\n\n");
	assert( !Match_length( match ) );
	first = match = Match_new( 0, 0, NULL );
	assert( Match_length( match ) == 1 );
	match = Match_last( match );
	assert( match );
	match = Match_append_new( match, 1, 1, NULL);
	assert( match );
	assert( Match_length( match ) == 2 );
	match = Match_append_new( match, 2, 2, NULL);
	assert( match );
	assert( Match_length( match ) == 3 );
	Match_print( first );
	Match_free( first );
	printf("\nDone testing Match class.\n\n");

	/* Test preprocess and search functions.
	 */
	printf("\nTesting \"preprocess\" and \"search\" functions...\n");

	k = 0;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count );
	match = search( text, n, m, k, alpha, count );
	Match_print( match );
	assert( Match_length( match ) == 0 );
	Match_free( match );

	k = 1;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count );
	match = search( text, n, m, k, alpha, count );
	Match_print( match );
	assert( Match_length( match ) == 1 );
	Match_free( match );

	k = 2;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count );
	match = search( text, n, m, k, alpha, count );
	Match_print( match );
	assert( Match_length( match ) == 1 );
	Match_free( match );

	k = 3;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count );
	match = search( text, n, m, k, alpha, count );
	Match_print( match );
	assert( Match_length( match ) == n - m + 1 );
	Match_free( match );

	printf("\nDone testing \"preprocess\" and \"search\" functions.\n\n");

	return 0;
}
	/* match_with_mismatches.c
	*
	* Search for fuzzy matches of a pattern in text with @k or fewer mismatches
	* (substitutions).
	*
	* COMPILE
	*
	* gcc -o match_with_mismatches match_with_mismatches.c
	*
	* RUN
	*
	* ./match_with_mismatches
	*
	* REFS
	*
	* The search and preprocess functions were taken from the example code from
	* "Fast and Practical Approximate String Matching" by Ricardo A. Baeza-Yates
	* and Chris H. Perleberg. I just modified the code to produce a linked list of
	* match structs, and added more detailed comments, as well as a simple main()
	* function that uses the functions.
	*/

	#include <stdlib.h>
	#include <stdio.h>
	#include <string.h>

	#include <assert.h>

	/* Size of alpha index and count array. This number should be twice the size
	* of the alphabet, which is 128 in this case for ASCII (not extended).
	* The purpose of this extra space is explained later.
	*/
	#define SIZE 256

	/* Used to mod 256 quickly instead of with the modulo operator. we can do this
	* because 256 is a power of two, and modding by a power of two is the same as
	*bitwise and with the same power of two. Math!
	*/
	#define MOD256 0xff

	/* The nodes of a linked list. Each Node contains an offset of a mismatched
	* letter within a match ending at that letter, for a given letter. The offset
	* counts starting from the last character in the match and increasing to the
	* left.
	*
	* @offset - the distance of the character to the left of the end of pattern,
	* (i.e., measured by counting to the left from the end of pattern)
	*
	* @next - the next Node if it exists, or NULL otherwise.
	*/
	typedef struct Node Node;
	struct Node {
	int offset; /* distance of char from start of pattern */
	Node next; / pointer to next idxnode if it exists */
	};

	/* The Match object will hold the index @i of the first character of the match
	* and the number of mismatched characters @k within that match. The Match
	* objects are also linked list nodes.
	*/
	typedef struct Match Match;
	struct Match {
	int i;
	int k;
	Match *next;
	};

	Match *
	Match_new( int i, int k, Match *next )
	{
	Match *t;
	t = calloc( 1, sizeof( Match ) );
	t->i = i;
	t->k = k;
	t->next = next;
	return t;
	}

	Match *
	Match_last( Match *match )
	{
	while ( match->next )
	match = match->next;
	return match;
	}

	Match *
	Match_append_new( Match* match, int i, int k, Match *next )
	{
	Match s, t;
	s = Match_new( i, k, next );
	if ( !match )
	return s;
	t = Match_last( match );
	t->next = s;
	return match;
	}

	Match *
	Match_free( Match *match )
	{
	Match *t;
	while ( (t = match) ) {
	match = t->next;
	free(t);
	}
	}

	void Match_print( Match *match )
	{
	while ( match ) {
	printf( "Match in position %d with %d mismatches.\n",
	match->i, match->k );
	match = match->next;
	}
	}

	int Match_length( Match *match )
	{
	int l = 0;
	while ( match ) {
	l++;
	match = match->next;
	}
	return l;
	}


	/* An array of Nodes. The first 128 Nodes are the head Nodes of linked lists.
	* This means there are 128 linked lists in total - one for each character in
	* our 128-character alphabet. The last 128 Nodes are added to the linked
	* lists as needed. It is more efficient on average to just allocate space for
	* 128 Nodes statically - the most we might need - even if fewer Nodes are
	* used, instead of dynamically allocating nodes as needed. Using a power of
	* two also lets us quickly access the array in a cyclic manner by modding
	* using the bitwise & operator.
	*/
	Node alpha[SIZE];

	/* This array is used as a cyclic buffer for counts of the number of matching
	* characters in a given instance of the pattern. The counts are maintained at
	* the end of each pattern. When a pattern with k or fewer mismatches is found,
	* it is reported. As the algorithm steps through the count array, it resets the
	* counts it doesn't need anymore back to m, so they can be reused when the
	* index in the text exceeds reaches 256 and needs to wrap around. The first m-1
	* characters will be initialized to SIZE, so they never have a valid number of
	* mismatches. Note that by making this optimization we are implicitly limiting
	* the pattern length to 256 characters.
	*/
	int count[SIZE];

	/* This function initializes the @alpha and @count arrays to look like this:
	*
	* alpha = [{ offset: -1, next: Node \| NULL }] * 256
	*
	* count = [256] * (m - 1) + [m] * (256 - m + 1)
	*
	* @alpha will be a list of heads of linked lists. Characters in the pattern
	* that do not occur or that occur exactly once in the text will have
	* corresponding linked lists with length one. Characters in the pattern that
	* occur in the text more than once will have corresponding linked lists with
	* length greater than one.
	*
	* The first m - 1 elements of @count will be initialized to SIZE=256, so that
	* the count never triggers a match for the first m - 1 characters. Such a match
	* would be shorter than the pattern, which is not allowed since we are only
	* considering mismatches (substitutions), not insertions or deletions.
	*
	* @p - pattern string
	* @m - pattern length
	* @alpha - array of Nodes. See above.
	* @count - circular buffer for counts of matches
	*/
	void preprocess( char *p, int m, Node alpha[], int count[] )
	{
	int i, j;
	Node *node;

	for ( i = 0; i < SIZE; i++ ) {
	alpha[i].offset = -1;
	alpha[i].next = NULL;
	count[i] = m;
	}

	for ( i = 0, j = 128; i < m; i++, p++ ) {
	count[i] = SIZE;

	if ( alpha[*p].offset == -1 )
	alpha[*p].offset = m - i - 1;
	else {
	node = alpha[*p].next;
	alpha[*p].next = &alpha[j++];
	alpha[*p].next->offset = m - i - 1;
	alpha[*p].next->next = node;
	}
	}

	count[m - 1] = m;
	}

	/* Find the position of the first character and number of mismatches of every
	* fuzzy match in a string @t with @k or fewer mismatches. Uses the array of
	* Nodes @alpha and the array of counts @count prepared by the preprocess
	* function.
	* @t - text string
	* @n - length of text string
	* @m - length of the pattern used to create @alpha and @count
	* @k - maximum number of allowed mismatches
	* @alpha - array of Nodes. See above.
	* @count - circular buffer for counts of matches
	*/
	Match *
	search( char *t, int n, int m, int k, Node alpha[], int count[] )
	{
	int i, off1;
	Node *node;
	Match *match = NULL;

	/* Walk the text @t, which has length @n.
	*/
	for ( i = 0; i < n; i++ ) {
	/* If the current character in @t is in pattern, its
	* corresponding Node in @alpha will have a non-negative offset,
	* thanks to the workdone by the preprocess function. If so, we
	* need to decrement the counts in the circular buffer @count
	* corresponding to the index of the character in the text and
	* the offsets the Nodes corresponding to those characters,
	* which the preprocess function prepared.
	*
	* Note that we will only ever need m counts at a time, and
	* we reset them to @m when we are done with them, in case
	* they are needed when the text wraps 256 characters.
	*/
	if ( (off1 = (node = &alpha[*t++])->offset) >= 0 ) {
	count[(i + off1) & MOD256]--;

	for ( node = node->next; node != NULL; node = node->next )
	count[(i + node->offset) & MOD256]--;
	}

	/* If the count in @count corresponding to the current index in
	* the text is no greater than @k, the number of mismatches we
	* allow, then the pattern instance is reported as a fuzzy
	* match. The position of the first letter in the match is
	* calculated using the pattern length and the index of the last
	* character in the match The number of mismatches is calculated
	* from the number of matches.
	*/
	if ( count[i & MOD256] <= k )
	match = Match_append_new( match, i - m + 1, count[i & MOD256], NULL );

	/* The count in @count corresponding to the current index in
	* text is no longer needed, so we reset it to @m until we
	* need it on the next wraparound.
	*/
	count[i & MOD256] = m;
	}

	return match;
	}

	/* This is a test harness for the code in this file.
	*/
	int main()
	{
	char text = "xxxadcxxx", pattern = "abc";
	int n = strlen( text ), m = strlen( pattern ), k;
	Match match = NULL, first = NULL;

	/* Test Match class
	*/
	printf("\nTesting Match class..\n\n");
	assert( !Match_length( match ) );
	first = match = Match_new( 0, 0, NULL );
	assert( Match_length( match ) == 1 );
	match = Match_last( match );
	assert( match );
	match = Match_append_new( match, 1, 1, NULL);
	assert( match );
	assert( Match_length( match ) == 2 );
	match = Match_append_new( match, 2, 2, NULL);
	assert( match );
	assert( Match_length( match ) == 3 );
	Match_print( first );
	Match_free( first );
	printf("\nDone testing Match class.\n\n");

	/* Test preprocess and search functions.
	*/
	printf("\nTesting \"preprocess\" and \"search\" functions...\n");

	k = 0;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count );
	match = search( text, n, m, k, alpha, count );
	Match_print( match );
	assert( Match_length( match ) == 0 );
	Match_free( match );

	k = 1;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count );
	match = search( text, n, m, k, alpha, count );
	Match_print( match );
	assert( Match_length( match ) == 1 );
	Match_free( match );

	k = 2;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count );
	match = search( text, n, m, k, alpha, count );
	Match_print( match );
	assert( Match_length( match ) == 1 );
	Match_free( match );

	k = 3;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count );
	match = search( text, n, m, k, alpha, count );
	Match_print( match );
	assert( Match_length( match ) == n - m + 1 );
	Match_free( match );

	printf("\nDone testing \"preprocess\" and \"search\" functions.\n\n");

	return 0;
	}