angstyloop/g_match_with_mismatches_v3.c

## g_match_with_mismatches_v3.c
/* g_match_with_mismatches_v3.c
 *
 * Search for fuzzy matches of a pattern in text with @k or fewer mismatches
 * (substitutions). Uses doubly-linked list GList from GLib.
 *
 * COMPILE
 *
 * gcc `pkg-config --cflags glib-2.0` -o g_match_with_mismatches_v3 g_match_with_mismatches_v3.c `pkg-config --libs glib-2.0`
 *
 * RUN
 *
 * ./match_with_mismatches
 *
 * REFS
 *
 * The search and preprocess functions were taken from the example code from
 * "Fast and Practical Approximate String Matching" by Ricardo A. Baeza-Yates
 * and Chris H. Perleberg. I just modified the code to produce a GList of
 * match structs, and added more detailed comments, as well as a simple main()
 * function that uses the functions.
 */

#include <glib-2.0/glib.h>
#include <stdio.h>

/* Size of alpha index and count array. This number should be twice the size
 * of the alphabet, which is 128 in this case for ASCII (not extended).
 * The purpose of this extra space is explained later.
 */
#define ALPHABET_SIZE 256

/* The Match object will hold the index @i of the first character of the match
 * and the number of mismatched characters @k within that match. The Match
 * objects are also linked list nodes.
 */
typedef struct Match Match;
struct Match {
	int i;
	int k;
};

Match *
Match_new( int i, int k )
{
	Match *t;
	t = g_malloc( sizeof( Match ) );
	t->i = i;
	t->k = k;
	return t;
}

void
Match_free( gpointer match_, gpointer user_data )
{
	Match *match = (Match *) match_;
	if ( match )
		g_free( match );
}

void
Match_print( gpointer match_, gpointer user_data )
{
	Match *match = (Match *) match_;
	printf( "position: %d, errors: %d\n", match->i, match->k );
}

/* An array of lists. There are 128 linked lists in total - one for each
 * character in our 128-character alphabet. The last 128 lists characters are
 * added to the linked lists as needed.
 *
 * Each list will contain the offsets for each occurence of that character, or a
 * single placeholder offset of -1 if no occurences are found.
 *
 * The offset is the distance of the character to the left of the end of
 * pattern, (i.e., measured by counting to the left from the end of pattern),
 * for a given character and a given instance of pattern.
 */
GList *alpha[ALPHABET_SIZE];

/* This function initializes the @alpha and @count arrays to look like this:
 *
 *     alpha = [ [ -1 ] ] * ALPHABET_SIZE 	where the inner brackets are a GList.
 *
 *     count = [m] * m
 *
 * @alpha will be an array of linked lists. Characters in the pattern
 * that do not occur or that occur exactly once in the text will have
 * corresponding linked lists with length one. Characters in the pattern that
 * occur in the text more than once will have corresponding linked lists with
 * length greater than one.
 *
 * The first m - 1  elements of @count will be skipped on the first iteration of
 * the cyclic array (since no match can be shorter than the @pattern). Note that
 * the values in @count are reset to m once they are no longer needed, until the
 * next loop around @count.
 *
 * @p - pattern string
 * @m - pattern length
 * @alpha - array of GList. See above.
 * @count - circular buffer for counts of matches
 */
void preprocess( char *p, int m, GList *alpha[], int count[], int max_pattern_size )
{
	int i, j;

	for ( i = 0; i < ALPHABET_SIZE; i++ ) {
		alpha[i] = NULL;
		alpha[i] = g_list_append( alpha[i], GINT_TO_POINTER( -1 ) );
	}

	for ( i = 0, j = 128; i < m; i++, p++ ) {
		if ( GPOINTER_TO_INT( alpha[*p]->data ) == -1 )
			alpha[*p]->data = GINT_TO_POINTER( m - i - 1 );
		else
			alpha[*p] = g_list_append( alpha[*p],
				GINT_TO_POINTER( m - i - 1 ) );
	}

	for ( i = 0; i < max_pattern_size; i++ )
		count[i] = m;

}

void
increment_offset( gpointer off_, gpointer args_ )
{
	gpointer *args = (gpointer *) args_;
	int i = GPOINTER_TO_INT( args[0] );
	int max_pattern_size = GPOINTER_TO_INT( args[1] );
	int *count = (int *) args[2];
	gint off = GPOINTER_TO_INT( off_ ) ;
	count[(i + off) % max_pattern_size]--;
}

/* Find the position of the first character and number of mismatches of every
 * fuzzy match in a string @t with @k or fewer mismatches. Uses the array of
 * GList @alpha and the array of counts @count prepared by the preprocess
 * function.
 * @t - text string
 * @n - length of text string
 * @m - length of the pattern used to create @alpha and @count
 * @k - maximum number of allowed mismatches
 * @alpha - array of GList. See above.
 * @count - circular buffer for counts of matches
 */
GList *
search( char *t, int n, int m, int k, GList *alpha[], int count[], int max_pattern_size )
{
	int i, off, j;
	Match *match;
	GList *l0 = NULL, *l1 = NULL;

	/* Walk the text @t, which has length @n.
	 */
	for ( i = 0; i < n; i++ ) {
		/* If the current character in @t is in pattern, its
		 * corresponding list in @alpha will have a non-negative offset,
		 * thanks to the workdone by the preprocess function. If so, we
		 * need to decrement the counts in the circular buffer @count
		 * corresponding to the index of the character in the text and
		 * the offsets the lists corresponding to those characters,
		 * which the preprocess function prepared.
		 *
		 * Note that we will only ever need m counts at a time, and
		 * we reset them to @m when we are done with them, in case
		 * they are needed when the text wraps max_pattern_size
		 * characters.
		 */
		l0 = alpha[*t++];
		off = GPOINTER_TO_INT( l0->data );
		if ( off >= 0 ) {
			g_assert( l0 );
			gpointer t[3] = {
				GINT_TO_POINTER( i ),
				GINT_TO_POINTER( max_pattern_size ),
				(gpointer) count,
			};
			g_list_foreach( l0, increment_offset, t );
		}

		/* If the count in @count corresponding to the current index in
		 * the text is no greater than @k, the number of mismatches we
		 * allow, then the pattern instance is reported as a fuzzy
		 * match. The position of the first letter in the match is
		 * calculated using the pattern length and the index of the last
		 * character in the match The number of mismatches is calculated
		 * from the number of matches. The first m - 1 elements are
		 * skipped.
		 */
		if ( i >= m - 1 && count[i % max_pattern_size] <= k ) {
			g_assert( i - m + 1 >= 0 );
			match = Match_new( i - m + 1, count[i % max_pattern_size] );
			l1 = g_list_append( l1, match );
		}

		/* The count in @count corresponding to the current index in
		 * text is no longer needed, so we reset it to @m until we
		 * need it on the next wraparound.
		 */
		count[i % max_pattern_size] = m;
	}

	return l1;
}

/* This is a test harness for the code in this file.
 */
int main()
{

	/* Define the max pattern size. This can be INT_MAX (65535) if you want.
	 */
	const int max_pattern_size = 256;

	/* This array is used as a cyclic buffer for counts of the number of matching
	 * characters in a given instance of the pattern. The counts are maintained at
	 * the end of each pattern. When a pattern with k or fewer mismatches is found,
	 * it is reported. As the algorithm steps through the count array, it resets the
	 * counts it doesn't need anymore back to m, so they can be reused when the
	 * index in the text exceeds reaches 256 and needs to wrap around. The first m-1
	 * characters will be initialized to max_pattern_size, so they never have a valid
	 * number of mismatches.
	 */
	int count[max_pattern_size];

	char *text = "xxxadcxxx", *pattern = "abc";
	int n  = strlen( text ), m = strlen( pattern ), k;
	Match *match = NULL;
	GList *l0, *list;

	/* Test Match class
	 */
	printf( "\nTesting Match class..\n\n" );
	match = Match_new( 0, 0 );
	g_assert( match );
	Match_print( match, NULL );
	Match_free( match, NULL );
	printf( "\nDone testing Match class.\n\n" );

	/* Test preprocess and search functions.
	 */
	printf( "\nTesting \"preprocess\" and \"search\" functions...\n" );

	k = 0;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count, max_pattern_size );
	list = search( text, n, m, k, alpha, count, max_pattern_size );
	g_list_foreach( list, Match_print, NULL );
	g_list_foreach( list, Match_free, NULL  );
	if ( !g_list_length( list ) )
		printf( "No matches.\n" );

	k = 1;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count, max_pattern_size );
	list = search( text, n, m, k, alpha, count, max_pattern_size );
	g_list_foreach( list, Match_print, NULL );
	match = (Match *) g_list_nth_data( list , 0 );
	g_assert(  GPOINTER_TO_INT( match->i ) == 3 );
	g_assert(  GPOINTER_TO_INT( match->k ) == 1 );
	g_list_foreach( list, Match_free, NULL  );

	k = 2;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count, max_pattern_size );
	list = search( text, n, m, k, alpha, count, max_pattern_size );
	g_list_foreach( list, Match_print, NULL );
	match = (Match *) g_list_nth_data( list , 0 );
	g_assert(  GPOINTER_TO_INT( match->i ) == 3 );
	g_assert(  GPOINTER_TO_INT( match->k ) == 1 );
	g_list_foreach( list, Match_free, NULL  );

	k = 3;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count, max_pattern_size );
	list = search( text, n, m, k, alpha, count, max_pattern_size );
	g_list_foreach( list, Match_print, NULL );
	match = (Match *) g_list_nth_data( list , 3 );
	g_assert(  GPOINTER_TO_INT( match->i ) == 3 );
	g_assert(  GPOINTER_TO_INT( match->k ) == 1 );
	g_list_foreach( list, Match_free, NULL  );

	printf( "\nDone testing \"preprocess\" and \"search\" functions.\n\n" );

	return 0;
}
	/* g_match_with_mismatches_v3.c
	*
	* Search for fuzzy matches of a pattern in text with @k or fewer mismatches
	* (substitutions). Uses doubly-linked list GList from GLib.
	*
	* COMPILE
	*
	* gcc `pkg-config --cflags glib-2.0` -o g_match_with_mismatches_v3 g_match_with_mismatches_v3.c `pkg-config --libs glib-2.0`
	*
	* RUN
	*
	* ./match_with_mismatches
	*
	* REFS
	*
	* The search and preprocess functions were taken from the example code from
	* "Fast and Practical Approximate String Matching" by Ricardo A. Baeza-Yates
	* and Chris H. Perleberg. I just modified the code to produce a GList of
	* match structs, and added more detailed comments, as well as a simple main()
	* function that uses the functions.
	*/

	#include <glib-2.0/glib.h>
	#include <stdio.h>

	/* Size of alpha index and count array. This number should be twice the size
	* of the alphabet, which is 128 in this case for ASCII (not extended).
	* The purpose of this extra space is explained later.
	*/
	#define ALPHABET_SIZE 256

	/* The Match object will hold the index @i of the first character of the match
	* and the number of mismatched characters @k within that match. The Match
	* objects are also linked list nodes.
	*/
	typedef struct Match Match;
	struct Match {
	int i;
	int k;
	};

	Match *
	Match_new( int i, int k )
	{
	Match *t;
	t = g_malloc( sizeof( Match ) );
	t->i = i;
	t->k = k;
	return t;
	}

	void
	Match_free( gpointer match_, gpointer user_data )
	{
	Match match = (Match ) match_;
	if ( match )
	g_free( match );
	}

	void
	Match_print( gpointer match_, gpointer user_data )
	{
	Match match = (Match ) match_;
	printf( "position: %d, errors: %d\n", match->i, match->k );
	}

	/* An array of lists. There are 128 linked lists in total - one for each
	* character in our 128-character alphabet. The last 128 lists characters are
	* added to the linked lists as needed.
	*
	* Each list will contain the offsets for each occurence of that character, or a
	* single placeholder offset of -1 if no occurences are found.
	*
	* The offset is the distance of the character to the left of the end of
	* pattern, (i.e., measured by counting to the left from the end of pattern),
	* for a given character and a given instance of pattern.
	*/
	GList *alpha[ALPHABET_SIZE];

	/* This function initializes the @alpha and @count arrays to look like this:
	*
	* alpha = [ [ -1 ] ] * ALPHABET_SIZE where the inner brackets are a GList.
	*
	* count = [m] * m
	*
	* @alpha will be an array of linked lists. Characters in the pattern
	* that do not occur or that occur exactly once in the text will have
	* corresponding linked lists with length one. Characters in the pattern that
	* occur in the text more than once will have corresponding linked lists with
	* length greater than one.
	*
	* The first m - 1 elements of @count will be skipped on the first iteration of
	* the cyclic array (since no match can be shorter than the @pattern). Note that
	* the values in @count are reset to m once they are no longer needed, until the
	* next loop around @count.
	*
	* @p - pattern string
	* @m - pattern length
	* @alpha - array of GList. See above.
	* @count - circular buffer for counts of matches
	*/
	void preprocess( char p, int m, GList alpha[], int count[], int max_pattern_size )
	{
	int i, j;

	for ( i = 0; i < ALPHABET_SIZE; i++ ) {
	alpha[i] = NULL;
	alpha[i] = g_list_append( alpha[i], GINT_TO_POINTER( -1 ) );
	}

	for ( i = 0, j = 128; i < m; i++, p++ ) {
	if ( GPOINTER_TO_INT( alpha[*p]->data ) == -1 )
	alpha[*p]->data = GINT_TO_POINTER( m - i - 1 );
	else
	alpha[p] = g_list_append( alpha[p],
	GINT_TO_POINTER( m - i - 1 ) );
	}

	for ( i = 0; i < max_pattern_size; i++ )
	count[i] = m;

	}

	void
	increment_offset( gpointer off_, gpointer args_ )
	{
	gpointer args = (gpointer ) args_;
	int i = GPOINTER_TO_INT( args[0] );
	int max_pattern_size = GPOINTER_TO_INT( args[1] );
	int count = (int ) args[2];
	gint off = GPOINTER_TO_INT( off_ ) ;
	count[(i + off) % max_pattern_size]--;
	}

	/* Find the position of the first character and number of mismatches of every
	* fuzzy match in a string @t with @k or fewer mismatches. Uses the array of
	* GList @alpha and the array of counts @count prepared by the preprocess
	* function.
	* @t - text string
	* @n - length of text string
	* @m - length of the pattern used to create @alpha and @count
	* @k - maximum number of allowed mismatches
	* @alpha - array of GList. See above.
	* @count - circular buffer for counts of matches
	*/
	GList *
	search( char t, int n, int m, int k, GList alpha[], int count[], int max_pattern_size )
	{
	int i, off, j;
	Match *match;
	GList l0 = NULL, l1 = NULL;

	/* Walk the text @t, which has length @n.
	*/
	for ( i = 0; i < n; i++ ) {
	/* If the current character in @t is in pattern, its
	* corresponding list in @alpha will have a non-negative offset,
	* thanks to the workdone by the preprocess function. If so, we
	* need to decrement the counts in the circular buffer @count
	* corresponding to the index of the character in the text and
	* the offsets the lists corresponding to those characters,
	* which the preprocess function prepared.
	*
	* Note that we will only ever need m counts at a time, and
	* we reset them to @m when we are done with them, in case
	* they are needed when the text wraps max_pattern_size
	* characters.
	*/
	l0 = alpha[*t++];
	off = GPOINTER_TO_INT( l0->data );
	if ( off >= 0 ) {
	g_assert( l0 );
	gpointer t[3] = {
	GINT_TO_POINTER( i ),
	GINT_TO_POINTER( max_pattern_size ),
	(gpointer) count,
	};
	g_list_foreach( l0, increment_offset, t );
	}

	/* If the count in @count corresponding to the current index in
	* the text is no greater than @k, the number of mismatches we
	* allow, then the pattern instance is reported as a fuzzy
	* match. The position of the first letter in the match is
	* calculated using the pattern length and the index of the last
	* character in the match The number of mismatches is calculated
	* from the number of matches. The first m - 1 elements are
	* skipped.
	*/
	if ( i >= m - 1 && count[i % max_pattern_size] <= k ) {
	g_assert( i - m + 1 >= 0 );
	match = Match_new( i - m + 1, count[i % max_pattern_size] );
	l1 = g_list_append( l1, match );
	}

	/* The count in @count corresponding to the current index in
	* text is no longer needed, so we reset it to @m until we
	* need it on the next wraparound.
	*/
	count[i % max_pattern_size] = m;
	}

	return l1;
	}

	/* This is a test harness for the code in this file.
	*/
	int main()
	{

	/* Define the max pattern size. This can be INT_MAX (65535) if you want.
	*/
	const int max_pattern_size = 256;

	/* This array is used as a cyclic buffer for counts of the number of matching
	* characters in a given instance of the pattern. The counts are maintained at
	* the end of each pattern. When a pattern with k or fewer mismatches is found,
	* it is reported. As the algorithm steps through the count array, it resets the
	* counts it doesn't need anymore back to m, so they can be reused when the
	* index in the text exceeds reaches 256 and needs to wrap around. The first m-1
	* characters will be initialized to max_pattern_size, so they never have a valid
	* number of mismatches.
	*/
	int count[max_pattern_size];

	char text = "xxxadcxxx", pattern = "abc";
	int n = strlen( text ), m = strlen( pattern ), k;
	Match *match = NULL;
	GList l0, list;

	/* Test Match class
	*/
	printf( "\nTesting Match class..\n\n" );
	match = Match_new( 0, 0 );
	g_assert( match );
	Match_print( match, NULL );
	Match_free( match, NULL );
	printf( "\nDone testing Match class.\n\n" );

	/* Test preprocess and search functions.
	*/
	printf( "\nTesting \"preprocess\" and \"search\" functions...\n" );

	k = 0;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count, max_pattern_size );
	list = search( text, n, m, k, alpha, count, max_pattern_size );
	g_list_foreach( list, Match_print, NULL );
	g_list_foreach( list, Match_free, NULL );
	if ( !g_list_length( list ) )
	printf( "No matches.\n" );

	k = 1;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count, max_pattern_size );
	list = search( text, n, m, k, alpha, count, max_pattern_size );
	g_list_foreach( list, Match_print, NULL );
	match = (Match *) g_list_nth_data( list , 0 );
	g_assert( GPOINTER_TO_INT( match->i ) == 3 );
	g_assert( GPOINTER_TO_INT( match->k ) == 1 );
	g_list_foreach( list, Match_free, NULL );

	k = 2;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count, max_pattern_size );
	list = search( text, n, m, k, alpha, count, max_pattern_size );
	g_list_foreach( list, Match_print, NULL );
	match = (Match *) g_list_nth_data( list , 0 );
	g_assert( GPOINTER_TO_INT( match->i ) == 3 );
	g_assert( GPOINTER_TO_INT( match->k ) == 1 );
	g_list_foreach( list, Match_free, NULL );

	k = 3;
	printf( "\n...with number of allowed errors k = %d\n", k );
	preprocess( pattern, m, alpha, count, max_pattern_size );
	list = search( text, n, m, k, alpha, count, max_pattern_size );
	g_list_foreach( list, Match_print, NULL );
	match = (Match *) g_list_nth_data( list , 3 );
	g_assert( GPOINTER_TO_INT( match->i ) == 3 );
	g_assert( GPOINTER_TO_INT( match->k ) == 1 );
	g_list_foreach( list, Match_free, NULL );

	printf( "\nDone testing \"preprocess\" and \"search\" functions.\n\n" );

	return 0;
	}