edhemphill/regextest.cpp

## regextest.cpp
// http://www.wigwag.com/devblog/using-icu4c-regex-test-program/
// The following code is freeware:
// regextest.c
// Author: ed
//
// Simple test program for regex using ICU's regex matching.

#include <stdio.h>
#include <getopt.h>
#include <string.h>
#ifdef __cplusplus
#include <cstdlib>
#include <vector>
#endif
#include <unicode/regex.h>
#include <unicode/utext.h>
#include <unicode/errorcode.h>
#include <unicode/ucnv.h>
#include <unicode/utypes.h>
#include <unicode/uchar.h>

// Usage text. main() prints it and exits with status 1 when fewer than two
// positional arguments are given (fewer than three when -r is present).
static const char OptionsInfo[] = "Usage: regextest [-r] \"PATTERN\" \"TEST STRING\"\n"
								  "-r : replace mode -> \"PATTERN\" \"TEST STRING\" \"replacement\"\n"
								  "You need to escape these items like this: \\\" and \\$ b/c of bash.\n"
								  "\n";

/* ICU library libs/programs...
 * http://www.linuxfromscratch.org/blfs/view/svn/general/icu.html
 * http://icu-project.org/download/4.4.html#ICU4C
 *
 */

int main(int argc, char *argv[]) {
	int exitcode = 1;
	bool replace_mode=false;
    int c;
    int digit_optind = 0;

    while (1)
      {
        c = getopt (argc, argv, "r");
        if (c == -1)
        	break;

        switch (c)
          {
          case 'r':
        	  replace_mode = true;
        	  break;
          }
      }

	if(argc-optind < 2) {
		printf("%s",OptionsInfo);
		exit(1);
	}
	if(replace_mode && argc-optind < 3) {
		printf("%s",OptionsInfo);
		exit(1);
	}


	printf("pattern:     -->[%s]<--\n", argv[optind]);
	printf("test on:     -->[%s]<--\n", argv[optind+1]);

	UErrorCode        status    = U_ZERO_ERROR;
	UText *regex1 = NULL;
	UText *matchthis = NULL;

	//	static const char *regex_validate_string = "(?:cp\\:([0-9]+)\\:){0,1}\"(.*)\"";

	regex1 = utext_openUTF8(regex1, argv[optind], -1, &status);
	//regex1 = utext_openUTF8(regex1, regex_validate_string, -1, &status);
	matchthis = utext_openUTF8(matchthis, argv[optind+1], -1, &status);

	RegexMatcher *matcher = new RegexMatcher(regex1, 0, status);
	if (U_FAILURE(status)) {
		// Handle any syntax errors in the regular expression here
		printf("Syntax error in regex?\n");
		icu::ErrorCode ec;
		ec.set(status);
		printf("Error was: %s\n",ec.errorName());
		utext_close(regex1);
		utext_close(matchthis);
		delete matcher;
		exit(exitcode);
	}

	if(!replace_mode) { // MATCH TEST

		//	UnicodeString    stringToTest = "Find the abc in this string";
		matcher->reset(matchthis);

		if (matcher->matches(status)) {
			// We found a match.
			printf("MATCH: string matches regex\n");
			//	   int startOfMatch = matcher->start(status);   // string index of start of match.

			if(matcher->groupCount() > 0) {
				UConverter *conv = ucnv_open("US-ASCII", &status);
				for(int x=0;x<=matcher->groupCount();x++) {
					//				UText *grp = NULL;
					UnicodeString US = matcher->group(x, status);
					if (U_FAILURE(status)) {
						icu::ErrorCode ec;
						ec.set(status);
						printf("Error was: %s\n",ec.errorName());
					} else {
						UChar *out = (UChar *) malloc(1000);
						char *outcs = (char *) malloc(1000);
						US.extract(out,1000,status);
						ucnv_fromUChars(conv,outcs,1000,out,u_strlen(out),&status);
						printf("Capture group %d: -->[%s]<--\n", x, outcs); // works as long as UTF8
						//					printf("Capture group %d: %s\n", x, grp); // works as long as UTF8
						if(out) free(out);
						if(outcs) free(outcs);
					}
				}
			}

			exitcode = 0;
		} else {
			printf("FAIL: no match\n");
			exitcode= 2;
		}

	} else { // REPLACER TEST
		UText *replacedtxt = NULL;
		UText *replacement = NULL;
		replacement = utext_openUTF8(replacement, argv[optind+2], -1, &status);
		matcher->reset(matchthis);
		printf("replacement: -->[%s]<--\n", argv[optind+2]);

		replacedtxt = matcher->replaceAll(replacement,replacedtxt,status);
		if (U_FAILURE(status)) {
			// Handle any syntax errors in the regular expression here
			printf("Syntax error in regex?\n");
			icu::ErrorCode ec;
			ec.set(status);
			printf("Error was: %s\n",ec.errorName());
			utext_close(regex1);
			utext_close(matchthis);
			utext_close(replacement);
			delete matcher;
			exit(exitcode);
		}

		if(replacedtxt) {
			// Replacement did something...
			printf("REPLACE: string replaced...\n");
			//	   int startOfMatch = matcher->start(status);   // string index of start of match.
			UChar buf[500];
			printf("new length: %ld\n", utext_nativeLength(replacedtxt));
			utext_extract(replacedtxt,0,500,buf,500,&status);

			if (U_FAILURE(status)) {
				// Handle any syntax errors in the regular expression here
				printf("Syntax error in extraction:\n");
				icu::ErrorCode ec;
				ec.set(status);
				printf("Error was: %s\n",ec.errorName());
				utext_close(regex1);
				utext_close(matchthis);
				utext_close(replacedtxt);
				utext_close(replacement);
				delete matcher;
				exit(exitcode);
			}

			// Yawn... and convert to ascii.  Can we make it any more complicated??
			UConverter *conv = ucnv_open("US-ASCII", &status);
			char *outcs = (char *) malloc(1000);
			ucnv_fromUChars(conv,outcs,1000,buf,u_strlen(buf),&status);

			// Also, use function u_strFromUTF8 / u_strToUTF8

			if (U_FAILURE(status)) {
				// Handle any syntax errors in the regular expression here
				printf("Syntax error in extraction:\n");
				icu::ErrorCode ec;
				ec.set(status);
				printf("Error was: %s\n",ec.errorName());
				utext_close(regex1);
				utext_close(matchthis);
				utext_close(replacedtxt);
				utext_close(replacement);
				free(outcs);
				delete matcher;
				exit(exitcode);
			}
			ucnv_close(conv);

			printf("Replaced result: -->[%s]<--\n", outcs); // works as long as UTF8
			//					printf("Capture group %d: %s\n", x, grp); // works as long as UTF8
			if(outcs) free(outcs);


			utext_close(replacedtxt);
		} else {
			printf("Result: No replacement.\n");
		}

		utext_close(replacement);

	}
	utext_close(regex1);
	utext_close(matchthis);
	delete matcher;
	exit(exitcode);
}

/*
 *   Build: g++ regextest.cpp -licui18n -licuuc -licudata -o regextest
 */
	// http://www.wigwag.com/devblog/using-icu4c-regex-test-program/
	// The following code is freeware:
	// regextest.c
	// Author: ed
	//
	// Simple test program for regex using ICU's regex matching.

	#include <stdio.h>
	#include <getopt.h>
	#include <string.h>
	#ifdef __cplusplus
	#include <cstdlib>
	#endif
	#include <unicode/regex.h>
	#include <unicode/utext.h>
	#include <unicode/errorcode.h>
	#include <unicode/ucnv.h>
	#include <unicode/utypes.h>
	#include <unicode/uchar.h>

	// Usage text. main() prints it and exits with status 1 when fewer than two
	// positional arguments are given (fewer than three when -r is present).
	static const char OptionsInfo[] = "Usage: regextest [-r] \"PATTERN\" \"TEST STRING\"\n"
	"-r : replace mode -> \"PATTERN\" \"TEST STRING\" \"replacement\"\n"
	"You need to escape these items like this: \\\" and \\$ b/c of bash.\n"
	"\n";

	/* ICU library libs/programs...
	* http://www.linuxfromscratch.org/blfs/view/svn/general/icu.html
	* http://icu-project.org/download/4.4.html#ICU4C
	*
	*/

	int main(int argc, char *argv[]) {
	int exitcode = 1;
	bool replace_mode=false;
	int c;
	int digit_optind = 0;

	while (1)
	{
	c = getopt (argc, argv, "r");
	if (c == -1)
	break;

	switch (c)
	{
	case 'r':
	replace_mode = true;
	break;
	}
	}

	if(argc-optind < 2) {
	printf("%s",OptionsInfo);
	exit(1);
	}
	if(replace_mode && argc-optind < 3) {
	printf("%s",OptionsInfo);
	exit(1);
	}


	printf("pattern: -->[%s]<--\n", argv[optind]);
	printf("test on: -->[%s]<--\n", argv[optind+1]);

	UErrorCode status = U_ZERO_ERROR;
	UText *regex1 = NULL;
	UText *matchthis = NULL;

	// static const char regex_validate_string = "(?:cp\\:([0-9]+)\\:){0,1}\"(.)\"";

	regex1 = utext_openUTF8(regex1, argv[optind], -1, &status);
	//regex1 = utext_openUTF8(regex1, regex_validate_string, -1, &status);
	matchthis = utext_openUTF8(matchthis, argv[optind+1], -1, &status);

	RegexMatcher *matcher = new RegexMatcher(regex1, 0, status);
	if (U_FAILURE(status)) {
	// Handle any syntax errors in the regular expression here
	printf("Syntax error in regex?\n");
	icu::ErrorCode ec;
	ec.set(status);
	printf("Error was: %s\n",ec.errorName());
	utext_close(regex1);
	utext_close(matchthis);
	delete matcher;
	exit(exitcode);
	}

	if(!replace_mode) { // MATCH TEST

	// UnicodeString stringToTest = "Find the abc in this string";
	matcher->reset(matchthis);

	if (matcher->matches(status)) {
	// We found a match.
	printf("MATCH: string matches regex\n");
	// int startOfMatch = matcher->start(status); // string index of start of match.

	if(matcher->groupCount() > 0) {
	UConverter *conv = ucnv_open("US-ASCII", &status);
	for(int x=0;x<=matcher->groupCount();x++) {
	// UText *grp = NULL;
	UnicodeString US = matcher->group(x, status);
	if (U_FAILURE(status)) {
	icu::ErrorCode ec;
	ec.set(status);
	printf("Error was: %s\n",ec.errorName());
	} else {
	UChar out = (UChar ) malloc(1000);
	char outcs = (char ) malloc(1000);
	US.extract(out,1000,status);
	ucnv_fromUChars(conv,outcs,1000,out,u_strlen(out),&status);
	printf("Capture group %d: -->[%s]<--\n", x, outcs); // works as long as UTF8
	// printf("Capture group %d: %s\n", x, grp); // works as long as UTF8
	if(out) free(out);
	if(outcs) free(outcs);
	}
	}
	}

	exitcode = 0;
	} else {
	printf("FAIL: no match\n");
	exitcode= 2;
	}

	} else { // REPLACER TEST
	UText *replacedtxt = NULL;
	UText *replacement = NULL;
	replacement = utext_openUTF8(replacement, argv[optind+2], -1, &status);
	matcher->reset(matchthis);
	printf("replacement: -->[%s]<--\n", argv[optind+2]);

	replacedtxt = matcher->replaceAll(replacement,replacedtxt,status);
	if (U_FAILURE(status)) {
	// Handle any syntax errors in the regular expression here
	printf("Syntax error in regex?\n");
	icu::ErrorCode ec;
	ec.set(status);
	printf("Error was: %s\n",ec.errorName());
	utext_close(regex1);
	utext_close(matchthis);
	utext_close(replacement);
	delete matcher;
	exit(exitcode);
	}

	if(replacedtxt) {
	// Replacement did something...
	printf("REPLACE: string replaced...\n");
	// int startOfMatch = matcher->start(status); // string index of start of match.
	UChar buf[500];
	printf("new length: %ld\n", utext_nativeLength(replacedtxt));
	utext_extract(replacedtxt,0,500,buf,500,&status);

	if (U_FAILURE(status)) {
	// Handle any syntax errors in the regular expression here
	printf("Syntax error in extraction:\n");
	icu::ErrorCode ec;
	ec.set(status);
	printf("Error was: %s\n",ec.errorName());
	utext_close(regex1);
	utext_close(matchthis);
	utext_close(replacedtxt);
	utext_close(replacement);
	delete matcher;
	exit(exitcode);
	}

	// Yawn... and convert to ascii. Can we make it any more complicated??
	UConverter *conv = ucnv_open("US-ASCII", &status);
	char outcs = (char ) malloc(1000);
	ucnv_fromUChars(conv,outcs,1000,buf,u_strlen(buf),&status);

	// Also, use function u_strFromUTF8 / u_strToUTF8

	if (U_FAILURE(status)) {
	// Handle any syntax errors in the regular expression here
	printf("Syntax error in extraction:\n");
	icu::ErrorCode ec;
	ec.set(status);
	printf("Error was: %s\n",ec.errorName());
	utext_close(regex1);
	utext_close(matchthis);
	utext_close(replacedtxt);
	utext_close(replacement);
	free(outcs);
	delete matcher;
	exit(exitcode);
	}
	ucnv_close(conv);

	printf("Replaced result: -->[%s]<--\n", outcs); // works as long as UTF8
	// printf("Capture group %d: %s\n", x, grp); // works as long as UTF8
	if(outcs) free(outcs);



	utext_close(replacedtxt);
	} else {
	printf("Result: No replacement.\n");
	}

	utext_close(replacement);

	}
	utext_close(regex1);
	utext_close(matchthis);
	delete matcher;
	exit(exitcode);
	}

	/*
	* Build: g++ regextest.cpp -licui18n -licuuc -licudata -o regextest
	*/