Skip to content

Instantly share code, notes, and snippets.

@gistya
Forked from edhemphill/regextest.cpp
Created January 5, 2016 19:16
Show Gist options
  • Save gistya/3da796c4620462af2236 to your computer and use it in GitHub Desktop.
Save gistya/3da796c4620462af2236 to your computer and use it in GitHub Desktop.
ICU4C regex example test program w/ capture groups
// http://www.wigwag.com/devblog/using-icu4c-regex-test-program/
// The following code is freeware:
// regextest.cpp
// Author: ed
//
// Simple test program for regex using ICU's regex matching.
#include <stdio.h>
#include <getopt.h>
#include <string.h>
#ifdef __cplusplus
#include <cstdlib>
#include <vector>
#endif
#include <unicode/regex.h>
#include <unicode/utext.h>
#include <unicode/errorcode.h>
#include <unicode/ucnv.h>
#include <unicode/utypes.h>
#include <unicode/uchar.h>
// Usage/help text that main() prints when the command line cannot be used.
// It is assembled from adjacent string literals, which the compiler joins
// into a single NUL-terminated array.
static const char OptionsInfo[] =
    "Usage: regextest [-r] "
    "\"PATTERN\" \"TEST STRING\"\n"
    "-r : replace mode -> "
    "\"PATTERN\" \"TEST STRING\" \"replacement\"\n"
    "You need to escape these items like this: \\\" and \\$ b/c of bash.\n\n";
/* ICU library libs/programs...
* http://www.linuxfromscratch.org/blfs/view/svn/general/icu.html
* http://icu-project.org/download/4.4.html#ICU4C
*
*/
int main(int argc, char *argv[]) {
int exitcode = 1;
bool replace_mode=false;
int c;
int digit_optind = 0;
while (1)
{
c = getopt (argc, argv, "r");
if (c == -1)
break;
switch (c)
{
case 'r':
replace_mode = true;
break;
}
}
if(argc-optind < 2) {
printf("%s",OptionsInfo);
exit(1);
}
if(replace_mode && argc-optind < 3) {
printf("%s",OptionsInfo);
exit(1);
}
printf("pattern: -->[%s]<--\n", argv[optind]);
printf("test on: -->[%s]<--\n", argv[optind+1]);
UErrorCode status = U_ZERO_ERROR;
UText *regex1 = NULL;
UText *matchthis = NULL;
// static const char *regex_validate_string = "(?:cp\\:([0-9]+)\\:){0,1}\"(.*)\"";
regex1 = utext_openUTF8(regex1, argv[optind], -1, &status);
//regex1 = utext_openUTF8(regex1, regex_validate_string, -1, &status);
matchthis = utext_openUTF8(matchthis, argv[optind+1], -1, &status);
RegexMatcher *matcher = new RegexMatcher(regex1, 0, status);
if (U_FAILURE(status)) {
// Handle any syntax errors in the regular expression here
printf("Syntax error in regex?\n");
icu::ErrorCode ec;
ec.set(status);
printf("Error was: %s\n",ec.errorName());
utext_close(regex1);
utext_close(matchthis);
delete matcher;
exit(exitcode);
}
if(!replace_mode) { // MATCH TEST
// UnicodeString stringToTest = "Find the abc in this string";
matcher->reset(matchthis);
if (matcher->matches(status)) {
// We found a match.
printf("MATCH: string matches regex\n");
// int startOfMatch = matcher->start(status); // string index of start of match.
if(matcher->groupCount() > 0) {
UConverter *conv = ucnv_open("US-ASCII", &status);
for(int x=0;x<=matcher->groupCount();x++) {
// UText *grp = NULL;
UnicodeString US = matcher->group(x, status);
if (U_FAILURE(status)) {
icu::ErrorCode ec;
ec.set(status);
printf("Error was: %s\n",ec.errorName());
} else {
UChar *out = (UChar *) malloc(1000);
char *outcs = (char *) malloc(1000);
US.extract(out,1000,status);
ucnv_fromUChars(conv,outcs,1000,out,u_strlen(out),&status);
printf("Capture group %d: -->[%s]<--\n", x, outcs); // works as long as UTF8
// printf("Capture group %d: %s\n", x, grp); // works as long as UTF8
if(out) free(out);
if(outcs) free(outcs);
}
}
}
exitcode = 0;
} else {
printf("FAIL: no match\n");
exitcode= 2;
}
} else { // REPLACER TEST
UText *replacedtxt = NULL;
UText *replacement = NULL;
replacement = utext_openUTF8(replacement, argv[optind+2], -1, &status);
matcher->reset(matchthis);
printf("replacement: -->[%s]<--\n", argv[optind+2]);
replacedtxt = matcher->replaceAll(replacement,replacedtxt,status);
if (U_FAILURE(status)) {
// Handle any syntax errors in the regular expression here
printf("Syntax error in regex?\n");
icu::ErrorCode ec;
ec.set(status);
printf("Error was: %s\n",ec.errorName());
utext_close(regex1);
utext_close(matchthis);
utext_close(replacement);
delete matcher;
exit(exitcode);
}
if(replacedtxt) {
// Replacement did something...
printf("REPLACE: string replaced...\n");
// int startOfMatch = matcher->start(status); // string index of start of match.
UChar buf[500];
printf("new length: %ld\n", utext_nativeLength(replacedtxt));
utext_extract(replacedtxt,0,500,buf,500,&status);
if (U_FAILURE(status)) {
// Handle any syntax errors in the regular expression here
printf("Syntax error in extraction:\n");
icu::ErrorCode ec;
ec.set(status);
printf("Error was: %s\n",ec.errorName());
utext_close(regex1);
utext_close(matchthis);
utext_close(replacedtxt);
utext_close(replacement);
delete matcher;
exit(exitcode);
}
// Yawn... and convert to ascii. Can we make it any more complicated??
UConverter *conv = ucnv_open("US-ASCII", &status);
char *outcs = (char *) malloc(1000);
ucnv_fromUChars(conv,outcs,1000,buf,u_strlen(buf),&status);
// Also, use function u_strFromUTF8 / u_strToUTF8
if (U_FAILURE(status)) {
// Handle any syntax errors in the regular expression here
printf("Syntax error in extraction:\n");
icu::ErrorCode ec;
ec.set(status);
printf("Error was: %s\n",ec.errorName());
utext_close(regex1);
utext_close(matchthis);
utext_close(replacedtxt);
utext_close(replacement);
free(outcs);
delete matcher;
exit(exitcode);
}
ucnv_close(conv);
printf("Replaced result: -->[%s]<--\n", outcs); // works as long as UTF8
// printf("Capture group %d: %s\n", x, grp); // works as long as UTF8
if(outcs) free(outcs);
utext_close(replacedtxt);
} else {
printf("Result: No replacement.\n");
}
utext_close(replacement);
}
utext_close(regex1);
utext_close(matchthis);
delete matcher;
exit(exitcode);
}
/*
* Build: g++ regextest.cpp -licui18n -licuuc -licudata -o regextest
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment