Skip to content

Instantly share code, notes, and snippets.

@edhemphill
Created February 3, 2012 18:35
Show Gist options
  • Save edhemphill/1731633 to your computer and use it in GitHub Desktop.
Save edhemphill/1731633 to your computer and use it in GitHub Desktop.
ICU4C regex example test program w/ capture groups
// http://www.wigwag.com/devblog/using-icu4c-regex-test-program/
// The following code is freeware:
// regextest.c
// Author: ed
//
// Simple test program for regex using ICU's regex matching.
#include <stdio.h>
#include <getopt.h>
#include <string.h>
#ifdef __cplusplus
#include <cstdlib>
#endif
#include <unicode/regex.h>
#include <unicode/utext.h>
#include <unicode/errorcode.h>
#include <unicode/ucnv.h>
#include <unicode/utypes.h>
#include <unicode/uchar.h>
// Usage text printed by main() when too few positional arguments are given.
// Describes the default match mode and the -r replace mode.
static const char OptionsInfo[] = "Usage: regextest [-r] \"PATTERN\" \"TEST STRING\"\n"
"-r : replace mode -> \"PATTERN\" \"TEST STRING\" \"replacement\"\n"
"You need to escape these items like this: \\\" and \\$ b/c of bash.\n"
"\n";
/* ICU library libs/programs...
* http://www.linuxfromscratch.org/blfs/view/svn/general/icu.html
* http://icu-project.org/download/4.4.html#ICU4C
*
*/
int main(int argc, char *argv[]) {
int exitcode = 1;
bool replace_mode=false;
int c;
int digit_optind = 0;
while (1)
{
c = getopt (argc, argv, "r");
if (c == -1)
break;
switch (c)
{
case 'r':
replace_mode = true;
break;
}
}
if(argc-optind < 2) {
printf("%s",OptionsInfo);
exit(1);
}
if(replace_mode && argc-optind < 3) {
printf("%s",OptionsInfo);
exit(1);
}
printf("pattern: -->[%s]<--\n", argv[optind]);
printf("test on: -->[%s]<--\n", argv[optind+1]);
UErrorCode status = U_ZERO_ERROR;
UText *regex1 = NULL;
UText *matchthis = NULL;
// static const char *regex_validate_string = "(?:cp\\:([0-9]+)\\:){0,1}\"(.*)\"";
regex1 = utext_openUTF8(regex1, argv[optind], -1, &status);
//regex1 = utext_openUTF8(regex1, regex_validate_string, -1, &status);
matchthis = utext_openUTF8(matchthis, argv[optind+1], -1, &status);
RegexMatcher *matcher = new RegexMatcher(regex1, 0, status);
if (U_FAILURE(status)) {
// Handle any syntax errors in the regular expression here
printf("Syntax error in regex?\n");
icu::ErrorCode ec;
ec.set(status);
printf("Error was: %s\n",ec.errorName());
utext_close(regex1);
utext_close(matchthis);
delete matcher;
exit(exitcode);
}
if(!replace_mode) { // MATCH TEST
// UnicodeString stringToTest = "Find the abc in this string";
matcher->reset(matchthis);
if (matcher->matches(status)) {
// We found a match.
printf("MATCH: string matches regex\n");
// int startOfMatch = matcher->start(status); // string index of start of match.
if(matcher->groupCount() > 0) {
UConverter *conv = ucnv_open("US-ASCII", &status);
for(int x=0;x<=matcher->groupCount();x++) {
// UText *grp = NULL;
UnicodeString US = matcher->group(x, status);
if (U_FAILURE(status)) {
icu::ErrorCode ec;
ec.set(status);
printf("Error was: %s\n",ec.errorName());
} else {
UChar *out = (UChar *) malloc(1000);
char *outcs = (char *) malloc(1000);
US.extract(out,1000,status);
ucnv_fromUChars(conv,outcs,1000,out,u_strlen(out),&status);
printf("Capture group %d: -->[%s]<--\n", x, outcs); // works as long as UTF8
// printf("Capture group %d: %s\n", x, grp); // works as long as UTF8
if(out) free(out);
if(outcs) free(outcs);
}
}
}
exitcode = 0;
} else {
printf("FAIL: no match\n");
exitcode= 2;
}
} else { // REPLACER TEST
UText *replacedtxt = NULL;
UText *replacement = NULL;
replacement = utext_openUTF8(replacement, argv[optind+2], -1, &status);
matcher->reset(matchthis);
printf("replacement: -->[%s]<--\n", argv[optind+2]);
replacedtxt = matcher->replaceAll(replacement,replacedtxt,status);
if (U_FAILURE(status)) {
// Handle any syntax errors in the regular expression here
printf("Syntax error in regex?\n");
icu::ErrorCode ec;
ec.set(status);
printf("Error was: %s\n",ec.errorName());
utext_close(regex1);
utext_close(matchthis);
utext_close(replacement);
delete matcher;
exit(exitcode);
}
if(replacedtxt) {
// Replacement did something...
printf("REPLACE: string replaced...\n");
// int startOfMatch = matcher->start(status); // string index of start of match.
UChar buf[500];
printf("new length: %ld\n", utext_nativeLength(replacedtxt));
utext_extract(replacedtxt,0,500,buf,500,&status);
if (U_FAILURE(status)) {
// Handle any syntax errors in the regular expression here
printf("Syntax error in extraction:\n");
icu::ErrorCode ec;
ec.set(status);
printf("Error was: %s\n",ec.errorName());
utext_close(regex1);
utext_close(matchthis);
utext_close(replacedtxt);
utext_close(replacement);
delete matcher;
exit(exitcode);
}
// Yawn... and convert to ascii. Can we make it any more complicated??
UConverter *conv = ucnv_open("US-ASCII", &status);
char *outcs = (char *) malloc(1000);
ucnv_fromUChars(conv,outcs,1000,buf,u_strlen(buf),&status);
// Also, use function u_strFromUTF8 / u_strToUTF8
if (U_FAILURE(status)) {
// Handle any syntax errors in the regular expression here
printf("Syntax error in extraction:\n");
icu::ErrorCode ec;
ec.set(status);
printf("Error was: %s\n",ec.errorName());
utext_close(regex1);
utext_close(matchthis);
utext_close(replacedtxt);
utext_close(replacement);
free(outcs);
delete matcher;
exit(exitcode);
}
ucnv_close(conv);
printf("Replaced result: -->[%s]<--\n", outcs); // works as long as UTF8
// printf("Capture group %d: %s\n", x, grp); // works as long as UTF8
if(outcs) free(outcs);
utext_close(replacedtxt);
} else {
printf("Result: No replacement.\n");
}
utext_close(replacement);
}
utext_close(regex1);
utext_close(matchthis);
delete matcher;
exit(exitcode);
}
/*
* Build: g++ regextest.cpp -licui18n -licuuc -licudata -o regextest
*/
@dothebart
Copy link

Adding some QD code to use the split command:

// http://www.wigwag.com/devblog/using-icu4c-regex-test-program/
// The following code is freeware:
// regextest.c
// Author: ed
//
// Simple test program for regex using ICU's regex matching.

#include <stdio.h>
#include <getopt.h>
#include <string.h>
#ifdef __cplusplus
#include <cstdlib>
#endif
#include <unicode/regex.h>
#include <unicode/utext.h>
#include <unicode/errorcode.h>
#include <unicode/ucnv.h>
#include <unicode/utypes.h>
#include <unicode/uchar.h>

// Usage text printed by main() when too few positional arguments are given.
// Describes the default match mode and the -r replace mode.
// NOTE(review): main() also accepts -s (split mode), which this text does not mention.
static const char OptionsInfo[] = "Usage: regextest [-r] \"PATTERN\" \"TEST STRING\"\n"
  "-r : replace mode -> \"PATTERN\" \"TEST STRING\" \"replacement\"\n"
  "You need to escape these items like this: \\\" and \\$ b/c of bash.\n"
  "\n";

/* ICU library libs/programs...
 * http://www.linuxfromscratch.org/blfs/view/svn/general/icu.html
 * http://icu-project.org/download/4.4.html#ICU4C
 *
 */

int main(int argc, char *argv[]) {
  int exitcode = 1;
  bool replace_mode=false;
  bool split_mode = false;
  int c;
  int digit_optind = 0;

  while (1) {
    c = getopt (argc, argv, "rs:");
    if (c == -1) {
      printf("none.\n");
      break;
    }
    printf(" => %c\n", c);
    switch (c)
      {
      case 'r':
        replace_mode = true;
        break;
      case 's':
        split_mode = true;
        break;
      }
  }

  if(argc-optind < 2  && !split_mode) {
    printf("%s\n1\n%d - \n %d \n",OptionsInfo, argc-optind, split_mode);
    exit(1);
  }
  if((replace_mode && !split_mode) && argc-optind < 3) {
    printf("%s\n2",OptionsInfo);
    exit(1);
  }
  printf("optind: %d\n", optind);
  printf("pattern:     -->[%s]<--\n", argv[optind]);
  printf("test on:     -->[%s]<--\n", argv[optind+1]);

  UErrorCode        status    = U_ZERO_ERROR;
  UText *regex1 = NULL;
  UText *matchthis = NULL;

  //	static const char *regex_validate_string = "(?:cp\\:([0-9]+)\\:){0,1}\"(.*)\"";

  regex1 = utext_openUTF8(regex1, argv[optind], -1, &status);
  //regex1 = utext_openUTF8(regex1, regex_validate_string, -1, &status);
  matchthis = utext_openUTF8(matchthis, argv[optind+1], -1, &status);

  RegexMatcher *matcher = new RegexMatcher(regex1, 0, status);
  if (U_FAILURE(status)) {
    // Handle any syntax errors in the regular expression here
    printf("Syntax error in regex?\n");
    icu::ErrorCode ec;
    ec.set(status);
    printf("Error was: %s\n",ec.errorName());
    utext_close(regex1);
    utext_close(matchthis);
    delete matcher;
    exit(exitcode);
  }

  if (split_mode) {
    UnicodeString valueToSplit(argv[optind+1]);
    static const uint16_t nrResults = 16;
    UnicodeString uResults[nrResults];
    int64_t totalCount = 0;
    while (true) {
      UErrorCode errorCode = U_ZERO_ERROR;
            
      auto uCount = matcher->split(valueToSplit, uResults, nrResults, status);
      printf("uCount: %d\n", uCount);
      if (U_FAILURE(status)) {
        // Handle any syntax errors in the regular expression here
        printf("Syntax error in regex?\n");
        icu::ErrorCode ec;
        ec.set(status);
        printf("Error was: %s\n",ec.errorName());
        utext_close(regex1);
        utext_close(matchthis);
        delete matcher;
        exit(exitcode);
      }
      uint16_t copyThisTime = uCount;

      // todoo errrorCode
      if (copyThisTime > nrResults) {
        // last hit is the remaining string:
        copyThisTime --;
      }
      std::string utf8;
      UConverter *conv = ucnv_open("US-ASCII", &status);
      for (int64_t x = 0; x < copyThisTime; x++) {
        if (U_FAILURE(status)) {
          icu::ErrorCode ec;
          ec.set(status);
          printf("Error was: %s\n",ec.errorName());
        } else {
          UChar *out = (UChar *) malloc(1000);
          char *outcs = (char *) malloc(1000);
          uResults[x].extract(out,1000,status);
          ucnv_fromUChars(conv,outcs,1000,out,u_strlen(out),&status);
          printf("Split match %d: -->[%s]<--\n", x, outcs); // works as long as UTF8
          //uResults[x].toUTF8String(utf8);
          //printf("Split match %d: -->[%s]<--\n", x, utf8.c_str()); // works as long as UTF8
          //printf("Capture group %d: %s\n", x, grp); // works as long as UTF8
          //utf8.clear();
          if(out) free(out);
          if(outcs) free(outcs);
        }
      }
      if (uCount < nrResults) {
        break;
      }
      // ok, we have more in the last slot, retry:
      valueToSplit = uResults[nrResults - 1];
    }
  }
  else if(!replace_mode) { // MATCH TEST

    //	UnicodeString    stringToTest = "Find the abc in this string";
    matcher->reset(matchthis);

    if (matcher->matches(status)) {
      // We found a match.
      printf("MATCH: string matches regex\n");
      //	   int startOfMatch = matcher->start(status);   // string index of start of match.

      if(matcher->groupCount() > 0) {
        UConverter *conv = ucnv_open("US-ASCII", &status);
        for(int x=0;x<=matcher->groupCount();x++) {
          //				UText *grp = NULL;
          UnicodeString US = matcher->group(x, status);
          if (U_FAILURE(status)) {
            icu::ErrorCode ec;
            ec.set(status);
            printf("Error was: %s\n",ec.errorName());
          } else {
            UChar *out = (UChar *) malloc(1000);
            char *outcs = (char *) malloc(1000);
            US.extract(out,1000,status);
            ucnv_fromUChars(conv,outcs,1000,out,u_strlen(out),&status);
            printf("Capture group %d: -->[%s]<--\n", x, outcs); // works as long as UTF8
            //					printf("Capture group %d: %s\n", x, grp); // works as long as UTF8
            if(out) free(out);
            if(outcs) free(outcs);
          }
        }
      }

      exitcode = 0;
    } else {
      printf("FAIL: no match\n");
      exitcode= 2;
    }

  } else { // REPLACER TEST
    UText *replacedtxt = NULL;
    UText *replacement = NULL;
    replacement = utext_openUTF8(replacement, argv[optind+2], -1, &status);
    matcher->reset(matchthis);
    printf("replacement: -->[%s]<--\n", argv[optind+2]);

    replacedtxt = matcher->replaceAll(replacement,replacedtxt,status);
    if (U_FAILURE(status)) {
      // Handle any syntax errors in the regular expression here
      printf("Syntax error in regex?\n");
      icu::ErrorCode ec;
      ec.set(status);
      printf("Error was: %s\n",ec.errorName());
      utext_close(regex1);
      utext_close(matchthis);
      utext_close(replacement);
      delete matcher;
      exit(exitcode);
    }

    if(replacedtxt) {
      // Replacement did something...
      printf("REPLACE: string replaced...\n");
      //	   int startOfMatch = matcher->start(status);   // string index of start of match.
      UChar buf[500];
      printf("new length: %ld\n", utext_nativeLength(replacedtxt));
      utext_extract(replacedtxt,0,500,buf,500,&status);

      if (U_FAILURE(status)) {
        // Handle any syntax errors in the regular expression here
        printf("Syntax error in extraction:\n");
        icu::ErrorCode ec;
        ec.set(status);
        printf("Error was: %s\n",ec.errorName());
        utext_close(regex1);
        utext_close(matchthis);
        utext_close(replacedtxt);
        utext_close(replacement);
        delete matcher;
        exit(exitcode);
      }

      // Yawn... and convert to ascii.  Can we make it any more complicated??
      UConverter *conv = ucnv_open("US-ASCII", &status);
      char *outcs = (char *) malloc(1000);
      ucnv_fromUChars(conv,outcs,1000,buf,u_strlen(buf),&status);

      // Also, use function u_strFromUTF8 / u_strToUTF8

      if (U_FAILURE(status)) {
        // Handle any syntax errors in the regular expression here
        printf("Syntax error in extraction:\n");
        icu::ErrorCode ec;
        ec.set(status);
        printf("Error was: %s\n",ec.errorName());
        utext_close(regex1);
        utext_close(matchthis);
        utext_close(replacedtxt);
        utext_close(replacement);
        free(outcs);
        delete matcher;
        exit(exitcode);
      }
      ucnv_close(conv);

      printf("Replaced result: -->[%s]<--\n", outcs); // works as long as UTF8
      //					printf("Capture group %d: %s\n", x, grp); // works as long as UTF8
      if(outcs) free(outcs);



      utext_close(replacedtxt);
    } else {
      printf("Result: No replacement.\n");
    }

    utext_close(replacement);

  }
  utext_close(regex1);
  utext_close(matchthis);
  delete matcher;
  exit(exitcode);
}

/*
 *   Build: g++ regextest.cpp -licui18n -licuuc -licudata -o regextest
 */

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment