Created
February 3, 2012 18:35
-
-
Save edhemphill/1731633 to your computer and use it in GitHub Desktop.
ICU4C regex example test program w/ capture groups
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// http://www.wigwag.com/devblog/using-icu4c-regex-test-program/
// The following code is freeware:
// regextest.cpp
// Author: ed
//
// Simple test program for regex using ICU's regex matching.
#include <stdio.h>
#include <getopt.h>
#include <string.h>
#ifdef __cplusplus
#include <cstdlib>
#endif
#include <unicode/errorcode.h>
#include <unicode/localpointer.h>
#include <unicode/regex.h>
#include <unicode/uchar.h>
#include <unicode/ucnv.h>
#include <unicode/unistr.h>
#include <unicode/utext.h>
#include <unicode/utypes.h>
// Usage text printed when too few positional arguments are supplied.
static const char OptionsInfo[] =
    "Usage: regextest [-r] \"PATTERN\" \"TEST STRING\"\n"
    "-r : replace mode -> \"PATTERN\" \"TEST STRING\" \"replacement\"\n"
    "You need to escape these items like this: \\\" and \\$ b/c of bash.\n"
    "\n";
/* ICU library libs/programs...
 * http://www.linuxfromscratch.org/blfs/view/svn/general/icu.html
 * http://icu-project.org/download/4.4.html#ICU4C
 *
 */
int main(int argc, char *argv[]) { | |
int exitcode = 1; | |
bool replace_mode=false; | |
int c; | |
int digit_optind = 0; | |
while (1) | |
{ | |
c = getopt (argc, argv, "r"); | |
if (c == -1) | |
break; | |
switch (c) | |
{ | |
case 'r': | |
replace_mode = true; | |
break; | |
} | |
} | |
if(argc-optind < 2) { | |
printf("%s",OptionsInfo); | |
exit(1); | |
} | |
if(replace_mode && argc-optind < 3) { | |
printf("%s",OptionsInfo); | |
exit(1); | |
} | |
printf("pattern: -->[%s]<--\n", argv[optind]); | |
printf("test on: -->[%s]<--\n", argv[optind+1]); | |
UErrorCode status = U_ZERO_ERROR; | |
UText *regex1 = NULL; | |
UText *matchthis = NULL; | |
// static const char *regex_validate_string = "(?:cp\\:([0-9]+)\\:){0,1}\"(.*)\""; | |
regex1 = utext_openUTF8(regex1, argv[optind], -1, &status); | |
//regex1 = utext_openUTF8(regex1, regex_validate_string, -1, &status); | |
matchthis = utext_openUTF8(matchthis, argv[optind+1], -1, &status); | |
RegexMatcher *matcher = new RegexMatcher(regex1, 0, status); | |
if (U_FAILURE(status)) { | |
// Handle any syntax errors in the regular expression here | |
printf("Syntax error in regex?\n"); | |
icu::ErrorCode ec; | |
ec.set(status); | |
printf("Error was: %s\n",ec.errorName()); | |
utext_close(regex1); | |
utext_close(matchthis); | |
delete matcher; | |
exit(exitcode); | |
} | |
if(!replace_mode) { // MATCH TEST | |
// UnicodeString stringToTest = "Find the abc in this string"; | |
matcher->reset(matchthis); | |
if (matcher->matches(status)) { | |
// We found a match. | |
printf("MATCH: string matches regex\n"); | |
// int startOfMatch = matcher->start(status); // string index of start of match. | |
if(matcher->groupCount() > 0) { | |
UConverter *conv = ucnv_open("US-ASCII", &status); | |
for(int x=0;x<=matcher->groupCount();x++) { | |
// UText *grp = NULL; | |
UnicodeString US = matcher->group(x, status); | |
if (U_FAILURE(status)) { | |
icu::ErrorCode ec; | |
ec.set(status); | |
printf("Error was: %s\n",ec.errorName()); | |
} else { | |
UChar *out = (UChar *) malloc(1000); | |
char *outcs = (char *) malloc(1000); | |
US.extract(out,1000,status); | |
ucnv_fromUChars(conv,outcs,1000,out,u_strlen(out),&status); | |
printf("Capture group %d: -->[%s]<--\n", x, outcs); // works as long as UTF8 | |
// printf("Capture group %d: %s\n", x, grp); // works as long as UTF8 | |
if(out) free(out); | |
if(outcs) free(outcs); | |
} | |
} | |
} | |
exitcode = 0; | |
} else { | |
printf("FAIL: no match\n"); | |
exitcode= 2; | |
} | |
} else { // REPLACER TEST | |
UText *replacedtxt = NULL; | |
UText *replacement = NULL; | |
replacement = utext_openUTF8(replacement, argv[optind+2], -1, &status); | |
matcher->reset(matchthis); | |
printf("replacement: -->[%s]<--\n", argv[optind+2]); | |
replacedtxt = matcher->replaceAll(replacement,replacedtxt,status); | |
if (U_FAILURE(status)) { | |
// Handle any syntax errors in the regular expression here | |
printf("Syntax error in regex?\n"); | |
icu::ErrorCode ec; | |
ec.set(status); | |
printf("Error was: %s\n",ec.errorName()); | |
utext_close(regex1); | |
utext_close(matchthis); | |
utext_close(replacement); | |
delete matcher; | |
exit(exitcode); | |
} | |
if(replacedtxt) { | |
// Replacement did something... | |
printf("REPLACE: string replaced...\n"); | |
// int startOfMatch = matcher->start(status); // string index of start of match. | |
UChar buf[500]; | |
printf("new length: %ld\n", utext_nativeLength(replacedtxt)); | |
utext_extract(replacedtxt,0,500,buf,500,&status); | |
if (U_FAILURE(status)) { | |
// Handle any syntax errors in the regular expression here | |
printf("Syntax error in extraction:\n"); | |
icu::ErrorCode ec; | |
ec.set(status); | |
printf("Error was: %s\n",ec.errorName()); | |
utext_close(regex1); | |
utext_close(matchthis); | |
utext_close(replacedtxt); | |
utext_close(replacement); | |
delete matcher; | |
exit(exitcode); | |
} | |
// Yawn... and convert to ascii. Can we make it any more complicated?? | |
UConverter *conv = ucnv_open("US-ASCII", &status); | |
char *outcs = (char *) malloc(1000); | |
ucnv_fromUChars(conv,outcs,1000,buf,u_strlen(buf),&status); | |
// Also, use function u_strFromUTF8 / u_strToUTF8 | |
if (U_FAILURE(status)) { | |
// Handle any syntax errors in the regular expression here | |
printf("Syntax error in extraction:\n"); | |
icu::ErrorCode ec; | |
ec.set(status); | |
printf("Error was: %s\n",ec.errorName()); | |
utext_close(regex1); | |
utext_close(matchthis); | |
utext_close(replacedtxt); | |
utext_close(replacement); | |
free(outcs); | |
delete matcher; | |
exit(exitcode); | |
} | |
ucnv_close(conv); | |
printf("Replaced result: -->[%s]<--\n", outcs); // works as long as UTF8 | |
// printf("Capture group %d: %s\n", x, grp); // works as long as UTF8 | |
if(outcs) free(outcs); | |
utext_close(replacedtxt); | |
} else { | |
printf("Result: No replacement.\n"); | |
} | |
utext_close(replacement); | |
} | |
utext_close(regex1); | |
utext_close(matchthis); | |
delete matcher; | |
exit(exitcode); | |
} | |
/*
 * Build: g++ regextest.cpp -licui18n -licuuc -licudata -o regextest
 */
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Adding some QD code to use the split command: