Skip to content

Instantly share code, notes, and snippets.

@cmkilger
Last active July 19, 2017 20:23
Show Gist options
  • Save cmkilger/21050dd8b6e3d867f6f5789ad6b7c104 to your computer and use it in GitHub Desktop.
Save cmkilger/21050dd8b6e3d867f6f5789ad6b7c104 to your computer and use it in GitHub Desktop.
Generates a regular expression to match against multiple strings.
#import <Foundation/Foundation.h>
/// Bit flags that control how words are inserted into the trie and how the
/// trie is rendered as a regular expression. Flags may be OR-ed together.
typedef NS_OPTIONS(NSUInteger, Options) {
    /// Lower-cases every character before it is inserted into the trie.
    CaseInsensitive = 1 << 0,
    /// Additionally inserts each character under the leading character of its
    /// canonical decomposition (see -[NSString diacritics]).
    /// The historical spelling of this constant is kept for compatibility.
    DiacraticInsensitive = 1 << 1,
    /// A node at which a word ends contributes nothing further to the
    /// pattern, so longer words sharing that prefix are not emitted.
    CanEarlyTerminate = 1 << 2,
};
/// Helper for finding the base form of an accented character.
@interface NSString (Diacritics)
/// Returns the spellings under which the receiver should be indexed.
///
/// The first element is always the receiver. When the receiver's canonical
/// decomposition differs from the receiver, a second element holds the
/// leading character of that decomposition (for example "e" for U+00E9).
- (NSArray *)diacritics;
@end
@implementation NSString (Diacritics)
- (NSArray *)diacritics {
    NSString * normalized = [self decomposedStringWithCanonicalMapping];
    if ([normalized length] == 0 || [self isEqualToString:normalized]) {
        return @[self];
    }
    // Take the whole leading code point: a character outside the BMP is stored
    // as a surrogate pair, and cutting after one UTF-16 unit would leave a
    // lone high surrogate behind.
    NSUInteger baseLength = 1;
    unichar first = [normalized characterAtIndex:0];
    if (first >= 0xD800 && first <= 0xDBFF && [normalized length] > 1) {
        unichar second = [normalized characterAtIndex:1];
        if (second >= 0xDC00 && second <= 0xDFFF) {
            baseLength = 2;
        }
    }
    return @[self, [normalized substringToIndex:baseLength]];
}
@end
/// One node of a trie built from a list of words; the trie is later rendered
/// as a single regular expression.
@interface SubstringNode : NSObject
/// YES when at least one inserted word ends exactly at this node.
@property (nonatomic, assign) BOOL terminates;
/// Child nodes, keyed by the pattern fragment (a regex-escaped character, or a
/// bracketed character set after optimization) that leads to each child.
@property (nonatomic, strong) NSMutableDictionary * children;
@end
@implementation SubstringNode

/// Creates an empty node: no children, and no word ends here.
- (instancetype)init {
    self = [super init];
    if (self) {
        _children = [[NSMutableDictionary alloc] init];
    }
    return self;
}

/// Creates a root node, inserts every string in `words`, then optimizes the
/// resulting trie.
/// @param words   NSString objects to insert.
/// @param options Flags applied to insertion and optimization.
- (instancetype)initWithWords:(NSArray *)words options:(Options)options {
    self = [self init];
    if (self) {
        for (NSString * word in words) {
            [self addWord:word options:options];
        }
        [self optimizeWithOptions:options];
    }
    return self;
}

/// Inserts `word` below this node, one composed character sequence per level.
/// An empty `word` marks this node as the end of a word.
- (void)addWord:(NSString *)word options:(Options)options {
    if ([word length] == 0) {
        self.terminates = YES;
        return;
    }

    // Consume a whole composed character sequence so that surrogate pairs and
    // base+combining-mark sequences are never split across trie levels.
    NSRange range = [word rangeOfComposedCharacterSequenceAtIndex:0];
    NSString * character = [NSRegularExpression escapedPatternForString:[word substringWithRange:range]];
    if (options & CaseInsensitive) {
        character = [character lowercaseString];
    }

    NSArray * keys = (options & DiacraticInsensitive) ? [character diacritics] : @[character];
    NSString * remainder = [word substringFromIndex:NSMaxRange(range)];
    for (NSString * key in keys) {
        SubstringNode * next = self.children[key];
        if (!next) {
            next = [[SubstringNode alloc] init];
            self.children[key] = next;
        }
        [next addWord:remainder options:options];
    }
}

/// Returns `key` in a form that is safe as a single member of a `[...]` set,
/// or nil when the key spans several code points and therefore cannot be
/// represented as one set member.
+ (NSString *)characterClassMemberForKey:(NSString *)key {
    NSUInteger length = [key length];
    if (length == 0) {
        return nil;
    }
    unichar first = [key characterAtIndex:0];
    if (length == 1) {
        switch (first) {
            // These are literal outside a set (or were left unescaped) but
            // carry meaning inside one: range, nesting, negation, intersection.
            case '-':
            case '[':
            case ']':
            case '^':
            case '&':
            case '\\':
                return [@"\\" stringByAppendingString:key];
            default:
                return key;
        }
    }
    if (length == 2) {
        if (first == '\\') {
            return key; // already a backslash-escaped single character
        }
        unichar second = [key characterAtIndex:1];
        BOOL isSurrogatePair = (first >= 0xD800 && first <= 0xDBFF) && (second >= 0xDC00 && second <= 0xDFFF);
        if (isSurrogatePair) {
            return key; // one code point stored as two UTF-16 units
        }
    }
    return nil;
}

/// Collapses this node's children into a single character-set child when all
/// of them lead to equivalent subtrees, then optimizes the remaining children
/// recursively.
- (void)optimizeWithOptions:(Options)options {
    if ([self.children count] > 1) {
        NSArray * keys = [[self.children allKeys] sortedArrayUsingSelector:@selector(compare:)];
        SubstringNode * survivor = self.children[[keys lastObject]];
        NSString * survivorRegex = [survivor regularExpressionWithOptions:options];
        NSMutableString * members = [NSMutableString string];
        BOOL mergeable = YES;
        for (NSString * key in keys) {
            SubstringNode * child = self.children[key];
            NSString * member = [SubstringNode characterClassMemberForKey:key];
            if (!member
                || child.terminates != survivor.terminates
                || ![survivorRegex isEqualToString:[child regularExpressionWithOptions:options]]) {
                mergeable = NO;
                break;
            }
            [members appendString:member];
        }
        if (mergeable) {
            NSString * setKey = [NSString stringWithFormat:@"[%@]", members];
            self.children = [NSMutableDictionary dictionaryWithObject:survivor forKey:setKey];
        }
    }
    for (SubstringNode * child in [self.children allValues]) {
        [child optimizeWithOptions:options];
    }
}

/// Renders the subtree as an indented outline: "**" marks a node where a word
/// ends, and each child's outline is prefixed with "| " under its key.
- (NSString *)description {
    NSMutableString * output = [NSMutableString string];
    if (self.terminates) {
        [output appendString:@"**\n"];
    }
    NSRegularExpression * lineRegex = [NSRegularExpression regularExpressionWithPattern:@"^.*$" options:NSRegularExpressionAnchorsMatchLines error:nil];
    NSArray * keys = [[self.children allKeys] sortedArrayUsingSelector:@selector(compare:)];
    for (NSString * key in keys) {
        NSString * childOutline = [self.children[key] description];
        [output appendFormat:@"%@:\n", key];
        [lineRegex enumerateMatchesInString:childOutline options:0 range:NSMakeRange(0, [childOutline length]) usingBlock:^(NSTextCheckingResult *result, NSMatchingFlags flags, BOOL *stopLines) {
            [output appendFormat:@"| %@\n", [childOutline substringWithRange:[result rangeAtIndex:0]]];
        }];
    }
    return output;
}

/// Joins "key + child pattern" for every child with "|". Keys are visited in
/// sorted order so that the same trie always yields the same string.
- (NSString *)alternationWithOptions:(Options)options {
    NSArray * keys = [[self.children allKeys] sortedArrayUsingSelector:@selector(compare:)];
    NSMutableArray * branches = [NSMutableArray arrayWithCapacity:[keys count]];
    for (NSString * key in keys) {
        SubstringNode * child = self.children[key];
        [branches addObject:[key stringByAppendingString:[child regularExpressionWithOptions:options]]];
    }
    return [branches componentsJoinedByString:@"|"];
}

/// Returns the pattern fragment that matches everything below this node.
///
/// - A word ends here and there are no children, or CanEarlyTerminate is set:
///   the empty string.
/// - A word ends here and there are children: an optional group "(a|b)?".
/// - No word ends here and there is exactly one child: key followed by the
///   child's pattern, with no group.
/// - Otherwise: a group "(a|b)".
- (NSString *)regularExpressionWithOptions:(Options)options {
    NSUInteger childCount = [self.children count];
    if (self.terminates) {
        if (childCount == 0 || (options & CanEarlyTerminate)) {
            return @"";
        }
        return [NSString stringWithFormat:@"(%@)?", [self alternationWithOptions:options]];
    }
    if (childCount == 1) {
        NSString * key = [[self.children allKeys] lastObject];
        SubstringNode * child = self.children[key];
        return [key stringByAppendingString:[child regularExpressionWithOptions:options]];
    }
    return [NSString stringWithFormat:@"(%@)", [self alternationWithOptions:options]];
}

/// Builds a trie from `words` and returns the pattern for the whole trie.
/// @param words   NSString objects the pattern should match.
/// @param options Flags applied while building and rendering the trie.
+ (NSString *)regularExpressionWithWords:(NSArray *)words options:(Options)options {
    SubstringNode * root = [[SubstringNode alloc] initWithWords:words options:options];
    return [root regularExpressionWithOptions:options];
}

@end
// Demo entry point: builds a case-insensitive pattern for three sample words
// and logs it.
int main(int argc, char *argv[]) {
    @autoreleasepool {
        NSArray * sampleWords = @[@"Hello", @"hi", @"world"];
        NSString * pattern = [SubstringNode regularExpressionWithWords:sampleWords
                                                               options:CaseInsensitive];
        NSLog(@"%@", pattern);
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment