Last active
July 19, 2017 20:23
-
-
Save cmkilger/21050dd8b6e3d867f6f5789ad6b7c104 to your computer and use it in GitHub Desktop.
Generates a regular expression to match against multiple strings.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#import <Foundation/Foundation.h> | |
/// Bit flags that control how words are turned into a regular expression.
/// Combine values with bitwise OR; pass 0 for the default behaviour.
typedef NS_OPTIONS(NSUInteger, Options) {
    /// Lowercase every character before it is stored in the trie.
    CaseInsensitive = 1 << 0,
    /// Also store the base letter of characters that have a canonical decomposition.
    /// (Historical spelling, kept so existing callers continue to compile.)
    DiacraticInsensitive = 1 << 1,
    /// Emit nothing after a point where a word has already ended.
    CanEarlyTerminate = 1 << 2,
    /// Correctly spelled alias of DiacraticInsensitive.
    DiacriticInsensitive = DiacraticInsensitive,
};
/// Helper used to build diacritic-insensitive patterns.
@interface NSString (Diacritics)

/// Returns the spellings that should be treated as equivalent to the receiver.
///
/// When the canonical decomposition of the receiver differs from the receiver,
/// the result is `@[receiver, base]`, where `base` is the first code point of
/// the decomposed form. Otherwise the result is `@[receiver]`.
- (NSArray *)diacritics;

@end

@implementation NSString (Diacritics)

- (NSArray *)diacritics {
    NSString *decomposed = [self decomposedStringWithCanonicalMapping];
    if ([decomposed length] == 0 || [self isEqualToString:decomposed]) {
        return @[self];
    }

    // Take one whole code point. A base character outside the BMP occupies two
    // UTF-16 units, and cutting after the first would yield a lone surrogate.
    NSUInteger baseLength = 1;
    if ([decomposed length] > 1 &&
        CFStringIsSurrogateHighCharacter([decomposed characterAtIndex:0]) &&
        CFStringIsSurrogateLowCharacter([decomposed characterAtIndex:1])) {
        baseLength = 2;
    }
    return @[self, [decomposed substringToIndex:baseLength]];
}

@end
/// A node of a character trie that can be rendered as a regular expression
/// matching any of the words inserted into it.
@interface SubstringNode : NSObject

/// YES when one of the inserted words ends at this node.
@property (nonatomic, assign) BOOL terminates;

/// Child nodes, keyed by the pattern fragment that leads to them (an escaped
/// character, or a `[...]` set after -optimizeWithOptions: has merged siblings).
@property (nonatomic, strong) NSMutableDictionary *children;

/// Creates a trie containing every string in `words`, then optimizes it.
- (instancetype)initWithWords:(NSArray *)words options:(Options)options;

/// Inserts `word` below the receiver, one composed character per level.
- (void)addWord:(NSString *)word options:(Options)options;

/// Collapses sibling branches that lead to identical subtrees into one `[...]` set.
- (void)optimizeWithOptions:(Options)options;

/// Returns the pattern for everything below the receiver.
- (NSString *)regularExpressionWithOptions:(Options)options;

/// Convenience: builds a trie from `words` and returns its pattern.
+ (NSString *)regularExpressionWithWords:(NSArray *)words options:(Options)options;

@end

/// Returns YES when `key` is exactly one code point, optionally preceded by a
/// backslash escape. Only such keys keep their meaning inside a `[...]` set;
/// a longer sequence would be read as several independent set members.
static BOOL SubstringNodeKeyFitsCharacterClass(NSString *key) {
    NSUInteger start = ([key length] > 1 && [key characterAtIndex:0] == '\\') ? 1 : 0;
    NSUInteger remaining = [key length] - start;
    if (remaining == 1) {
        return YES;
    }
    if (remaining == 2) {
        return CFStringIsSurrogateHighCharacter([key characterAtIndex:start]) &&
               CFStringIsSurrogateLowCharacter([key characterAtIndex:start + 1]);
    }
    return NO;
}

/// Returns `key` in a form that is literal inside a `[...]` set. Characters that
/// are special only inside a set (for example `-`, which would otherwise form a
/// range with its neighbours) get a backslash. Keys that already carry an escape
/// have length 2 and are returned untouched.
static NSString *SubstringNodeCharacterClassMember(NSString *key) {
    static NSString * const setMetacharacters = @"-&[]^";
    if ([key length] == 1 && [setMetacharacters rangeOfString:key].location != NSNotFound) {
        return [@"\\" stringByAppendingString:key];
    }
    return key;
}

@implementation SubstringNode

- (instancetype)init {
    self = [super init];
    if (self) {
        _children = [[NSMutableDictionary alloc] init];
    }
    return self;
}

- (instancetype)initWithWords:(NSArray *)words options:(Options)options {
    self = [self init];
    if (self) {
        for (NSString *word in words) {
            [self addWord:word options:options];
        }
        [self optimizeWithOptions:options];
    }
    return self;
}

/// Child keys in a stable order, so that generated patterns do not depend on
/// the enumeration order of the dictionary.
- (NSArray *)sortedChildKeys {
    return [[self.children allKeys] sortedArrayUsingSelector:@selector(compare:)];
}

- (void)addWord:(NSString *)word options:(Options)options {
    if ([word length] == 0) {
        // Nothing left to consume: a word ends here.
        self.terminates = YES;
        return;
    }

    NSRange firstCharacter = [word rangeOfComposedCharacterSequenceAtIndex:0];
    NSString *key = [NSRegularExpression escapedPatternForString:[word substringWithRange:firstCharacter]];
    if (options & CaseInsensitive) {
        key = [key lowercaseString];
    }

    NSArray *keys = (options & DiacraticInsensitive) ? [key diacritics] : @[key];
    NSString *remainder = [word substringFromIndex:NSMaxRange(firstCharacter)];

    // With diacritic insensitivity the rest of the word is inserted under every
    // equivalent spelling of the first character.
    for (NSString *branchKey in keys) {
        SubstringNode *next = [self.children objectForKey:branchKey];
        if (!next) {
            next = [[SubstringNode alloc] init];
            [self.children setObject:next forKey:branchKey];
        }
        [next addWord:remainder options:options];
    }
}

- (void)optimizeWithOptions:(Options)options {
    NSArray *keys = [self sortedChildKeys];
    if ([keys count] > 1) {
        SubstringNode *representative = [self.children objectForKey:[keys firstObject]];
        NSString *representativeRegex = [representative regularExpressionWithOptions:options];
        NSMutableString *characterClass = [NSMutableString stringWithString:@"["];
        BOOL canMerge = YES;

        // Siblings are compared through the pattern of their (not yet optimized)
        // subtrees. `terminates` is compared as well because a terminating and a
        // non-terminating leaf can both render as an empty pattern.
        for (NSString *key in keys) {
            SubstringNode *candidate = [self.children objectForKey:key];
            if (!SubstringNodeKeyFitsCharacterClass(key) ||
                candidate.terminates != representative.terminates ||
                ![representativeRegex isEqualToString:[candidate regularExpressionWithOptions:options]]) {
                canMerge = NO;
                break;
            }
            [characterClass appendString:SubstringNodeCharacterClassMember(key)];
        }

        if (canMerge) {
            [characterClass appendString:@"]"];
            self.children = [NSMutableDictionary dictionaryWithObject:representative forKey:characterClass];
        }
    }

    for (SubstringNode *child in [self.children allValues]) {
        [child optimizeWithOptions:options];
    }
}

/// Debug dump of the trie: `**` marks a node where a word ends, and each child
/// is listed as `key:` followed by its own dump with every line prefixed by `| `.
- (NSString *)description {
    NSMutableString *output = [NSMutableString string];
    if (self.terminates) {
        [output appendString:@"**\n"];
    }

    NSRegularExpression *lineMatcher = [NSRegularExpression regularExpressionWithPattern:@"^.*$"
                                                                                 options:NSRegularExpressionAnchorsMatchLines
                                                                                   error:nil];
    for (NSString *key in [self sortedChildKeys]) {
        NSString *childDump = [[self.children objectForKey:key] description];
        [output appendFormat:@"%@:\n", key];
        [lineMatcher enumerateMatchesInString:childDump
                                      options:0
                                        range:NSMakeRange(0, [childDump length])
                                   usingBlock:^(NSTextCheckingResult *result, NSMatchingFlags flags, BOOL *stop) {
            [output appendFormat:@"| %@\n", [childDump substringWithRange:[result range]]];
        }];
    }
    return output;
}

/// Joins `key + child pattern` for every child with `|`, in sorted key order.
- (NSString *)alternationWithOptions:(Options)options {
    NSMutableArray *alternatives = [NSMutableArray array];
    for (NSString *key in [self sortedChildKeys]) {
        SubstringNode *child = [self.children objectForKey:key];
        [alternatives addObject:[key stringByAppendingString:[child regularExpressionWithOptions:options]]];
    }
    return [alternatives componentsJoinedByString:@"|"];
}

- (NSString *)regularExpressionWithOptions:(Options)options {
    NSUInteger childCount = [self.children count];

    if (self.terminates) {
        // A word ends here. Longer words are optional continuations, unless the
        // caller asked to stop at the first complete word.
        if (childCount == 0 || (options & CanEarlyTerminate)) {
            return @"";
        }
        return [NSString stringWithFormat:@"(%@)?", [self alternationWithOptions:options]];
    }

    if (childCount == 1) {
        // A single mandatory continuation needs no group.
        return [self alternationWithOptions:options];
    }

    // Several mandatory continuations. A node with no children and no word
    // ending renders as "()", which is what an empty word list produces.
    return [NSString stringWithFormat:@"(%@)", [self alternationWithOptions:options]];
}

+ (NSString *)regularExpressionWithWords:(NSArray *)words options:(Options)options {
    SubstringNode *root = [[SubstringNode alloc] initWithWords:words options:options];
    return [root regularExpressionWithOptions:options];
}

@end
/// Demo entry point: logs the pattern generated for three sample words.
int main(int argc, char *argv[]) {
    @autoreleasepool {
        NSArray *sampleWords = @[@"Hello", @"hi", @"world"];
        NSString *pattern = [SubstringNode regularExpressionWithWords:sampleWords
                                                              options:CaseInsensitive];
        NSLog(@"%@", pattern);
    }
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment