Last active
September 16, 2019 06:47
-
-
Save IndependentClaws/e486c0fbdfeb86b8b66d9b5bd7e9b518 to your computer and use it in GitHub Desktop.
An example of how regular-expression parsing can be used to extract valuable linguistic feature information from text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#import <Foundation/Foundation.h>

NS_ASSUME_NONNULL_BEGIN

/**
 Value object that pairs a word token with an NSRange.

 RegexParser creates one instance per token and passes the token's range in
 the parser's text as the range argument.
 */
@interface RangeWord : NSObject

/** The token string. Declared `copy` so a caller-supplied NSMutableString cannot change it later. */
@property (nonatomic, copy) NSString *wordToken;

/** The range associated with `wordToken`. */
@property NSRange wordRange;

/**
 Creates a RangeWord from a token and a range.

 NOTE(review): the implementation is not part of this header; presumably it
 stores both arguments in the properties above. Confirm against RangeWord.m.

 @param wordToken   The token string.
 @param rangeInText The range the token occupies in its source text.
 */
- (instancetype)initWith:(NSString *)wordToken andWithRange:(NSRange)rangeInText;

@end

NS_ASSUME_NONNULL_END
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#import <Foundation/Foundation.h> | |
NS_ASSUME_NONNULL_BEGIN

/**
 Finds English modal verbs (should / could / would, may, might and their
 contracted forms) with a case-insensitive regular expression, either in a
 text sample or in an array of tokens.
 */
@interface RegexParser : NSObject

/** The text sample that `-getModalsInText:` searches. */
@property (nonatomic, copy) NSString *text;

/**
 Creates a parser for the given text sample.

 @param text The text to search for modal verbs.
 */
- (instancetype)initWith:(NSString *)text;

/**
 Searches `text` for modal verbs and calls `completion` once per match.

 @param completion Receives the matched substring and its range in `text`.
 */
- (void)getModalsInText:(void (^)(NSString *, NSRange))completion;

/**
 Searches each token of `tokenArray` for a modal verb.

 @param tokenArray The tokens to inspect.
 @param completion Called once per token that contains a modal verb; receives
                   that token and its index in `tokenArray`.
 */
+ (void)getModalsInTokenArray:(NSArray<NSString *> *)tokenArray
        withCompletionHandler:(void (^)(NSString *, int))completion;

@end

// Closes the audited region opened above; leaving it open is a compile error.
NS_ASSUME_NONNULL_END
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#import "RegexParser.h" | |
#import <NaturalLanguage/NaturalLanguage.h> | |
#import "RangeWord.h" | |
/* Private state of RegexParser; none of these properties appear in the public header. */
@interface RegexParser ()

/** RangeWord entries (token plus range) collected when the text is tokenized. */
@property (nonatomic, strong) NSMutableArray<RangeWord *> *rangeWords;

/** Set to YES once the text has been tokenized, so tokenization is not repeated. */
@property (readwrite) bool hasBeenTokenized;

/** Computed on every access: the range {0, text.length}. */
@property (readonly) NSRange fullTextRange;

/** Word-unit tokenizer, created by its getter on first access. */
@property (readonly) NLTokenizer *tokenizer;

@end
@implementation RegexParser

// Back the read-only `tokenizer` property with a per-instance ivar. A bare
// `NLTokenizer *_tokenizer;` declaration inside @implementation is a file-scope
// global, which would make every parser share one tokenizer (and its string).
@synthesize tokenizer = _tokenizer;

/**
 Pattern for the modal verbs should / could / would (optionally followed by
 any run of "n't" / "'ve"), may and might (optionally followed by "'ve").

 The alternatives are wrapped in a single group so that both \b anchors apply
 to every alternative; without the group "shoulder", "maybe" and "almighty"
 would match. Both the ASCII apostrophe and the typographic one (U+2019) are
 accepted in contractions.
 */
static NSString * const modalRegexPattern =
    @"\\b(?:(?:sh|c|w)ould(?:n['\\u2019]t|['\\u2019]ve)*|may(?:['\\u2019]ve)*|might(?:['\\u2019]ve)*)\\b";

#pragma mark - Lifecycle

/** Creates a parser for `text`. The text is copied so later mutation by the caller has no effect. */
- (instancetype)initWith:(NSString *)text {
    self = [super init];
    if (self) {
        _text = [text copy];
        _rangeWords = [[NSMutableArray alloc] init];
    }
    return self;
}

#pragma mark - Accessors

/**
 Replaces the text sample and discards everything derived from the old one
 (tokenizer and cached RangeWords), so the next search works on the new text.
 */
- (void)setText:(NSString *)text {
    _text = [text copy];
    _tokenizer = nil;
    [_rangeWords removeAllObjects];
    _hasBeenTokenized = NO;
}

/** Computed property that returns the range covering the whole text sample. */
- (NSRange)fullTextRange {
    return NSMakeRange(0, _text.length);
}

/** Lazily created NLTokenizer that breaks the text sample up into words. */
- (NLTokenizer *)tokenizer {
    if (_tokenizer == nil) {
        _tokenizer = [[NLTokenizer alloc] initWithUnit:NLTokenUnitWord];
        _tokenizer.string = _text;
    }
    return _tokenizer;
}

#pragma mark - Token-array search

/**
 Passes every token of `tokenArray` that contains a modal verb, together with
 the token's index in the array, to `completion`.
 */
+ (void)getModalsInTokenArray:(NSArray<NSString *> *)tokenArray
        withCompletionHandler:(void (^)(NSString *, int))completion {
    [self findModalsIn:tokenArray
           withPattern:modalRegexPattern
andWithCompletionHandler:completion];
}

/**
 Helper that tests each token against `pattern` (case-insensitively) and calls
 `completion` with the token and its array index for every token that matches.
 Does nothing when `completion` is nil or the pattern cannot be compiled.
 */
+ (void)findModalsIn:(NSArray<NSString *> *)tokenArray
         withPattern:(NSString *)pattern
andWithCompletionHandler:(void (^)(NSString *, int))completion {
    if (completion == nil) {
        return; // Invoking a nil block would crash.
    }
    NSRegularExpression *expression = [self caseInsensitiveExpressionWithPattern:pattern];
    if (expression == nil) {
        return;
    }
    [tokenArray enumerateObjectsUsingBlock:^(NSString *token, NSUInteger index, BOOL *stop) {
        // rangeOfFirstMatchInString: reports {NSNotFound, 0} when nothing matches.
        NSRange matchRange = [expression rangeOfFirstMatchInString:token
                                                           options:0
                                                             range:NSMakeRange(0, token.length)];
        if (matchRange.location != NSNotFound && matchRange.length > 0) {
            // The public block signature takes an int index.
            completion(token, (int)index);
        }
    }];
}

#pragma mark - Text search

/**
 Tokenizes the text on first use, then passes every modal verb found in the
 text, together with its range, to `completion`.
 */
- (void)getModalsInText:(void (^)(NSString *, NSRange))completion {
    if (!_hasBeenTokenized) {
        [self tokenize];
    }
    [self findTokensWith:modalRegexPattern withCompletionHandler:completion];
}

/**
 Helper that runs `pattern` (case-insensitively) over the whole text and calls
 `completion` with each matched substring and its range in the text.
 Does nothing when `completion` is nil, the text is empty, or the pattern
 cannot be compiled.
 */
- (void)findTokensWith:(NSString *)pattern
 withCompletionHandler:(void (^)(NSString *, NSRange))completion {
    NSString *text = _text;
    if (completion == nil || text.length == 0) {
        return;
    }
    NSRegularExpression *expression =
        [[self class] caseInsensitiveExpressionWithPattern:pattern];
    if (expression == nil) {
        return;
    }
    NSArray<NSTextCheckingResult *> *matches =
        [expression matchesInString:text options:0 range:NSMakeRange(0, text.length)];
    for (NSTextCheckingResult *match in matches) {
        NSRange matchedRange = match.range;
        completion([text substringWithRange:matchedRange], matchedRange);
    }
}

#pragma mark - Private helpers

/**
 Compiles `pattern` with the case-insensitive option. Returns nil when the
 pattern is invalid; debug builds additionally trip an assertion.
 */
+ (NSRegularExpression *)caseInsensitiveExpressionWithPattern:(NSString *)pattern {
    NSError *error = nil;
    NSRegularExpression *expression =
        [NSRegularExpression regularExpressionWithPattern:pattern
                                                  options:NSRegularExpressionCaseInsensitive
                                                    error:&error];
    NSAssert(expression != nil, @"Invalid regular expression '%@': %@", pattern, error);
    return expression;
}

/**
 Helper that splits the text into word tokens and records each one, with its
 range, in `rangeWords`. Any previously cached entries are discarded first so
 that calling this twice cannot produce duplicates.
 */
- (void)tokenize {
    [_rangeWords removeAllObjects];
    NSString *text = _text;
    // The block is not stored by the tokenizer, so capturing self strongly
    // does not create a retain cycle.
    [self.tokenizer enumerateTokensInRange:self.fullTextRange
                                usingBlock:^(NSRange tokenRange,
                                             NLTokenizerAttributes flags,
                                             BOOL * _Nonnull stop) {
        NSString *wordToken = [text substringWithRange:tokenRange];
        RangeWord *rangeWord = [[RangeWord alloc] initWith:wordToken
                                              andWithRange:tokenRange];
        [self->_rangeWords addObject:rangeWord];
    }];
    _hasBeenTokenized = YES;
}

@end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment