Created
November 6, 2012 11:58
-
-
Save delebedev/4024250 to your computer and use it in GitHub Desktop.
Porter's stemmer for Russian language (Objective-C)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//
//  PorterStemmer.h
//  PorterStemmer
//
//  Created by Denis Lebedev on 11/5/12.
//  Copyright (c) 2012 Denis Lebedev. All rights reserved.
//
#import <Foundation/Foundation.h> | |
/// Suffix-stripping stemmer for Russian words.
@interface PorterStemmer : NSObject

/// Returns the stem of `word`.
///
/// The word is lowercased and "ё" is folded before suffixes are removed.
/// A word that contains no vowel is returned lowercased but otherwise unchanged.
///
/// @param word A single Russian word.
/// @return The stemmed, lowercased word.
- (NSString *)stem:(NSString *)word;

@end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//
//  PorterStemmer.m
//  PorterStemmer
//
//  Created by Denis Lebedev on 11/5/12.
//  Copyright (c) 2012 Denis Lebedev. All rights reserved.
//
#import "PorterStemmer.h" | |
/// Small matching/replacing conveniences used by PorterStemmer.
@interface NSRegularExpression (Additions)

/// Replaces the first match of the receiver in `input` with the literal
/// string `replace` (no template expansion is performed).
///
/// @param input   The string to search. May be nil, in which case nil is returned.
/// @param replace The literal replacement text.
/// @return `input` with its first match replaced, or `input` itself when
///         the receiver does not match.
- (NSString *)replaceFirstInString:(NSString *)input withString:(NSString *)replace;

/// Reports whether the receiver matches at least once anywhere in `input`.
///
/// @param input The string to search. nil never matches.
/// @return YES if at least one match exists, NO otherwise.
- (BOOL)matchesWith:(NSString *)input;

@end

@implementation NSRegularExpression (Additions)

- (NSString *)replaceFirstInString:(NSString *)input withString:(NSString *)replace {
    if (input == nil) {
        return nil;
    }

    NSTextCheckingResult *match = [self firstMatchInString:input
                                                   options:0
                                                     range:NSMakeRange(0, input.length)];

    // -firstMatchInString:options:range: returns nil when nothing matches.
    // Reading .range from a nil result yields {0, 0}, not {NSNotFound, 0},
    // so the result object itself must be tested; otherwise `replace` would
    // be inserted at index 0 of an unmatched string.
    if (match == nil || match.range.location == NSNotFound) {
        return input;
    }

    return [input stringByReplacingCharactersInRange:match.range withString:replace];
}

- (BOOL)matchesWith:(NSString *)input {
    if (input == nil) {
        return NO;
    }

    // Produce a real boolean instead of narrowing an NSUInteger match count
    // to BOOL; stopping at the first match is also all that is needed.
    NSTextCheckingResult *match = [self firstMatchInString:input
                                                   options:NSMatchingWithTransparentBounds
                                                     range:NSMakeRange(0, input.length)];
    return match != nil;
}

@end
// Private, lazily created regular expressions. Each getter in the
// implementation compiles its pattern on first access and caches the result.
@interface PorterStemmer ()

/// Perfective gerund endings (ив/ивши/ившись, ыв/ывши/ывшись, and
/// в/вши/вшись when preceded by а or я).
@property (nonatomic, strong) NSRegularExpression *perfectiveGround;
/// Reflexive endings ся / сь.
@property (nonatomic, strong) NSRegularExpression *reflexive;
/// Adjectival endings (ее, ие, ые, ое, ими, ыми, ...).
@property (nonatomic, strong) NSRegularExpression *adjective;
/// Participle endings (ивш, ывш, ующ, and ем/нн/вш/ющ/щ after а or я).
@property (nonatomic, strong) NSRegularExpression *participle;
/// Verb endings (ила, ыла, ена, ..., and ла/на/ете/... after а or я).
@property (nonatomic, strong) NSRegularExpression *verb;
/// Noun endings (а, ев, ов, ие, ье, ...).
@property (nonatomic, strong) NSRegularExpression *noun;
/// Splits a word into the part up to and including its first vowel
/// (capture group 1) and the remainder (capture group 2).
@property (nonatomic, strong) NSRegularExpression *rvre;
/// Precondition that must hold before the ост / ость ending is removed.
@property (nonatomic, strong) NSRegularExpression *derivational;
/// The ост / ость ending itself.
@property (nonatomic, strong) NSRegularExpression *der;
/// Superlative endings ейш / ейше.
@property (nonatomic, strong) NSRegularExpression *superlative;
/// A trailing и.
@property (nonatomic, strong) NSRegularExpression *i;
/// A trailing soft sign ь.
@property (nonatomic, strong) NSRegularExpression *p;
/// A trailing doubled нн.
@property (nonatomic, strong) NSRegularExpression *nn;

@end
@implementation PorterStemmer

/// Compiles `pattern` with `options`.
///
/// Success is judged by the returned object rather than by the NSError
/// out-parameter. An invalid pattern is a programmer error, so it trips an
/// assertion in builds that have assertions enabled.
static NSRegularExpression *PorterStemmerCompile(NSString *pattern, NSRegularExpressionOptions options) {
    NSError *error = nil;
    NSRegularExpression *regex = [NSRegularExpression regularExpressionWithPattern:pattern
                                                                           options:options
                                                                             error:&error];
    NSCAssert(regex != nil, @"Invalid stemmer pattern \"%@\": %@", pattern, error);
    return regex;
}

#pragma mark - Lazily compiled patterns

- (NSRegularExpression *)perfectiveGround {
    if (!_perfectiveGround) {
        // The outer group is required so that `$` anchors BOTH alternatives.
        // Without it the first alternative (ив, ивши, ...) matches anywhere in
        // the string, e.g. the "ив" inside "активный".
        _perfectiveGround = PorterStemmerCompile(@"((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$",
                                                 NSRegularExpressionCaseInsensitive);
    }
    return _perfectiveGround;
}

- (NSRegularExpression *)reflexive {
    if (!_reflexive) {
        _reflexive = PorterStemmerCompile(@"с[яь]$", NSRegularExpressionCaseInsensitive);
    }
    return _reflexive;
}

- (NSRegularExpression *)adjective {
    if (!_adjective) {
        _adjective = PorterStemmerCompile(@"(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$",
                                          NSRegularExpressionCaseInsensitive);
    }
    return _adjective;
}

- (NSRegularExpression *)participle {
    if (!_participle) {
        _participle = PorterStemmerCompile(@"((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$",
                                           NSRegularExpressionCaseInsensitive);
    }
    return _participle;
}

- (NSRegularExpression *)verb {
    if (!_verb) {
        _verb = PorterStemmerCompile(@"((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$",
                                     NSRegularExpressionCaseInsensitive);
    }
    return _verb;
}

- (NSRegularExpression *)noun {
    if (!_noun) {
        _noun = PorterStemmerCompile(@"(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$",
                                     NSRegularExpressionCaseInsensitive);
    }
    return _noun;
}

- (NSRegularExpression *)rvre {
    if (!_rvre) {
        // Group 1: everything up to and including the first vowel.
        // Group 2: the rest of the word, which is the region suffixes are stripped from.
        _rvre = PorterStemmerCompile(@"^(.*?[аеиоуыэюя])(.*)$", NSRegularExpressionAnchorsMatchLines);
    }
    return _rvre;
}

- (NSRegularExpression *)derivational {
    if (!_derivational) {
        _derivational = PorterStemmerCompile(@".*[^аеиоуыэюя]+[аеиоуыэюя].*ость?$",
                                             NSRegularExpressionCaseInsensitive);
    }
    return _derivational;
}

- (NSRegularExpression *)der {
    if (!_der) {
        _der = PorterStemmerCompile(@"ость?$", NSRegularExpressionCaseInsensitive);
    }
    return _der;
}

- (NSRegularExpression *)superlative {
    if (!_superlative) {
        _superlative = PorterStemmerCompile(@"(ейше|ейш)$", NSRegularExpressionCaseInsensitive);
    }
    return _superlative;
}

- (NSRegularExpression *)i {
    if (!_i) {
        _i = PorterStemmerCompile(@"и$", NSRegularExpressionCaseInsensitive);
    }
    return _i;
}

- (NSRegularExpression *)p {
    if (!_p) {
        _p = PorterStemmerCompile(@"ь$", NSRegularExpressionCaseInsensitive);
    }
    return _p;
}

- (NSRegularExpression *)nn {
    if (!_nn) {
        _nn = PorterStemmerCompile(@"нн$", NSRegularExpressionCaseInsensitive);
    }
    return _nn;
}

#pragma mark - Stemming

/// Returns the stem of `word`.
///
/// The word is lowercased, "ё" is folded to "е", and the word is split after
/// its first vowel. Suffixes are then stripped from the trailing part only,
/// and the untouched leading part is glued back on.
///
/// @param word A single Russian word. nil and the empty string are returned as-is.
/// @return The stemmed, lowercased word; a word without any vowel is returned
///         lowercased but otherwise unchanged.
- (NSString *)stem:(NSString *)word {
    if (word.length == 0) {
        return word;
    }

    // Fold "ё" to CYRILLIC small letter ie (U+0435). A Latin "e" here would
    // never be matched by the Cyrillic suffix patterns.
    word = [[word lowercaseString] stringByReplacingOccurrencesOfString:@"ё" withString:@"\u0435"];

    NSTextCheckingResult *split = [self.rvre firstMatchInString:word
                                                        options:0
                                                          range:NSMakeRange(0, word.length)];
    if (split == nil) {
        // No vowel in the word: nothing to strip.
        return word;
    }

    NSString *pre = [word substringWithRange:[split rangeAtIndex:1]];
    NSString *rv = [word substringWithRange:[split rangeAtIndex:2]];

    // Step 1: a perfective gerund ending wins outright. Otherwise drop a
    // reflexive ending, then try adjectival (followed by participle), verb,
    // and finally noun endings. Only the first group that applies is removed.
    NSString *temp = [self.perfectiveGround replaceFirstInString:rv withString:@""];
    if (![temp isEqualToString:rv]) {
        rv = temp;
    } else {
        rv = [self.reflexive replaceFirstInString:rv withString:@""];
        temp = [self.adjective replaceFirstInString:rv withString:@""];
        if (![temp isEqualToString:rv]) {
            rv = [self.participle replaceFirstInString:temp withString:@""];
        } else {
            temp = [self.verb replaceFirstInString:rv withString:@""];
            if (![temp isEqualToString:rv]) {
                rv = temp;
            } else {
                rv = [self.noun replaceFirstInString:rv withString:@""];
            }
        }
    }

    // Step 2: drop a trailing "и".
    rv = [self.i replaceFirstInString:rv withString:@""];

    // Step 3: drop "ост"/"ость" when the derivational precondition holds.
    if ([self.derivational matchesWith:rv]) {
        rv = [self.der replaceFirstInString:rv withString:@""];
    }

    // Step 4: drop a trailing soft sign; if there is none, drop a superlative
    // ending and undouble a trailing "нн" to a single "н".
    temp = [self.p replaceFirstInString:rv withString:@""];
    if (![temp isEqualToString:rv]) {
        rv = temp;
    } else {
        rv = [self.superlative replaceFirstInString:rv withString:@""];
        // Replace only on a confirmed match so the non-empty replacement can
        // never be applied to a string that does not end in "нн".
        if ([self.nn matchesWith:rv]) {
            rv = [self.nn replaceFirstInString:rv withString:@"н"];
        }
    }

    return [pre stringByAppendingString:rv];
}

@end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment