Skip to content

Instantly share code, notes, and snippets.

@delebedev
Created November 6, 2012 11:58
Show Gist options
  • Save delebedev/4024250 to your computer and use it in GitHub Desktop.
Save delebedev/4024250 to your computer and use it in GitHub Desktop.
Porter's stemmer for the Russian language (Objective-C)
//
// PorterStemmer.h
// PorterStemmer
//
// Created by Denis Lebedev on 11/5/12.
// Copyright (c) 2012 Denis Lebedev. All rights reserved.
//
#import <Foundation/Foundation.h>
/**
 * Stemmer for Russian words based on Porter's algorithm.
 */
@interface PorterStemmer : NSObject
/**
 * Returns the stem of the given word.
 *
 * The word is lowercased before processing. If it contains none of the vowels
 * а, е, и, о, у, ы, э, ю, я, no endings are removed and the lowercased word is
 * returned.
 *
 * @param word The word to stem.
 * @return The lowercased word with its recognised endings removed.
 */
- (NSString *)stem:(NSString *)word;
@end
//
// PorterStemmer.m
// PorterStemmer
//
// Created by Denis Lebedev on 11/5/12.
// Copyright (c) 2012 Denis Lebedev. All rights reserved.
//
#import "PorterStemmer.h"
/**
 * Convenience helpers for applying a regular expression to a whole string.
 */
@interface NSRegularExpression (Additions)

/**
 * Replaces the first match of the receiver in `input` with `replace`.
 *
 * The replacement is inserted literally; it is not treated as a template.
 *
 * @param input   The string to search. The whole string is searched.
 * @param replace The literal text that replaces the first match.
 * @return A new string with the first match replaced, or `input` itself when
 *         the receiver does not match anywhere.
 */
- (NSString *)replaceFirstInString:(NSString *)input withString:(NSString *)replace;

/**
 * Tells whether the receiver matches anywhere in `input`.
 *
 * @param input The string to search. The whole string is searched.
 * @return YES if there is at least one match, NO otherwise.
 */
- (BOOL)matchesWith:(NSString *)input;

@end

@implementation NSRegularExpression (Additions)

- (NSString *)replaceFirstInString:(NSString *)input withString:(NSString *)replace {
    // rangeOfFirstMatchInString: reports "no match" as {NSNotFound, 0}, so the
    // check below is reliable. Reading `.range` off a nil NSTextCheckingResult
    // would yield {0, 0} instead and insert `replace` at the start of `input`.
    NSRange matchRange = [self rangeOfFirstMatchInString:input
                                                 options:0
                                                   range:NSMakeRange(0, input.length)];
    if (matchRange.location == NSNotFound) {
        return input;
    }
    return [input stringByReplacingCharactersInRange:matchRange withString:replace];
}

- (BOOL)matchesWith:(NSString *)input {
    // Compare explicitly instead of converting a match count to BOOL: where
    // BOOL is a signed char, a count that is a multiple of 256 truncates to NO.
    NSRange matchRange = [self rangeOfFirstMatchInString:input
                                                 options:0
                                                   range:NSMakeRange(0, input.length)];
    return matchRange.location != NSNotFound;
}

@end
// Private state: the regular expressions used by -stem:. Each one is created
// lazily by its getter in the implementation.
@interface PorterStemmer ()
// Perfective gerund endings (ив, ивши, ившись, ыв, ывши, ывшись; в, вши, вшись after а or я).
@property (nonatomic, strong) NSRegularExpression *perfectiveGround;
// Reflexive endings ся / сь.
@property (nonatomic, strong) NSRegularExpression *reflexive;
// Adjectival endings (ее, ие, ые, ое, ими, ...).
@property (nonatomic, strong) NSRegularExpression *adjective;
// Participle endings (ивш, ывш, ующ; ем, нн, вш, ющ, щ after а or я).
@property (nonatomic, strong) NSRegularExpression *participle;
// Verb endings (ила, ыла, ена, ...; ла, на, ете, ... after а or я).
@property (nonatomic, strong) NSRegularExpression *verb;
// Noun endings (а, ев, ов, ие, ...).
@property (nonatomic, strong) NSRegularExpression *noun;
// Splits a word into the part up to and including its first vowel (group 1)
// and the remainder (group 2).
@property (nonatomic, strong) NSRegularExpression *rvre;
// Tests whether a string ending in ост / ость has a non-vowel followed by a
// vowel somewhere before that ending.
@property (nonatomic, strong) NSRegularExpression *derivational;
// Trailing ост / ость.
@property (nonatomic, strong) NSRegularExpression *der;
// Superlative endings ейш / ейше.
@property (nonatomic, strong) NSRegularExpression *superlative;
// Trailing и.
@property (nonatomic, strong) NSRegularExpression *i;
// Trailing ь (soft sign).
@property (nonatomic, strong) NSRegularExpression *p;
// Trailing нн.
@property (nonatomic, strong) NSRegularExpression *nn;
@end
/**
 * Regex-driven implementation of Porter's stemming algorithm for Russian.
 *
 * Every ending class is described by one regular expression that is compiled
 * on first use and cached in the matching property. -stem: splits the word
 * after its first vowel and strips endings only from the part after the split.
 */
@implementation PorterStemmer

#pragma mark - Pattern construction

/// Compiles `pattern` with `options`. The patterns are compile-time constants,
/// so a failure here is a programmer error and is asserted rather than handled.
static NSRegularExpression *PorterStemmerCompile(NSString *pattern,
                                                 NSRegularExpressionOptions options) {
    NSError *error = nil;
    NSRegularExpression *regex = [NSRegularExpression regularExpressionWithPattern:pattern
                                                                           options:options
                                                                             error:&error];
    // Success is signalled by the return value, not by the error pointer.
    NSCAssert(regex != nil, @"Invalid stemmer pattern %@: %@", pattern, error);
    return regex;
}

#pragma mark - Lazily created patterns

- (NSRegularExpression *)perfectiveGround {
    if (!_perfectiveGround) {
        // The whole alternation is wrapped in one group so that `$` anchors
        // every ending. Without the outer group only the look-behind branch is
        // anchored and "ив"/"ыв" etc. would be stripped from the middle of a word.
        _perfectiveGround = PorterStemmerCompile(@"((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$",
                                                 NSRegularExpressionCaseInsensitive);
    }
    return _perfectiveGround;
}

- (NSRegularExpression *)reflexive {
    if (!_reflexive) {
        _reflexive = PorterStemmerCompile(@"с[яь]$", NSRegularExpressionCaseInsensitive);
    }
    return _reflexive;
}

- (NSRegularExpression *)adjective {
    if (!_adjective) {
        _adjective = PorterStemmerCompile(@"(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$",
                                          NSRegularExpressionCaseInsensitive);
    }
    return _adjective;
}

- (NSRegularExpression *)participle {
    if (!_participle) {
        _participle = PorterStemmerCompile(@"((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$",
                                           NSRegularExpressionCaseInsensitive);
    }
    return _participle;
}

- (NSRegularExpression *)verb {
    if (!_verb) {
        _verb = PorterStemmerCompile(@"((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$",
                                     NSRegularExpressionCaseInsensitive);
    }
    return _verb;
}

- (NSRegularExpression *)noun {
    if (!_noun) {
        _noun = PorterStemmerCompile(@"(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$",
                                     NSRegularExpressionCaseInsensitive);
    }
    return _noun;
}

- (NSRegularExpression *)rvre {
    if (!_rvre) {
        // Group 1: everything up to and including the first vowel.
        // Group 2: the remainder, which is the only part endings are removed from.
        _rvre = PorterStemmerCompile(@"^(.*?[аеиоуыэюя])(.*)$",
                                     NSRegularExpressionAnchorsMatchLines);
    }
    return _rvre;
}

- (NSRegularExpression *)derivational {
    if (!_derivational) {
        _derivational = PorterStemmerCompile(@".*[^аеиоуыэюя]+[аеиоуыэюя].*ость?$",
                                             NSRegularExpressionCaseInsensitive);
    }
    return _derivational;
}

- (NSRegularExpression *)der {
    if (!_der) {
        _der = PorterStemmerCompile(@"ость?$", NSRegularExpressionCaseInsensitive);
    }
    return _der;
}

- (NSRegularExpression *)superlative {
    if (!_superlative) {
        _superlative = PorterStemmerCompile(@"(ейше|ейш)$", NSRegularExpressionCaseInsensitive);
    }
    return _superlative;
}

- (NSRegularExpression *)i {
    if (!_i) {
        _i = PorterStemmerCompile(@"и$", NSRegularExpressionCaseInsensitive);
    }
    return _i;
}

- (NSRegularExpression *)p {
    if (!_p) {
        _p = PorterStemmerCompile(@"ь$", NSRegularExpressionCaseInsensitive);
    }
    return _p;
}

- (NSRegularExpression *)nn {
    if (!_nn) {
        _nn = PorterStemmerCompile(@"нн$", NSRegularExpressionCaseInsensitive);
    }
    return _nn;
}

#pragma mark - Stemming

/**
 * Returns the stem of `word`.
 *
 * The word is lowercased and "ё" is folded to "е". It is then split after its
 * first vowel; endings are removed only from the part after the split and the
 * two parts are joined again.
 *
 * @param word The word to stem. Nil or empty input is returned as is.
 * @return The stemmed, lowercased word. A word without any of the vowels
 *         а, е, и, о, у, ы, э, ю, я is returned lowercased and otherwise unchanged.
 */
- (NSString *)stem:(NSString *)word {
    if (word.length == 0) {
        return word;
    }

    // Fold "ё" to the Cyrillic "е" (U+0435). The escape keeps it from being
    // confused with the visually identical Latin "e", which none of the
    // Cyrillic patterns would recognise.
    word = [[word lowercaseString] stringByReplacingOccurrencesOfString:@"ё" withString:@"\u0435"];

    NSTextCheckingResult *split = [self.rvre firstMatchInString:word
                                                        options:0
                                                          range:NSMakeRange(0, word.length)];
    if (!split) {
        // No vowel: there is no region to remove endings from.
        return word;
    }
    NSString *pre = [word substringWithRange:[split rangeAtIndex:1]];
    NSString *rv = [word substringWithRange:[split rangeAtIndex:2]];

    // Step 1: a perfective gerund ending wins; otherwise drop a reflexive
    // ending and then try adjectival (plus participle), verb, and noun endings
    // in that order, stopping at the first class that removes something.
    NSString *stripped = [self.perfectiveGround replaceFirstInString:rv withString:@""];
    if (![stripped isEqualToString:rv]) {
        rv = stripped;
    } else {
        rv = [self.reflexive replaceFirstInString:rv withString:@""];
        stripped = [self.adjective replaceFirstInString:rv withString:@""];
        if (![stripped isEqualToString:rv]) {
            rv = [self.participle replaceFirstInString:stripped withString:@""];
        } else {
            stripped = [self.verb replaceFirstInString:rv withString:@""];
            if (![stripped isEqualToString:rv]) {
                rv = stripped;
            } else {
                rv = [self.noun replaceFirstInString:rv withString:@""];
            }
        }
    }

    // Step 2: drop a trailing "и".
    rv = [self.i replaceFirstInString:rv withString:@""];

    // Step 3: drop a derivational "ост"/"ость" when the qualifying pattern matches.
    if ([self.derivational matchesWith:rv]) {
        rv = [self.der replaceFirstInString:rv withString:@""];
    }

    // Step 4: drop a trailing soft sign; otherwise drop a superlative ending
    // and reduce a trailing double "нн" to a single "н".
    stripped = [self.p replaceFirstInString:rv withString:@""];
    if (![stripped isEqualToString:rv]) {
        rv = stripped;
    } else {
        rv = [self.superlative replaceFirstInString:rv withString:@""];
        // Only substitute when the pattern matches, so the replacement text is
        // never applied to a string that does not end in the doubled "н".
        if ([self.nn matchesWith:rv]) {
            rv = [self.nn replaceFirstInString:rv withString:@"н"];
        }
    }

    return [pre stringByAppendingString:rv];
}

@end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment