Created
October 26, 2011 08:49
-
-
Save Techmind/1315813 to your computer and use it in GitHub Desktop.
Riak Search Russian-language stemmer and tokenizer (analyzer factory for the `ru_stem` Erlang module)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%% @doc Russian-language analyzer for Riak Search: tokenizes input text
%% roughly like Lucene's StandardAnalyzerFactory and stems Russian words,
%% emitting `skip' placeholders for Russian stopwords.
-module(ru_stem).

-export([
    standard_analyzer_factory/2
]).

%% Character-class macros used in tokenizer guards. The numeric ranges
%% cover the Cyrillic letters: 1040-1071 is А-Я, 1072-1103 is а-я.
-define(UPPERCHAR(C), (C >= $A andalso C =< $Z) orelse (C >= 1040 andalso C =< 1071)).
-define(LOWERCHAR(C), (C >= $a andalso C =< $z) orelse (C >= 1072 andalso C =< 1103)).
-define(NUMBER(C), (C >= $0 andalso C =< $9)).
%% Uses =:= (exact equality) rather than ==; behavior is identical for
%% character integers, but =:= is the idiomatic comparison here.
%% NOTE(review): this macro is not referenced by the visible code.
-define(WHITESPACE(C), ((C =:= $\s) orelse (C =:= $\n) orelse (C =:= $\t) orelse (C =:= $\f) orelse (C =:= $\r) orelse (C =:= $\v))).
%% @doc Tokenize incoming text using roughly the same rules as the
%% StandardAnalyzerFactory in Lucene/Java.
%% The second argument is either a one-element list holding the minimum
%% term length — as an integer or a string such as "3" — or anything
%% else, in which case the minimum length defaults to 3.
standard_analyzer_factory(Text, [MinLength]) when is_integer(MinLength) ->
    %% Generalization: accept an already-parsed integer, which the
    %% original list_to_integer/1 call would have crashed on.
    {ok, standard(unicode:characters_to_list(Text), MinLength, [], [])};
standard_analyzer_factory(Text, [MinLengthArg]) ->
    MinLength = list_to_integer(MinLengthArg),
    {ok, standard(unicode:characters_to_list(Text), MinLength, [], [])};
standard_analyzer_factory(Text, _Other) ->
    %% No usable option given: default minimum term length of 3.
    {ok, standard(unicode:characters_to_list(Text), 3, [], [])}.
%% Tokenizer main loop: walks the character list, accumulating the
%% current token (reversed) in TokenAcc and completed terms in Terms.
standard([Ch | Rest], MinLength, TokenAcc, Terms) when ?UPPERCHAR(Ch) ->
    %% Fold upper case to lower case; both the ASCII and the Cyrillic
    %% alphabets are offset by 32 between cases.
    standard(Rest, MinLength, [Ch + 32 | TokenAcc], Terms);
standard([Ch | Rest], MinLength, TokenAcc, Terms) when ?LOWERCHAR(Ch) orelse ?NUMBER(Ch) ->
    standard(Rest, MinLength, [Ch | TokenAcc], Terms);
standard([$., Ch | Rest], MinLength, TokenAcc, Terms) when ?UPPERCHAR(Ch) ->
    %% A dot immediately followed by a letter stays inside the token
    %% (e.g. host names, acronyms), mirroring Lucene's tokenizer.
    standard(Rest, MinLength, [Ch + 32, $. | TokenAcc], Terms);
standard([$., Ch | Rest], MinLength, TokenAcc, Terms) when ?LOWERCHAR(Ch) orelse ?NUMBER(Ch) ->
    standard(Rest, MinLength, [Ch, $. | TokenAcc], Terms);
standard([_Separator | Rest], MinLength, TokenAcc, Terms) ->
    %% Any other character terminates the current token.
    standard_termify(Rest, MinLength, TokenAcc, Terms);
standard([], MinLength, TokenAcc, Terms) ->
    standard_termify([], MinLength, TokenAcc, Terms).
%% Determine if the just-finished token (held reversed in the
%% accumulator) is valid; if so, add its stemmed form to the result
%% list. Stopwords contribute a `skip' placeholder so term positions
%% stay Lucene-compatible; too-short tokens are dropped silently.
standard_termify([], _MinLength, [], Terms) ->
    %% Input exhausted with no pending token: we are done.
    lists:reverse(Terms);
standard_termify(Rest, MinLength, [], Terms) ->
    %% Empty token (consecutive separators): keep scanning.
    standard(Rest, MinLength, [], Terms);
standard_termify(Rest, MinLength, TokenAcc, Terms) when length(TokenAcc) < MinLength ->
    %% Mimic org.apache.lucene.analysis.LengthFilter, which does not
    %% increment the position index for too-short tokens.
    standard(Rest, MinLength, [], Terms);
standard_termify(Rest, MinLength, TokenAcc, Terms) ->
    Term = lists:reverse(TokenAcc),
    NewTerms =
        case is_russian_stopword(Term) of
            true ->
                %% Stopword: keep a position placeholder.
                [skip | Terms];
            false ->
                [unicode:characters_to_binary(ru_stem(Term)) | Terms]
        end,
    standard(Rest, MinLength, [], NewTerms).
%% @doc True when Term (a list of Unicode codepoints) is a Russian
%% stopword. Terms are bucketed by length so only one short list is
%% scanned per lookup.
%%
%% Fix: the original used ordsets:is_element/2, but these literal lists
%% are NOT sorted, so the ordered-set early-exit could stop before the
%% member was reached (e.g. "у" in the length-1 list was never found
%% because "я" sorts after it and appears earlier). lists:member/2
%% performs a plain linear scan, which is what this data needs.
is_russian_stopword([_] = Term) ->
    lists:member(Term, ["а","я","у","с","о","к","и","в"]);
is_russian_stopword([_, _] = Term) ->
    lists:member(Term, ["мы","ты","вы","да","до","ее","ей","ею","же","за","из","им","их","ко","ли","во","на","не","ни","но","ну","об","он","от","по","бы","со","те","то"]);
is_russian_stopword([_, _, _] = Term) ->
    lists:member(Term, ["без","чья","еще","вас","вам","наш","где","нет","нее","них","эти","для","был","или","чье","вот","она","они","оно","его","что","как","под","при","все","эта","так","чем","чей","там","кто","тем","это","той","том","мне","уже"]);
is_russian_stopword([_, _, _, _] = Term) ->
    lists:member(Term, ["есть","него","было","весь","либо","всех","даже","того","тоже","была","чего","быть","если","хотя","были","надо"]);
is_russian_stopword([_, _, _, _, _] = Term) ->
    lists:member(Term, ["более","такой","также","очень","чтобы","всего","может","когда","здесь"]);
is_russian_stopword([_, _, _, _, _, _] = Term) ->
    lists:member(Term, ["только","однако"]);
is_russian_stopword(_Term) ->
    false.
%% English stopword check, bucketed by term length. These literal lists
%% are sorted, so ordsets:is_element/2 is a valid membership test here.
%% (Not referenced by the exported analyzer; retained as-is.)
is_stopword([_, _] = Term) ->
    ordsets:is_element(Term, ["an", "as", "at", "be", "by", "if", "in", "is", "it", "no", "of", "on", "or", "to"]);
is_stopword([_, _, _] = Term) ->
    ordsets:is_element(Term, ["and", "are", "but", "for", "not", "the", "was"]);
is_stopword([_, _, _, _] = Term) ->
    ordsets:is_element(Term, ["into", "such", "that", "then", "they", "this", "will"]);
is_stopword([_, _, _, _, _] = Term) ->
    ordsets:is_element(Term, ["their", "there", "these"]);
is_stopword(_Term) ->
    false.
%% True when the codepoint is one of the Russian vowels "аеиоуыэюя".
%% Latin vowels deliberately return false — the stemmer is
%% Russian-only.
is_vowel(Codepoint) ->
    Vowels = [1072, 1077, 1080, 1086, 1091, 1099, 1101, 1102, 1103],
    lists:member(Codepoint, Vowels).
%% Stem a Russian word (list of codepoints). The word is split at its
%% first vowel; the prefix up to and including that vowel is kept
%% verbatim and the ending-stripping steps run on the remainder. Words
%% with no vowel are returned unchanged.
ru_stem(Word) ->
    {Head, Tail} = stemmer_split(Word, fun is_vowel/1),
    case Tail of
        [] ->
            %% No vowel: nothing to stem.
            Word;
        _ ->
            %% Step 1: perfective-gerund / reflexive / adjective /
            %% verb / noun endings.
            Step1 = drop_perfective(Tail),
            %% Step 2: trailing "и" ([1080]).
            {_, Step2} = drop_endings(Step1, [[1080]]),
            %% Step 3: derivational suffix ("ость"/"ост").
            Step3 = drop_derivitional(Step2),
            %% Step 4: trailing soft sign "ь" ([1100]).
            {SoftSignDropped, Step4} = drop_endings(Step3, [[1100]]),
            Final =
                case SoftSignDropped of
                    true ->
                        Step4;
                    false ->
                        %% Remove superlative "ейше?" anywhere, then
                        %% collapse a trailing double "нн" to "н".
                        Cleaned = drop_anywhere(Step4, <<208,181,208,185,209,136,208,181,63>>),
                        case drop_endings(Cleaned, [[1085,1085]]) of
                            {true, Stripped} ->
                                lists:append(Stripped, [1085]);
                            {false, _} ->
                                Cleaned
                        end
                end,
            lists:append(Head, Final)
    end.
%% Count vowel-followed-by-consonant ("slog"/syllable) groups in Word,
%% stopping as soon as MinCount groups have been seen.
min_slog_count(Word, MinCount) ->
    min_slog_count(Word, MinCount, 0).
%% @doc Remove the first match of Regexp (a UTF-8 encoded regex, here
%% "ейше?") from Word (a list of codepoints); return Word unchanged
%% when the pattern does not match.
%%
%% Fix: the original computed `length(BinList) == length(IoList)` in an
%% `if` guard where both operands were a binary / an re:replace iolist;
%% length/1 on a binary raises, a raising guard is treated as false, so
%% the function ALWAYS fell through to the `true' branch and returned
%% Word unchanged — the replacement never took effect. We now request a
%% binary result, compile the pattern with the `unicode' option (so the
%% `?' quantifier applies to the whole character "е" rather than to the
%% last byte of its UTF-8 encoding), and convert the result back to a
%% codepoint list.
drop_anywhere(Word, Regexp) ->
    Subject = unicode:characters_to_binary(Word),
    Replaced = re:replace(Subject, Regexp, <<>>, [unicode, {return, binary}]),
    case unicode:characters_to_list(Replaced) of
        Result when is_list(Result) ->
            Result;
        _ ->
            %% Defensive: keep the original word if the replacement
            %% somehow produced invalid UTF-8.
            Word
    end.
%% Scan adjacent character pairs; a vowel immediately followed by a
%% consonant counts as one syllable group. Stops early once Acc has
%% reached MinCount.
min_slog_count([_, _ | _], MinCount, Acc) when Acc >= MinCount ->
    Acc;
min_slog_count([A, B | Rest], MinCount, Acc) ->
    case {is_vowel(A), is_vowel(B)} of
        {true, false} ->
            %% Vowel + consonant: one group; continue after the pair.
            min_slog_count(Rest, MinCount, Acc + 1);
        {_, true} ->
            %% B is a vowel: re-examine it as the start of a new pair.
            min_slog_count([B | Rest], MinCount, Acc);
        {false, false} ->
            %% NOTE(review): this skips B entirely rather than pairing
            %% it with the next character — presumably intentional, but
            %% worth confirming against the reference stemmer.
            min_slog_count(Rest, MinCount, Acc)
    end;
min_slog_count([_Last], _MinCount, Acc) ->
    Acc;
min_slog_count([], _MinCount, Acc) ->
    Acc.
%% Step 3 of the stemmer: drop the derivational suffix "ость"/"ост",
%% but only when the word contains at least two (vowel, consonant)
%% syllable groups. Original reference regex: word ends with
%% [^аеиоуыэюя][аеиоуыэюя]+[^аеиоуыэюя]+[аеиоуыэюя].*(?<=о)сть?$
drop_derivitional(Word) ->
    %% [[1086,1089,1090,1100],[1086,1089,1090]] == ["ость", "ост"]
    case ends_with(Word, [[1086,1089,1090,1100], [1086,1089,1090]]) of
        false ->
            Word;
        Suffix ->
            %% Require two syllable groups before stripping. Note the
            %% count runs over the whole word, suffix included.
            case min_slog_count(Word, 2) of
                2 ->
                    lists:sublist(Word, 1, length(Word) - length(Suffix));
                _ ->
                    Word
            end
    end.
%% Try the FullGroup endings first; if none matches, fall back to
%% PartGroup via drop_endings_lengthed/3, which removes Length extra
%% characters beyond the matched suffix.
%% NOTE(review): the PHP reference uses a lookbehind (?<=[ая]) that
%% keeps the preceding vowel, whereas here the PartGroup suffixes
%% include that vowel AND Length more characters are stripped — confirm
%% this asymmetry is intended.
drop_groups(Word, FullGroup, PartGroup, Length) ->
    case drop_endings(Word, FullGroup) of
        {true, Stripped} ->
            {true, Stripped};
        {false, _} ->
            drop_endings_lengthed(Word, PartGroup, Length)
    end.
%% Step 1 of the stemmer. Despite the name, this performs the whole
%% first step: try perfective-gerund endings; otherwise strip a
%% reflexive ending, then either adjective (+ participle) endings or
%% verb endings, falling back to noun endings.
%% PHP reference: $PERFECTIVEGROUND =
%%   '/((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$/'
drop_perfective(Word) ->
    %% ["ив", "ивши", "ившись", "ыв", "ывши", "ывшись"]
    FullEndings = [[1080,1074],
                   [1080,1074,1096,1080],
                   [1080,1074,1096,1080,1089,1100],
                   [1099,1074],
                   [1099,1074,1096,1080],
                   [1099,1074,1096,1080,1089,1100]],
    %% ["ав", "авши", "авшись", "яв", "явши", "явшись"]
    PrefixedEndings = [[1072,1074],
                       [1072,1074,1096,1080],
                       [1072,1074,1096,1080,1089,1100],
                       [1103,1074],
                       [1103,1074,1096,1080],
                       [1103,1074,1096,1080,1089,1100]],
    case drop_groups(Word, FullEndings, PrefixedEndings, 1) of
        {true, Stemmed} ->
            %% A perfective-gerund ending was removed: step 1 is done.
            Stemmed;
        {false, Word2} ->
            {_, Word3} = drop_reflective(Word2),
            case drop_adjectibe(Word3) of
                {true, Word4} ->
                    {_, Word5} = drop_participle(Word4),
                    Word5;
                {false, Word4} ->
                    case drop_verb(Word4) of
                        {true, Word5} ->
                            Word5;
                        {false, Word5} ->
                            {_, Word6} = drop_noun(Word5),
                            Word6
                    end
            end
    end.
%% Noun endings, in the original priority order (ends_with/2 returns
%% the FIRST match, so order matters):
%% ["а","ев","ов","ие","ье","е","иями","ями","ами","еи","ии","и","ией",
%%  "ей","ой","ий","й","иям","ям","ием","ем","ам","ом","о","у","ах",
%%  "иях","ях","ы","ь","ию","ью","ю","ия","ья","я"]
drop_noun(Word) ->
    Endings = [[1072], [1077,1074], [1086,1074], [1080,1077], [1100,1077],
               [1077], [1080,1103,1084,1080], [1103,1084,1080],
               [1072,1084,1080], [1077,1080], [1080,1080], [1080],
               [1080,1077,1081], [1077,1081], [1086,1081], [1080,1081],
               [1081], [1080,1103,1084], [1103,1084], [1080,1077,1084],
               [1077,1084], [1072,1084], [1086,1084], [1086], [1091],
               [1072,1093], [1080,1103,1093], [1103,1093], [1099], [1100],
               [1080,1102], [1100,1102], [1102], [1080,1103], [1100,1103],
               [1103]],
    drop_endings(Word, Endings).
%% Reflexive endings ["ся", "сь"].
drop_reflective(Word) ->
    drop_endings(Word, [[1089,1103], [1089,1100]]).
%% Adjective endings (function name kept as-is, typo included, since
%% callers reference it). Original priority order preserved:
%% ["ее","ие","ые","ое","ими","ыми","ей","ий","ый","ой","ем","им","ым",
%%  "ом","его","ого","ему","ому","их","ых","ую","юю","ая","яя","ою","ею"]
drop_adjectibe(Word) ->
    Endings = [[1077,1077], [1080,1077], [1099,1077], [1086,1077],
               [1080,1084,1080], [1099,1084,1080], [1077,1081],
               [1080,1081], [1099,1081], [1086,1081], [1077,1084],
               [1080,1084], [1099,1084], [1086,1084], [1077,1075,1086],
               [1086,1075,1086], [1077,1084,1091], [1086,1084,1091],
               [1080,1093], [1099,1093], [1091,1102], [1102,1102],
               [1072,1103], [1103,1103], [1086,1102], [1077,1102]],
    drop_endings(Word, Endings).
%% Participle endings. The direct group ["ивш", "ывш", "ующ"] is
%% stripped as-is; the second group ["аем","анн","авш","ающ","ащ",
%% "яем","янн","явш","яющ","ящ"] is stripped together with one extra
%% preceding character (Length = 1 in drop_groups/4).
drop_participle(Word) ->
    DirectEndings = [[1080,1074,1096], [1099,1074,1096], [1091,1102,1097]],
    PrefixedEndings =
        [[1072,1077,1084],
         [1072,1085,1085],
         [1072,1074,1096],
         [1072,1102,1097],
         [1072,1097],
         [1103,1077,1084],
         [1103,1085,1085],
         [1103,1074,1096],
         [1103,1102,1097],
         [1103,1097]],
    drop_groups(Word, DirectEndings, PrefixedEndings, 1).
%% Verb endings. Direct group (stripped as-is):
%% ["ила","ыла","ена","ейте","уйте","ите","или","ыли","ей","уй","ил",
%%  "ыл","им","ым","ен","ило","ыло","ено","ят","ует","уют","ит","ыт",
%%  "ены","ить","ыть","ишь","ую","ю"]
%% Prefixed group (stripped with one extra preceding character):
%% ["ала","ана","аете","айте","али","ай","ал","аем","ан","ало","ано",
%%  "ает","ают","аны","ать","аешь","анно","яла","яна","яете","яйте",
%%  "яли","яй","ял","яем","ян","яло","яно","яет","яют","яны","ять",
%%  "яешь","янно"]
%% Order is preserved from the original — ends_with/2 takes the first
%% match.
drop_verb(Word) ->
    DirectEndings =
        [[1080,1083,1072],
         [1099,1083,1072],
         [1077,1085,1072],
         [1077,1081,1090,1077],
         [1091,1081,1090,1077],
         [1080,1090,1077],
         [1080,1083,1080],
         [1099,1083,1080],
         [1077,1081],
         [1091,1081],
         [1080,1083],
         [1099,1083],
         [1080,1084],
         [1099,1084],
         [1077,1085],
         [1080,1083,1086],
         [1099,1083,1086],
         [1077,1085,1086],
         [1103,1090],
         [1091,1077,1090],
         [1091,1102,1090],
         [1080,1090],
         [1099,1090],
         [1077,1085,1099],
         [1080,1090,1100],
         [1099,1090,1100],
         [1080,1096,1100],
         [1091,1102],
         [1102]],
    PrefixedEndings =
        [[1072,1083,1072],
         [1072,1085,1072],
         [1072,1077,1090,1077],
         [1072,1081,1090,1077],
         [1072,1083,1080],
         [1072,1081],
         [1072,1083],
         [1072,1077,1084],
         [1072,1085],
         [1072,1083,1086],
         [1072,1085,1086],
         [1072,1077,1090],
         [1072,1102,1090],
         [1072,1085,1099],
         [1072,1090,1100],
         [1072,1077,1096,1100],
         [1072,1085,1085,1086],
         [1103,1083,1072],
         [1103,1085,1072],
         [1103,1077,1090,1077],
         [1103,1081,1090,1077],
         [1103,1083,1080],
         [1103,1081],
         [1103,1083],
         [1103,1077,1084],
         [1103,1085],
         [1103,1083,1086],
         [1103,1085,1086],
         [1103,1077,1090],
         [1103,1102,1090],
         [1103,1085,1099],
         [1103,1090,1100],
         [1103,1077,1096,1100],
         [1103,1085,1085,1086]],
    drop_groups(Word, DirectEndings, PrefixedEndings, 1).
%% Drop the first matching ending from Word, removing no extra
%% characters beyond the suffix itself.
drop_endings(Word, Endings) ->
    drop_endings_lengthed(Word, Endings, 0).
%% Drop the first ending (from Endings) that Word ends with, plus
%% Length additional trailing characters. Returns {true, Stripped} on a
%% match, {false, Word} otherwise.
drop_endings_lengthed(Word, Endings, Length) ->
    case ends_with(Word, Endings) of
        false ->
            {false, Word};
        Suffix ->
            Keep = length(Word) - length(Suffix) - Length,
            {true, lists:sublist(Word, 1, Keep)}
    end.
%% Return the first element of the candidate list that Word ends with,
%% or false when none matches. List order therefore acts as priority.
ends_with(_Word, []) ->
    false;
ends_with(Word, [Suffix | Rest]) when length(Word) >= length(Suffix) ->
    case lists:suffix(Suffix, Word) of
        true ->
            Suffix;
        false ->
            ends_with(Word, Rest)
    end;
ends_with(Word, [_TooLong | Rest]) ->
    %% Candidate longer than the word itself: cannot match.
    ends_with(Word, Rest).
%% Split Word at the first character for which Fun returns true:
%% yields {Prefix, Rest} where Prefix ends with that character. When no
%% character satisfies Fun, returns {ReversedWord, []} — note the first
%% element is NOT re-reversed in that case; the only caller in this
%% module (ru_stem/1) returns the word unchanged when the second
%% element is empty, so the quirk is harmless there.
stemmer_split(Word, Fun) ->
    stemmer_split_inner(Word, Fun, []).

stemmer_split_inner([Char | Rest], Fun, Acc) ->
    case Fun(Char) of
        true ->
            {lists:reverse([Char | Acc]), Rest};
        _ ->
            stemmer_split_inner(Rest, Fun, [Char | Acc])
    end;
stemmer_split_inner([], _Fun, Acc) ->
    %% Exhausted without a match: Acc holds the whole word reversed.
    {Acc, []}.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment