Skip to content

Instantly share code, notes, and snippets.

@Techmind
Created October 26, 2011 08:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Techmind/1315813 to your computer and use it in GitHub Desktop.
Save Techmind/1315813 to your computer and use it in GitHub Desktop.
riak search ru stemmer
-module(ru_stem).
-export([
standard_analyzer_factory/2
]).
%% Character-class guard macros used by the tokenizer.
%% Every body is wrapped in its own outer parentheses so that a macro can be
%% combined with `andalso'/`orelse' at the call site without the surrounding
%% operators re-associating with the macro's internals.
%%
%% UPPERCHAR: Latin A-Z or code points 1040..1071 (Cyrillic А..Я).
-define(UPPERCHAR(C), ((C >= $A andalso C =< $Z) orelse (C >= 1040 andalso C =< 1071))).
%% LOWERCHAR: Latin a-z or code points 1072..1103 (Cyrillic а..я).
-define(LOWERCHAR(C), ((C >= $a andalso C =< $z) orelse (C >= 1072 andalso C =< 1103))).
%% NUMBER: ASCII digits 0-9.
-define(NUMBER(C), (C >= $0 andalso C =< $9)).
%% WHITESPACE: space, newline, tab, form feed, carriage return, vertical tab.
-define(WHITESPACE(C), ((C == $\s) orelse (C == $\n) orelse (C == $\t) orelse (C == $\f) orelse (C == $\r) orelse (C == $\v))).
%% @doc Tokenize incoming text using roughly the same rules as the
%% StandardAnalyzerFactory in Lucene/Java.
%%
%% `Text' is converted to a list of code points with
%% unicode:characters_to_list/1 before tokenizing.
%% When the second argument is a one-element list, that element is parsed
%% with list_to_integer/1 and used as the minimum term length; any other
%% second argument selects the default minimum length of 3.
%% Returns `{ok, Tokens}' where Tokens is whatever standard/4 produces.
standard_analyzer_factory(Text, Args) ->
    MinLength =
        case Args of
            [MinLengthArg] -> list_to_integer(MinLengthArg);
            _Other -> 3
        end,
    {ok, standard(unicode:characters_to_list(Text), MinLength, [], [])}.
%% Scan the code-point list, collecting the current term (reversed) in Acc
%% and the finished tokens (reversed) in ResultAcc.
%%
%% * End of input: hand whatever is in Acc to standard_termify/4.
%% * A dot immediately followed by a letter or digit stays inside the term.
%% * Letters and digits extend the term; uppercase characters are shifted
%%   by 32 code points, which maps A-Z to a-z and 1040..1071 to 1072..1103.
%% * Any other character ends the current term and is itself discarded.
standard([], MinLength, Acc, ResultAcc) ->
    standard_termify([], MinLength, Acc, ResultAcc);
standard([$., Next | Rest], MinLength, Acc, ResultAcc) when ?UPPERCHAR(Next) ->
    standard(Rest, MinLength, [Next + 32, $. | Acc], ResultAcc);
standard([$., Next | Rest], MinLength, Acc, ResultAcc) when ?LOWERCHAR(Next) orelse ?NUMBER(Next) ->
    standard(Rest, MinLength, [Next, $. | Acc], ResultAcc);
standard([Char | Rest], MinLength, Acc, ResultAcc) when ?UPPERCHAR(Char) ->
    standard(Rest, MinLength, [Char + 32 | Acc], ResultAcc);
standard([Char | Rest], MinLength, Acc, ResultAcc) when ?LOWERCHAR(Char) orelse ?NUMBER(Char) ->
    standard(Rest, MinLength, [Char | Acc], ResultAcc);
standard([_Separator | Rest], MinLength, Acc, ResultAcc) ->
    standard_termify(Rest, MinLength, Acc, ResultAcc).
%% Decide what to do with the term collected so far (Acc, reversed) and
%% continue scanning the remaining input.
%%
%% * Nothing collected and no input left: return the tokens in input order.
%% * Nothing collected, or the term is shorter than MinLength: drop it and
%%   keep scanning. This mimics org.apache.lucene.analysis.LengthFilter,
%%   which does not increment the position index.
%% * Otherwise: a Russian stopword contributes the atom `skip'; any other
%%   term is stemmed with ru_stem/1 and stored as a UTF-8 binary.
standard_termify([], _MinLength, [], ResultAcc) ->
    lists:reverse(ResultAcc);
standard_termify(Rest, MinLength, Acc, ResultAcc) when Acc =:= []; length(Acc) < MinLength ->
    standard(Rest, MinLength, [], ResultAcc);
standard_termify(Rest, MinLength, Acc, ResultAcc) ->
    Term = lists:reverse(Acc),
    Entry =
        case is_russian_stopword(Term) of
            true -> skip;
            false -> unicode:characters_to_binary(ru_stem(Term))
        end,
    standard(Rest, MinLength, [], [Entry | ResultAcc]).
%% Return true when Term (a list of code points, already lowercased by the
%% tokenizer) is one of the Russian stopwords below, false otherwise.
%%
%% Words are grouped by length and the clause is chosen by the shape of the
%% list, so only the table of matching length is searched.
%% lists:member/2 is used on purpose: the tables are NOT sorted, and
%% ordsets:is_element/2 stops at the first element greater than the key, so
%% it silently misses entries in an unsorted list.
%% NOTE: the string literals are code-point lists only when the source file
%% is read as UTF-8 (the compiler default since OTP 17).
is_russian_stopword([_] = Term) ->
    lists:member(Term, ["а","я","у","с","о","к","и","в"]);
is_russian_stopword([_, _] = Term) ->
    lists:member(Term, ["мы","ты","вы","да","до","ее","ей","ею","же","за","из","им","их","ко","ли","во","на","не","ни","но","ну","об","он","от","по","бы","со","те","то"]);
is_russian_stopword([_, _, _] = Term) ->
    lists:member(Term, ["без","чья","еще","вас","вам","наш","где","нет","нее","них","эти","для","был","или","чье","вот","она","они","оно","его","что","как","под","при","все","эта","так","чем","чей","там","кто","тем","это","той","том","мне","уже"]);
is_russian_stopword([_, _, _, _] = Term) ->
    lists:member(Term, ["есть","него","было","весь","либо","всех","даже","того","тоже","была","чего","быть","если","хотя","были","надо"]);
is_russian_stopword([_, _, _, _, _] = Term) ->
    lists:member(Term, ["более","такой","также","очень","чтобы","всего","может","когда","здесь"]);
is_russian_stopword([_, _, _, _, _, _] = Term) ->
    lists:member(Term, ["только","однако"]);
is_russian_stopword(_Term) ->
    false.
%% Return true when Term is one of the English stopwords of length 2..5
%% listed below, false for anything else (including non-list terms).
is_stopword(Term) ->
    ordsets:is_element(Term, english_stopwords(Term)).

%% Pick the (sorted) stopword table whose word length equals the length of
%% Term; terms of any other shape get the empty set.
english_stopwords([_, _]) ->
    ["an", "as", "at", "be", "by", "if", "in", "is", "it", "no", "of", "on", "or", "to"];
english_stopwords([_, _, _]) ->
    ["and", "are", "but", "for", "not", "the", "was"];
english_stopwords([_, _, _, _]) ->
    ["into", "such", "that", "then", "they", "this", "will"];
english_stopwords([_, _, _, _, _]) ->
    ["their", "there", "these"];
english_stopwords(_Other) ->
    [].
%% Return true when Char is the code point of one of the lowercase Russian
%% vowels а е и о у ы э ю я, false for any other term.
is_vowel(1072) -> true;   % а
is_vowel(1077) -> true;   % е
is_vowel(1080) -> true;   % и
is_vowel(1086) -> true;   % о
is_vowel(1091) -> true;   % у
is_vowel(1099) -> true;   % ы
is_vowel(1101) -> true;   % э
is_vowel(1102) -> true;   % ю
is_vowel(1103) -> true;   % я
is_vowel(_Other) -> false.
%% Stem a lowercase Russian word given as a list of code points.
%%
%% The word is split right after its first vowel. The part up to and
%% including that vowel is kept untouched; the remainder is run through the
%% four stemming steps and glued back on. A word with nothing after its
%% first vowel (or with no vowel at all) is returned unchanged.
ru_stem(Word) ->
    case stemmer_split(Word, fun is_vowel/1) of
        {Head, [_ | _] = Region} ->
            %% step 1
            AfterStep1 = drop_perfective(Region),
            %% step 2: trailing "и"
            {_, AfterStep2} = drop_endings(AfterStep1, [[1080]]),
            %% step 3
            AfterStep3 = drop_derivitional(AfterStep2),
            %% step 4
            Head ++ ru_stem_step4(AfterStep3);
        _NothingToStem ->
            Word
    end.

%% Step 4: if a trailing "ь" can be dropped, that is the whole step.
%% Otherwise the word is passed through drop_anywhere/2 with the pattern
%% "ейше?" and a trailing "нн" is then reduced to a single "н".
ru_stem_step4(Word) ->
    case drop_endings(Word, [[1100]]) of
        {true, WithoutSoftSign} ->
            WithoutSoftSign;
        {_, Unchanged} ->
            Candidate = drop_anywhere(Unchanged, <<208,181,208,185,209,136,208,181,63>>),
            case drop_endings(Candidate, [[1085,1085]]) of
                {true, WithoutDoubleN} -> WithoutDoubleN ++ [1085];
                {false, _} -> Candidate
            end
    end.
%% Count syllable-like units ("slog" = syllable) in Word, giving up once
%% MinCount of them have been seen. Starts the 3-arity worker at zero.
min_slog_count(Word, MinCount) -> min_slog_count(Word, MinCount, 0).
%% Remove the first match of Regexp from Word, wherever it occurs.
%%
%% Word   - list of code points.
%% Regexp - regular expression as a UTF-8 encoded binary.
%% Returns the resulting list of code points; Word comes back unchanged
%% (element for element) when the pattern does not match.
%%
%% The `unicode' option makes `re' read both pattern and subject as UTF-8,
%% so a quantifier such as `?' applies to a whole character rather than to
%% the last byte of its encoding. `{return, binary}' gives a flat binary
%% that can be converted straight back to a code-point list.
drop_anywhere(Word, Regexp) ->
    Subject = unicode:characters_to_binary(Word),
    Replaced = re:replace(Subject, Regexp, <<>>, [unicode, {return, binary}]),
    unicode:characters_to_list(Replaced).
%% Worker for min_slog_count/2. Looks at the first two characters:
%%  * counter already at MinCount         -> stop and return it;
%%  * second character is a vowel         -> slide forward by one;
%%  * vowel followed by a non-vowel       -> count one unit, skip both;
%%  * two non-vowels                      -> skip both.
%% With fewer than two characters left the counter is returned as is.
min_slog_count([_, _ | _], MinCount, Acc) when Acc >= MinCount ->
    Acc;
min_slog_count([First, Second | Tail], MinCount, Acc) ->
    case is_vowel(Second) of
        true ->
            min_slog_count([Second | Tail], MinCount, Acc);
        false ->
            Step = case is_vowel(First) of
                       true -> 1;
                       false -> 0
                   end,
            min_slog_count(Tail, MinCount, Acc + Step)
    end;
min_slog_count([_Last], _MinCount, Acc) ->
    Acc;
min_slog_count([], _MinCount, Acc) ->
    Acc.
%% Drop a derivational ending "ость" / "ост" from Word.
%%
%% Reference pattern this approximates:
%%   [^аеиоуыэюя][аеиоуыэюя]+[^аеиоуыэюя]+[аеиоуыэюя].*(?<=о)сть?$
%% i.e. two syllables (consonant + vowel) followed by "ость" or "ост".
%% The ending is removed only when min_slog_count/2 reports two units for
%% the whole Word; otherwise Word is returned untouched.
drop_derivitional(Word) ->
    %% ["ость", "ост"]
    Candidates = [[1086,1089,1090,1100], [1086,1089,1090]],
    case ends_with(Word, Candidates) of
        false ->
            Word;
        Ending ->
            %% make sure there are at least two syllables
            case min_slog_count(Word, 2) of
                2 -> lists:sublist(Word, 1, length(Word) - length(Ending));
                _TooFew -> Word
            end
    end.
%% Try two groups of endings in order.
%% FullGroup is tried first through drop_endings/2; if one of its endings
%% is removed that result is returned. Otherwise PartGroup is tried through
%% drop_endings_lengthed/3, forwarding Length unchanged.
%% Returns `{true, NewWord}' or `{false, Word}'.
drop_groups(Word, FullGroup, PartGroup, Length) ->
    case drop_endings(Word, FullGroup) of
        {true, _Stripped} = Hit ->
            Hit;
        {false, _} ->
            drop_endings_lengthed(Word, PartGroup, Length)
    end.
%% Step 1 of the stemmer.
%%
%% First tries to remove a perfective gerund ending; reference pattern:
%%   /((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$/
%% If one was removed, that is the result. Otherwise the other ending
%% classes are tried via drop_non_gerund/1.
drop_perfective(Word) ->
    %% ["ив", "ивши", "ившись", "ыв", "ывши", "ывшись"]
    GerundEndings = [
        [1080,1074], [1080,1074,1096,1080], [1080,1074,1096,1080,1089,1100],
        [1099,1074], [1099,1074,1096,1080], [1099,1074,1096,1080,1089,1100]
    ],
    %% ["ав", "авши", "авшись", "яв", "явши", "явшись"]
    %% NOTE(review): the original note on this group reads 'without "[ая]"';
    %% presumably the leading а/я is meant to stay in the word - confirm
    %% against how drop_endings_lengthed/3 uses the trailing 1.
    GerundEndingsAfterAYa = [
        [1072,1074], [1072,1074,1096,1080], [1072,1074,1096,1080,1089,1100],
        [1103,1074], [1103,1074,1096,1080], [1103,1074,1096,1080,1089,1100]
    ],
    case drop_groups(Word, GerundEndings, GerundEndingsAfterAYa, 1) of
        {true, Stripped} -> Stripped;
        {false, Unchanged} -> drop_non_gerund(Unchanged)
    end.

%% Remove a reflexive ending if present, then either an adjectival ending
%% (followed by an optional participle ending), or a verb ending, or -
%% failing both - a noun ending.
drop_non_gerund(Word) ->
    {_, NoReflexive} = drop_reflective(Word),
    case drop_adjectibe(NoReflexive) of
        {true, NoAdjective} ->
            {_, NoParticiple} = drop_participle(NoAdjective),
            NoParticiple;
        {false, SameWord} ->
            case drop_verb(SameWord) of
                {true, NoVerb} ->
                    NoVerb;
                {false, StillSame} ->
                    {_, NoNoun} = drop_noun(StillSame),
                    NoNoun
            end
    end.
%% Remove a noun ending from Word via drop_endings/2.
%% The table is scanned in order and the first ending that matches wins,
%% so longer endings are listed before the shorter ones they contain.
%% Returns `{true, NewWord}' or `{false, Word}'.
drop_noun(Word) ->
    NounEndings = [
        [1072], [1077,1074], [1086,1074], [1080,1077], [1100,1077], [1077],   % а ев ов ие ье е
        [1080,1103,1084,1080], [1103,1084,1080], [1072,1084,1080],            % иями ями ами
        [1077,1080], [1080,1080], [1080],                                     % еи ии и
        [1080,1077,1081], [1077,1081], [1086,1081], [1080,1081], [1081],      % ией ей ой ий й
        [1080,1103,1084], [1103,1084], [1080,1077,1084], [1077,1084],         % иям ям ием ем
        [1072,1084], [1086,1084], [1086], [1091],                             % ам ом о у
        [1072,1093], [1080,1103,1093], [1103,1093],                           % ах иях ях
        [1099], [1100],                                                       % ы ь
        [1080,1102], [1100,1102], [1102],                                     % ию ью ю
        [1080,1103], [1100,1103], [1103]                                      % ия ья я
    ],
    drop_endings(Word, NounEndings).
%% Remove a reflexive ending ("ся" or "сь") from Word via drop_endings/2.
%% Returns `{true, NewWord}' or `{false, Word}'.
drop_reflective(Word) ->
    ReflexiveEndings = [
        [1089,1103],   % ся
        [1089,1100]    % сь
    ],
    drop_endings(Word, ReflexiveEndings).
%% Remove an adjectival ending from Word via drop_endings/2.
%% The table is scanned in order; the first matching ending is removed.
%% Returns `{true, NewWord}' or `{false, Word}'.
drop_adjectibe(Word) ->
    AdjectiveEndings = [
        [1077,1077], [1080,1077], [1099,1077], [1086,1077],   % ее ие ые ое
        [1080,1084,1080], [1099,1084,1080],                   % ими ыми
        [1077,1081], [1080,1081], [1099,1081], [1086,1081],   % ей ий ый ой
        [1077,1084], [1080,1084], [1099,1084], [1086,1084],   % ем им ым ом
        [1077,1075,1086], [1086,1075,1086],                   % его ого
        [1077,1084,1091], [1086,1084,1091],                   % ему ому
        [1080,1093], [1099,1093],                             % их ых
        [1091,1102], [1102,1102],                             % ую юю
        [1072,1103], [1103,1103],                             % ая яя
        [1086,1102], [1077,1102]                              % ою ею
    ],
    drop_endings(Word, AdjectiveEndings).
%% Remove a participle ending from Word via drop_groups/4.
%% The first table is tried through drop_endings/2; if nothing matches, the
%% second table (endings spelled with their leading а/я) is tried with the
%% Length argument 1 forwarded to drop_endings_lengthed/3.
%% Returns `{true, NewWord}' or `{false, Word}'.
drop_participle(Word) ->
    PlainEndings = [
        [1080,1074,1096], [1099,1074,1096], [1091,1102,1097]   % ивш ывш ующ
    ],
    EndingsAfterAYa = [
        %% аем анн авш ающ ащ
        [1072,1077,1084], [1072,1085,1085], [1072,1074,1096], [1072,1102,1097], [1072,1097],
        %% яем янн явш яющ ящ
        [1103,1077,1084], [1103,1085,1085], [1103,1074,1096], [1103,1102,1097], [1103,1097]
    ],
    drop_groups(Word, PlainEndings, EndingsAfterAYa, 1).
%% Remove a verb ending from Word via drop_groups/4.
%% The first table is tried through drop_endings/2; if nothing matches, the
%% second table (endings spelled with their leading а/я) is tried with the
%% Length argument 1 forwarded to drop_endings_lengthed/3.
%% Returns `{true, NewWord}' or `{false, Word}'.
drop_verb(Word) ->
    PlainEndings = [
        [1080,1083,1072], [1099,1083,1072], [1077,1085,1072],             % ила ыла ена
        [1077,1081,1090,1077], [1091,1081,1090,1077], [1080,1090,1077],   % ейте уйте ите
        [1080,1083,1080], [1099,1083,1080],                               % или ыли
        [1077,1081], [1091,1081],                                         % ей уй
        [1080,1083], [1099,1083],                                         % ил ыл
        [1080,1084], [1099,1084], [1077,1085],                            % им ым ен
        [1080,1083,1086], [1099,1083,1086], [1077,1085,1086],             % ило ыло ено
        [1103,1090], [1091,1077,1090], [1091,1102,1090],                  % ят ует уют
        [1080,1090], [1099,1090], [1077,1085,1099],                       % ит ыт ены
        [1080,1090,1100], [1099,1090,1100], [1080,1096,1100],             % ить ыть ишь
        [1091,1102], [1102]                                               % ую ю
    ],
    EndingsAfterAYa = [
        [1072,1083,1072], [1072,1085,1072],                 % ала ана
        [1072,1077,1090,1077], [1072,1081,1090,1077],       % аете айте
        [1072,1083,1080], [1072,1081], [1072,1083],         % али ай ал
        [1072,1077,1084], [1072,1085],                      % аем ан
        [1072,1083,1086], [1072,1085,1086],                 % ало ано
        [1072,1077,1090], [1072,1102,1090],                 % ает ают
        [1072,1085,1099], [1072,1090,1100],                 % аны ать
        [1072,1077,1096,1100], [1072,1085,1085,1086],       % аешь анно
        [1103,1083,1072], [1103,1085,1072],                 % яла яна
        [1103,1077,1090,1077], [1103,1081,1090,1077],       % яете яйте
        [1103,1083,1080], [1103,1081], [1103,1083],         % яли яй ял
        [1103,1077,1084], [1103,1085],                      % яем ян
        [1103,1083,1086], [1103,1085,1086],                 % яло яно
        [1103,1077,1090], [1103,1102,1090],                 % яет яют
        [1103,1085,1099], [1103,1090,1100],                 % яны ять
        [1103,1077,1096,1100], [1103,1085,1085,1086]        % яешь янно
    ],
    drop_groups(Word, PlainEndings, EndingsAfterAYa, 1).
%% Remove the first ending from Endings that Word ends with.
%% Shorthand for drop_endings_lengthed/3 with a Length of 0.
%% Returns `{true, NewWord}' or `{false, Word}'.
drop_endings(Word, Endings) -> drop_endings_lengthed(Word, Endings, 0).
%% Remove the first ending from Endings that Word ends with, but keep the
%% first Length characters of that ending in the word.
%%
%% Word    - list of code points.
%% Endings - list of candidate endings, tried in order (see ends_with/2).
%% Length  - how many leading characters of the matched ending stay in the
%%           word. 0 removes the whole ending. 1 is used for the groups
%%           whose endings are spelled with a leading а/я that only marks
%%           the required context and must not be removed.
%% Returns `{true, NewWord}' when an ending matched, `{false, Word}'
%% otherwise.
%%
%% The kept prefix is ADDED to the retained length. Subtracting it would
%% strip the character before the ending as well and ask lists:sublist for
%% a negative length whenever Word consists of the ending alone.
drop_endings_lengthed(Word, Endings, Length) ->
    case ends_with(Word, Endings) of
        false ->
            {false, Word};
        Suffix ->
            Keep = length(Word) - length(Suffix) + Length,
            {true, lists:sublist(Word, 1, Keep)}
    end.
%% Return the first element of the candidate list that is a suffix of Word,
%% or `false' when none is. Candidates longer than Word are skipped without
%% being compared.
ends_with(_Word, []) ->
    false;
ends_with(Word, [Candidate | Remaining]) when length(Word) >= length(Candidate) ->
    case lists:suffix(Candidate, Word) of
        true -> Candidate;
        false -> ends_with(Word, Remaining)
    end;
ends_with(Word, [_TooLong | Remaining]) ->
    ends_with(Word, Remaining).
%% Split Word at the first character for which Fun returns true.
%% Starts stemmer_split_inner/3 with an empty accumulator and returns its
%% `{Before, After}' pair.
stemmer_split(Word, Fun) -> stemmer_split_inner(Word, Fun, []).
%% Walk the list until Fun returns true for a character.
%%
%% Acc holds the characters already passed, most recent first.
%% Returns `{Before, After}': Before is everything up to and including the
%% first character accepted by Fun, After is the rest. When no character is
%% accepted, Before is the whole input and After is [].
%%
%% Both exits reverse the accumulator, so Before is always in input order.
stemmer_split_inner([Char | Rest], Fun, Acc) ->
    case Fun(Char) of
        true ->
            {lists:reverse([Char | Acc]), Rest};
        _NotAccepted ->
            stemmer_split_inner(Rest, Fun, [Char | Acc])
    end;
stemmer_split_inner([], _Fun, Acc) ->
    {lists:reverse(Acc), []}.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment