Created
October 26, 2011 08:49
-
-
Save Techmind/1315813 to your computer and use it in GitHub Desktop.
Riak Search Russian-language stemmer and tokenizer (analyzer factory for the `ru_stem` Erlang module)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%% @doc Russian-language analyzer for Riak Search: tokenizes input text
%% roughly like Lucene's StandardAnalyzerFactory and stems Russian words,
%% emitting `skip' placeholders for Russian stopwords.
-module(ru_stem).

-export([
    standard_analyzer_factory/2
]).

%% Character-class macros used in tokenizer guards. The numeric ranges
%% cover the Cyrillic letters: 1040-1071 is А-Я, 1072-1103 is а-я.
-define(UPPERCHAR(C), (C >= $A andalso C =< $Z) orelse (C >= 1040 andalso C =< 1071)).
-define(LOWERCHAR(C), (C >= $a andalso C =< $z) orelse (C >= 1072 andalso C =< 1103)).
-define(NUMBER(C), (C >= $0 andalso C =< $9)).
%% Uses =:= (exact equality) rather than ==; behavior is identical for
%% character integers, but =:= is the idiomatic comparison here.
%% NOTE(review): this macro is not referenced by the visible code.
-define(WHITESPACE(C), ((C =:= $\s) orelse (C =:= $\n) orelse (C =:= $\t) orelse (C =:= $\f) orelse (C =:= $\r) orelse (C =:= $\v))).
%% @doc Tokenize incoming text using roughly the same rules as the
%% StandardAnalyzerFactory in Lucene/Java.
%% The second argument is either a one-element list holding the minimum
%% term length — as an integer or a string such as "3" — or anything
%% else, in which case the minimum length defaults to 3.
standard_analyzer_factory(Text, [MinLength]) when is_integer(MinLength) ->
    %% Generalization: accept an already-parsed integer, which the
    %% original list_to_integer/1 call would have crashed on.
    {ok, standard(unicode:characters_to_list(Text), MinLength, [], [])};
standard_analyzer_factory(Text, [MinLengthArg]) ->
    MinLength = list_to_integer(MinLengthArg),
    {ok, standard(unicode:characters_to_list(Text), MinLength, [], [])};
standard_analyzer_factory(Text, _Other) ->
    %% No usable option given: default minimum term length of 3.
    {ok, standard(unicode:characters_to_list(Text), 3, [], [])}.
%% Tokenizer main loop: walks the character list, accumulating the
%% current token (reversed) in TokenAcc and completed terms in Terms.
standard([Ch | Rest], MinLength, TokenAcc, Terms) when ?UPPERCHAR(Ch) ->
    %% Fold upper case to lower case; both the ASCII and the Cyrillic
    %% alphabets are offset by 32 between cases.
    standard(Rest, MinLength, [Ch + 32 | TokenAcc], Terms);
standard([Ch | Rest], MinLength, TokenAcc, Terms) when ?LOWERCHAR(Ch) orelse ?NUMBER(Ch) ->
    standard(Rest, MinLength, [Ch | TokenAcc], Terms);
standard([$., Ch | Rest], MinLength, TokenAcc, Terms) when ?UPPERCHAR(Ch) ->
    %% A dot immediately followed by a letter stays inside the token
    %% (e.g. host names, acronyms), mirroring Lucene's tokenizer.
    standard(Rest, MinLength, [Ch + 32, $. | TokenAcc], Terms);
standard([$., Ch | Rest], MinLength, TokenAcc, Terms) when ?LOWERCHAR(Ch) orelse ?NUMBER(Ch) ->
    standard(Rest, MinLength, [Ch, $. | TokenAcc], Terms);
standard([_Separator | Rest], MinLength, TokenAcc, Terms) ->
    %% Any other character terminates the current token.
    standard_termify(Rest, MinLength, TokenAcc, Terms);
standard([], MinLength, TokenAcc, Terms) ->
    standard_termify([], MinLength, TokenAcc, Terms).
%% Determine if the just-finished token (held reversed in the
%% accumulator) is valid; if so, add its stemmed form to the result
%% list. Stopwords contribute a `skip' placeholder so term positions
%% stay Lucene-compatible; too-short tokens are dropped silently.
standard_termify([], _MinLength, [], Terms) ->
    %% Input exhausted with no pending token: we are done.
    lists:reverse(Terms);
standard_termify(Rest, MinLength, [], Terms) ->
    %% Empty token (consecutive separators): keep scanning.
    standard(Rest, MinLength, [], Terms);
standard_termify(Rest, MinLength, TokenAcc, Terms) when length(TokenAcc) < MinLength ->
    %% Mimic org.apache.lucene.analysis.LengthFilter, which does not
    %% increment the position index for too-short tokens.
    standard(Rest, MinLength, [], Terms);
standard_termify(Rest, MinLength, TokenAcc, Terms) ->
    Term = lists:reverse(TokenAcc),
    NewTerms =
        case is_russian_stopword(Term) of
            true ->
                %% Stopword: keep a position placeholder.
                [skip | Terms];
            false ->
                [unicode:characters_to_binary(ru_stem(Term)) | Terms]
        end,
    standard(Rest, MinLength, [], NewTerms).
%% @doc True when Term (a list of Unicode codepoints) is a Russian
%% stopword. Terms are bucketed by length so only one short list is
%% scanned per lookup.
%%
%% Fix: the original used ordsets:is_element/2, but these literal lists
%% are NOT sorted, so the ordered-set early-exit could stop before the
%% member was reached (e.g. "у" in the length-1 list was never found
%% because "я" sorts after it and appears earlier). lists:member/2
%% performs a plain linear scan, which is what this data needs.
is_russian_stopword([_] = Term) ->
    lists:member(Term, ["а","я","у","с","о","к","и","в"]);
is_russian_stopword([_, _] = Term) ->
    lists:member(Term, ["мы","ты","вы","да","до","ее","ей","ею","же","за","из","им","их","ко","ли","во","на","не","ни","но","ну","об","он","от","по","бы","со","те","то"]);
is_russian_stopword([_, _, _] = Term) ->
    lists:member(Term, ["без","чья","еще","вас","вам","наш","где","нет","нее","них","эти","для","был","или","чье","вот","она","они","оно","его","что","как","под","при","все","эта","так","чем","чей","там","кто","тем","это","той","том","мне","уже"]);
is_russian_stopword([_, _, _, _] = Term) ->
    lists:member(Term, ["есть","него","было","весь","либо","всех","даже","того","тоже","была","чего","быть","если","хотя","были","надо"]);
is_russian_stopword([_, _, _, _, _] = Term) ->
    lists:member(Term, ["более","такой","также","очень","чтобы","всего","может","когда","здесь"]);
is_russian_stopword([_, _, _, _, _, _] = Term) ->
    lists:member(Term, ["только","однако"]);
is_russian_stopword(_Term) ->
    false.
%% English stopword check, bucketed by term length. These literal lists
%% are sorted, so ordsets:is_element/2 is a valid membership test here.
%% (Not referenced by the exported analyzer; retained as-is.)
is_stopword([_, _] = Term) ->
    ordsets:is_element(Term, ["an", "as", "at", "be", "by", "if", "in", "is", "it", "no", "of", "on", "or", "to"]);
is_stopword([_, _, _] = Term) ->
    ordsets:is_element(Term, ["and", "are", "but", "for", "not", "the", "was"]);
is_stopword([_, _, _, _] = Term) ->
    ordsets:is_element(Term, ["into", "such", "that", "then", "they", "this", "will"]);
is_stopword([_, _, _, _, _] = Term) ->
    ordsets:is_element(Term, ["their", "there", "these"]);
is_stopword(_Term) ->
    false.
%% True when the codepoint is one of the Russian vowels "аеиоуыэюя".
%% Latin vowels deliberately return false — the stemmer is
%% Russian-only.
is_vowel(Codepoint) ->
    Vowels = [1072, 1077, 1080, 1086, 1091, 1099, 1101, 1102, 1103],
    lists:member(Codepoint, Vowels).
%% Stem a Russian word (list of codepoints). The word is split at its
%% first vowel; the prefix up to and including that vowel is kept
%% verbatim and the ending-stripping steps run on the remainder. Words
%% with no vowel are returned unchanged.
ru_stem(Word) ->
    {Head, Tail} = stemmer_split(Word, fun is_vowel/1),
    case Tail of
        [] ->
            %% No vowel: nothing to stem.
            Word;
        _ ->
            %% Step 1: perfective-gerund / reflexive / adjective /
            %% verb / noun endings.
            Step1 = drop_perfective(Tail),
            %% Step 2: trailing "и" ([1080]).
            {_, Step2} = drop_endings(Step1, [[1080]]),
            %% Step 3: derivational suffix ("ость"/"ост").
            Step3 = drop_derivitional(Step2),
            %% Step 4: trailing soft sign "ь" ([1100]).
            {SoftSignDropped, Step4} = drop_endings(Step3, [[1100]]),
            Final =
                case SoftSignDropped of
                    true ->
                        Step4;
                    false ->
                        %% Remove superlative "ейше?" anywhere, then
                        %% collapse a trailing double "нн" to "н".
                        Cleaned = drop_anywhere(Step4, <<208,181,208,185,209,136,208,181,63>>),
                        case drop_endings(Cleaned, [[1085,1085]]) of
                            {true, Stripped} ->
                                lists:append(Stripped, [1085]);
                            {false, _} ->
                                Cleaned
                        end
                end,
            lists:append(Head, Final)
    end.
%% Count vowel-followed-by-consonant ("slog"/syllable) groups in Word,
%% stopping as soon as MinCount groups have been seen.
min_slog_count(Word, MinCount) ->
    min_slog_count(Word, MinCount, 0).
%% @doc Remove the first match of Regexp (a UTF-8 encoded regex, here
%% "ейше?") from Word (a list of codepoints); return Word unchanged
%% when the pattern does not match.
%%
%% Fix: the original computed `length(BinList) == length(IoList)` in an
%% `if` guard where both operands were a binary / an re:replace iolist;
%% length/1 on a binary raises, a raising guard is treated as false, so
%% the function ALWAYS fell through to the `true' branch and returned
%% Word unchanged — the replacement never took effect. We now request a
%% binary result, compile the pattern with the `unicode' option (so the
%% `?' quantifier applies to the whole character "е" rather than to the
%% last byte of its UTF-8 encoding), and convert the result back to a
%% codepoint list.
drop_anywhere(Word, Regexp) ->
    Subject = unicode:characters_to_binary(Word),
    Replaced = re:replace(Subject, Regexp, <<>>, [unicode, {return, binary}]),
    case unicode:characters_to_list(Replaced) of
        Result when is_list(Result) ->
            Result;
        _ ->
            %% Defensive: keep the original word if the replacement
            %% somehow produced invalid UTF-8.
            Word
    end.
%% Scan adjacent character pairs; a vowel immediately followed by a
%% consonant counts as one syllable group. Stops early once Acc has
%% reached MinCount.
min_slog_count([_, _ | _], MinCount, Acc) when Acc >= MinCount ->
    Acc;
min_slog_count([A, B | Rest], MinCount, Acc) ->
    case {is_vowel(A), is_vowel(B)} of
        {true, false} ->
            %% Vowel + consonant: one group; continue after the pair.
            min_slog_count(Rest, MinCount, Acc + 1);
        {_, true} ->
            %% B is a vowel: re-examine it as the start of a new pair.
            min_slog_count([B | Rest], MinCount, Acc);
        {false, false} ->
            %% NOTE(review): this skips B entirely rather than pairing
            %% it with the next character — presumably intentional, but
            %% worth confirming against the reference stemmer.
            min_slog_count(Rest, MinCount, Acc)
    end;
min_slog_count([_Last], _MinCount, Acc) ->
    Acc;
min_slog_count([], _MinCount, Acc) ->
    Acc.
%% Step 3 of the stemmer: drop the derivational suffix "ость"/"ост",
%% but only when the word contains at least two (vowel, consonant)
%% syllable groups. Original reference regex: word ends with
%% [^аеиоуыэюя][аеиоуыэюя]+[^аеиоуыэюя]+[аеиоуыэюя].*(?<=о)сть?$
drop_derivitional(Word) ->
    %% [[1086,1089,1090,1100],[1086,1089,1090]] == ["ость", "ост"]
    case ends_with(Word, [[1086,1089,1090,1100], [1086,1089,1090]]) of
        false ->
            Word;
        Suffix ->
            %% Require two syllable groups before stripping. Note the
            %% count runs over the whole word, suffix included.
            case min_slog_count(Word, 2) of
                2 ->
                    lists:sublist(Word, 1, length(Word) - length(Suffix));
                _ ->
                    Word
            end
    end.
%% Try the FullGroup endings first; if none matches, fall back to
%% PartGroup via drop_endings_lengthed/3, which removes Length extra
%% characters beyond the matched suffix.
%% NOTE(review): the PHP reference uses a lookbehind (?<=[ая]) that
%% keeps the preceding vowel, whereas here the PartGroup suffixes
%% include that vowel AND Length more characters are stripped — confirm
%% this asymmetry is intended.
drop_groups(Word, FullGroup, PartGroup, Length) ->
    case drop_endings(Word, FullGroup) of
        {true, Stripped} ->
            {true, Stripped};
        {false, _} ->
            drop_endings_lengthed(Word, PartGroup, Length)
    end.
%% Step 1 of the stemmer. Despite the name, this performs the whole
%% first step: try perfective-gerund endings; otherwise strip a
%% reflexive ending, then either adjective (+ participle) endings or
%% verb endings, falling back to noun endings.
%% PHP reference: $PERFECTIVEGROUND =
%%   '/((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$/'
drop_perfective(Word) ->
    %% ["ив", "ивши", "ившись", "ыв", "ывши", "ывшись"]
    FullEndings = [[1080,1074],
                   [1080,1074,1096,1080],
                   [1080,1074,1096,1080,1089,1100],
                   [1099,1074],
                   [1099,1074,1096,1080],
                   [1099,1074,1096,1080,1089,1100]],
    %% ["ав", "авши", "авшись", "яв", "явши", "явшись"]
    PrefixedEndings = [[1072,1074],
                       [1072,1074,1096,1080],
                       [1072,1074,1096,1080,1089,1100],
                       [1103,1074],
                       [1103,1074,1096,1080],
                       [1103,1074,1096,1080,1089,1100]],
    case drop_groups(Word, FullEndings, PrefixedEndings, 1) of
        {true, Stemmed} ->
            %% A perfective-gerund ending was removed: step 1 is done.
            Stemmed;
        {false, Word2} ->
            {_, Word3} = drop_reflective(Word2),
            case drop_adjectibe(Word3) of
                {true, Word4} ->
                    {_, Word5} = drop_participle(Word4),
                    Word5;
                {false, Word4} ->
                    case drop_verb(Word4) of
                        {true, Word5} ->
                            Word5;
                        {false, Word5} ->
                            {_, Word6} = drop_noun(Word5),
                            Word6
                    end
            end
    end.
%% Noun endings, in the original priority order (ends_with/2 returns
%% the FIRST match, so order matters):
%% ["а","ев","ов","ие","ье","е","иями","ями","ами","еи","ии","и","ией",
%%  "ей","ой","ий","й","иям","ям","ием","ем","ам","ом","о","у","ах",
%%  "иях","ях","ы","ь","ию","ью","ю","ия","ья","я"]
drop_noun(Word) ->
    Endings = [[1072], [1077,1074], [1086,1074], [1080,1077], [1100,1077],
               [1077], [1080,1103,1084,1080], [1103,1084,1080],
               [1072,1084,1080], [1077,1080], [1080,1080], [1080],
               [1080,1077,1081], [1077,1081], [1086,1081], [1080,1081],
               [1081], [1080,1103,1084], [1103,1084], [1080,1077,1084],
               [1077,1084], [1072,1084], [1086,1084], [1086], [1091],
               [1072,1093], [1080,1103,1093], [1103,1093], [1099], [1100],
               [1080,1102], [1100,1102], [1102], [1080,1103], [1100,1103],
               [1103]],
    drop_endings(Word, Endings).
%% Reflexive endings ["ся", "сь"].
drop_reflective(Word) ->
    drop_endings(Word, [[1089,1103], [1089,1100]]).
%% Adjective endings (function name kept as-is, typo included, since
%% callers reference it). Original priority order preserved:
%% ["ее","ие","ые","ое","ими","ыми","ей","ий","ый","ой","ем","им","ым",
%%  "ом","его","ого","ему","ому","их","ых","ую","юю","ая","яя","ою","ею"]
drop_adjectibe(Word) ->
    Endings = [[1077,1077], [1080,1077], [1099,1077], [1086,1077],
               [1080,1084,1080], [1099,1084,1080], [1077,1081],
               [1080,1081], [1099,1081], [1086,1081], [1077,1084],
               [1080,1084], [1099,1084], [1086,1084], [1077,1075,1086],
               [1086,1075,1086], [1077,1084,1091], [1086,1084,1091],
               [1080,1093], [1099,1093], [1091,1102], [1102,1102],
               [1072,1103], [1103,1103], [1086,1102], [1077,1102]],
    drop_endings(Word, Endings).
%% Participle endings. The direct group ["ивш", "ывш", "ующ"] is
%% stripped as-is; the second group ["аем","анн","авш","ающ","ащ",
%% "яем","янн","явш","яющ","ящ"] is stripped together with one extra
%% preceding character (Length = 1 in drop_groups/4).
drop_participle(Word) ->
    DirectEndings = [[1080,1074,1096], [1099,1074,1096], [1091,1102,1097]],
    PrefixedEndings =
        [[1072,1077,1084],
         [1072,1085,1085],
         [1072,1074,1096],
         [1072,1102,1097],
         [1072,1097],
         [1103,1077,1084],
         [1103,1085,1085],
         [1103,1074,1096],
         [1103,1102,1097],
         [1103,1097]],
    drop_groups(Word, DirectEndings, PrefixedEndings, 1).
%% Verb endings. Direct group (stripped as-is):
%% ["ила","ыла","ена","ейте","уйте","ите","или","ыли","ей","уй","ил",
%%  "ыл","им","ым","ен","ило","ыло","ено","ят","ует","уют","ит","ыт",
%%  "ены","ить","ыть","ишь","ую","ю"]
%% Prefixed group (stripped with one extra preceding character):
%% ["ала","ана","аете","айте","али","ай","ал","аем","ан","ало","ано",
%%  "ает","ают","аны","ать","аешь","анно","яла","яна","яете","яйте",
%%  "яли","яй","ял","яем","ян","яло","яно","яет","яют","яны","ять",
%%  "яешь","янно"]
%% Order is preserved from the original — ends_with/2 takes the first
%% match.
drop_verb(Word) ->
    DirectEndings =
        [[1080,1083,1072],
         [1099,1083,1072],
         [1077,1085,1072],
         [1077,1081,1090,1077],
         [1091,1081,1090,1077],
         [1080,1090,1077],
         [1080,1083,1080],
         [1099,1083,1080],
         [1077,1081],
         [1091,1081],
         [1080,1083],
         [1099,1083],
         [1080,1084],
         [1099,1084],
         [1077,1085],
         [1080,1083,1086],
         [1099,1083,1086],
         [1077,1085,1086],
         [1103,1090],
         [1091,1077,1090],
         [1091,1102,1090],
         [1080,1090],
         [1099,1090],
         [1077,1085,1099],
         [1080,1090,1100],
         [1099,1090,1100],
         [1080,1096,1100],
         [1091,1102],
         [1102]],
    PrefixedEndings =
        [[1072,1083,1072],
         [1072,1085,1072],
         [1072,1077,1090,1077],
         [1072,1081,1090,1077],
         [1072,1083,1080],
         [1072,1081],
         [1072,1083],
         [1072,1077,1084],
         [1072,1085],
         [1072,1083,1086],
         [1072,1085,1086],
         [1072,1077,1090],
         [1072,1102,1090],
         [1072,1085,1099],
         [1072,1090,1100],
         [1072,1077,1096,1100],
         [1072,1085,1085,1086],
         [1103,1083,1072],
         [1103,1085,1072],
         [1103,1077,1090,1077],
         [1103,1081,1090,1077],
         [1103,1083,1080],
         [1103,1081],
         [1103,1083],
         [1103,1077,1084],
         [1103,1085],
         [1103,1083,1086],
         [1103,1085,1086],
         [1103,1077,1090],
         [1103,1102,1090],
         [1103,1085,1099],
         [1103,1090,1100],
         [1103,1077,1096,1100],
         [1103,1085,1085,1086]],
    drop_groups(Word, DirectEndings, PrefixedEndings, 1).
%% Drop the first matching ending from Word, removing no extra
%% characters beyond the suffix itself.
drop_endings(Word, Endings) ->
    drop_endings_lengthed(Word, Endings, 0).
%% Drop the first ending (from Endings) that Word ends with, plus
%% Length additional trailing characters. Returns {true, Stripped} on a
%% match, {false, Word} otherwise.
drop_endings_lengthed(Word, Endings, Length) ->
    case ends_with(Word, Endings) of
        false ->
            {false, Word};
        Suffix ->
            Keep = length(Word) - length(Suffix) - Length,
            {true, lists:sublist(Word, 1, Keep)}
    end.
%% Return the first element of the candidate list that Word ends with,
%% or false when none matches. List order therefore acts as priority.
ends_with(_Word, []) ->
    false;
ends_with(Word, [Suffix | Rest]) when length(Word) >= length(Suffix) ->
    case lists:suffix(Suffix, Word) of
        true ->
            Suffix;
        false ->
            ends_with(Word, Rest)
    end;
ends_with(Word, [_TooLong | Rest]) ->
    %% Candidate longer than the word itself: cannot match.
    ends_with(Word, Rest).
%% Split Word at the first character for which Fun returns true:
%% yields {Prefix, Rest} where Prefix ends with that character. When no
%% character satisfies Fun, returns {ReversedWord, []} — note the first
%% element is NOT re-reversed in that case; the only caller in this
%% module (ru_stem/1) returns the word unchanged when the second
%% element is empty, so the quirk is harmless there.
stemmer_split(Word, Fun) ->
    stemmer_split_inner(Word, Fun, []).

stemmer_split_inner([Char | Rest], Fun, Acc) ->
    case Fun(Char) of
        true ->
            {lists:reverse([Char | Acc]), Rest};
        _ ->
            stemmer_split_inner(Rest, Fun, [Char | Acc])
    end;
stemmer_split_inner([], _Fun, Acc) ->
    %% Exhausted without a match: Acc holds the whole word reversed.
    {Acc, []}.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment