Skip to content

Instantly share code, notes, and snippets.

@h8nor
Last active May 26, 2020 18:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save h8nor/21b8cc8da032701cd664526a6f9c02de to your computer and use it in GitHub Desktop.
Save h8nor/21b8cc8da032701cd664526a6f9c02de to your computer and use it in GitHub Desktop.
Porter Stemmer RUS in VISUAL BASIC 6
' It follow the algorithm "stem_Unicode.sbl" definition presented in:
' Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, no. 3,
' pp 130-137, (http://snowball.tartarus.org/algorithms/russian/stemmer.html)
' TO USE THE PROGRAM CALL THE FUNCTION PORTERSTEMMERRU.
' THE WORD TO BE STEMMED SHOULD BE PASSED AS THE ARGUEMENT.
' THE STRING RETURNED BY THE FUNCTION IS THE STEMMED WORD.
Option Explicit
Option Base 0
'12345678901234567890123456789012345bopoh13@ya67890123456789012345678901234567890
Public Function PorterStemmerRU(ByVal word As String) As String ' Test failed
' Переписано с http://snowball.tartarus.org/algorithms/russian/stemmer.html
'2 Причастие совершенного вида
Const PERFECTiveGERUNDs As String = "[иы]в [иы]вши [иы]вшись"
'1 Возвратное
Const REFLEXive As String = "сь ся"
'2 Причастие
Const PARTICIPLEs As String = "[иы]вш ующ"
'1 Прилагательное
Const ADJECTive As String = "[ео]го [иы]ми [ео]му " _
& "[еиоы]е [еиоы]й [еиоы]м [еоую]ю [иы]х [ая]я"
'2 Глагол
Const VERBs As String = "ен ена ено ены ишь ую ует уют ите [еу]йте " _
& "[иы]ла [иы]ли [иы]ло [иы]ть [еу]й [иы]л [иы]м [иыя]т ю"
'1 Имя существительное
Const NOUN As String = "ией ием иям иях иями [ая]ми [ео]в [иь]е [еи]и " _
& "[еио]й [аеоя]м [ая]х [иь]ю [иь]я а е и й о у ь ы ю я"
'1 Прилагательное привосходной степени
Const SUPERLATive As String = "ейш ейше"
'1 Словообразующее окончание в r2
Const DERIVATIONAL As String = "ост ость"
Dim rV As Byte, r2 As Byte ' r1 As Byte
If Len(word) > 0 Then word = Replace(LCase(word), "*", "") Else Exit Function
' rV - начало области слова после первой гласной (Если гласных нет = 0)
' r1 - начало области слова "Гласная-Согласная" с начала слова
r2 = FindRegions(rV, word) ' r2 - начало области "Гласная-Согласная" после r1
' [Шаг 1] Если существует окончание PERFECTIVE GERUND – удалить и завершить
If Not RemoveEndings(word, Array(Replace(PERFECTiveGERUNDs, "[иы]", ""), _
PERFECTiveGERUNDs), rV) Then
' Если существует окончание REFLEXIVE – удалить
RemoveEndings word, REFLEXive, rV
' Удалить одно из окончаний и завершить: PARTICIPLE + ADJECTIVE, VERB, NOUN
If RemoveEndings(word, ADJECTive, rV) Then
RemoveEndings word, Array("ем нн вш ющ щ", PARTICIPLEs), rV
Else
If Not RemoveEndings(word, Array("ешь нно ем ли ны ть " _
& "ете йте ла на ло но ет ют й л н", VERBs), rV) Then _
RemoveEndings word, NOUN, rV ' не УЮТ и не МЛЕЮТ, но БЕСЕДуют
End If
End If
' [Шаг 2] Если слово окончивается на "и" - удалить
RemoveEndings word, "и", rV
' [ШАГ 3] Если существует окончание DERIVATIONAL в r2 - удалить
RemoveEndings word, DERIVATIONAL, r2
' [ШАГ 4] Удалить одно из окончаний слова: (Н)Н + SUPERLATIVE, (Н)Н, Ь
RemoveEndings word, SUPERLATive, rV
If RemoveEndings(word, "нн", rV) Then word = word & "н"
RemoveEndings word, "ь", rV
PorterStemmerRU = word
End Function
Private Function RemoveEndings(ByRef word As String, ByVal regex As Variant, _
ByVal region As Byte) As Boolean ' Удалить окончание (самое длинное)
Dim rAff As Byte, prefix As String, regMatch As Variant
prefix = Mid(word, 1, IIf(region, region, 1) - 1) ' prefix <- region
word = Mid(word, Len(prefix) + 1)
If IsArray(regex) Then
For Each regMatch In Split(regex(0))
If word Like "*[ая]" & regMatch Then ' Если найден аффикс
word = Left(word, Len(word) - Len(regMatch))
RemoveEndings = True: Exit For
End If
Next regMatch: regex = regex(1)
End If
If Not RemoveEndings Then
For Each regMatch In Split(regex)
rAff = InStr(regMatch, "]") + 1 ' rAff - начало области после [list]
On Error Resume Next
For region = 2 To rAff - 2
If rAff < 2 Then region = 1: rAff = 2 ' Если нет [list]
If word Like "*" & Mid(regMatch, region, 1) & Mid(regMatch, rAff) Then
regMatch = Mid(regMatch, region, 1) & Mid(regMatch, rAff)
word = Left(word, Len(word) - Len(regMatch))
RemoveEndings = True: Exit For
End If: If region = 1 Then Exit For
Next region: If RemoveEndings Then Exit For
On Error GoTo 0
Next regMatch
End If: word = prefix & word
End Function
Private Function FindRegions(ByRef rV As Byte, ByVal word As String) As Byte
Dim prevChar As String, Char As String, state As Byte, i As Byte
If isVowel(Left(word, 1)) Then rV = 2: state = 1 ' После первой гласной
For i = 2 To Len(word)
prevChar = Mid(word, i - 1, 1): Char = Mid(word, i, 1)
Select Case state
Case 0: If isVowel(Char) Then rV = i + 1: state = 1
Case 1: If Not isVowel(Char) And isVowel(prevChar) Then state = 2
Case 2: If Not isVowel(Char) And isVowel(prevChar) Then _
FindRegions = i + 1: Exit For
End Select
Next i
End Function
Private Function isVowel(ByVal Char As String) As Boolean
Const VOWEL As String = "[аеёиоуыэюя]"
isVowel = InStr(Mid(VOWEL, 2, Len(VOWEL) - 1), Char)
End Function
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment