Skip to content

Instantly share code, notes, and snippets.

@jimregan
Created March 15, 2012 20:55
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save jimregan/2046870 to your computer and use it in GitHub Desktop.
Save jimregan/2046870 to your computer and use it in GitHub Desktop.
Irish stemmer
// Declarations for the Irish stemmer.

// Every routine defined in this file.
routines (
    initial_morph
    mark_regions
    R1 R2 RV
    noun_sfx
    deriv
    verb_sfx
)

// The one routine exported from this file.
externals ( stem )

// Region marks: assigned in mark_regions, tested by RV, R1 and R2.
integers ( pV p1 p2 )

groupings ( v )

// Brace-delimited names inside string literals expand to the stringdefs below.
stringescapes {}

/* Acute-accented vowels, given as Latin 1 character codes. */
stringdef a' hex 'E1'  // a-acute
stringdef e' hex 'E9'  // e-acute
stringdef i' hex 'ED'  // i-acute
stringdef o' hex 'F3'  // o-acute
stringdef u' hex 'FA'  // u-acute

// Vowels: the five plain letters plus their acute-accented forms.
define v 'aeiou{a'}{e'}{i'}{o'}{u'}'
// Set the region marks pV, p1 and p2 for the current word.
// A mark keeps its default value of limit when the scan that
// would set it fails.
define mark_regions as (

    // defaults
    $p2 = limit
    $p1 = limit
    $pV = limit

    // pV: position just past the first vowel
    do ( gopast v  setmark pV )

    do (
        // p1: just past the first non-vowel that follows a vowel
        gopast v  gopast non-v  setmark p1
        // p2: the same pattern once more, continuing from p1
        gopast v  gopast non-v  setmark p2
    )
)
// Undo word-initial mutations and contractions at the cursor position.
// The longest matching entry wins, so the listing order is irrelevant;
// entries are grouped by the action they share.
define initial_morph as (
    [substring] among (

        // Hyphenated prefixes and the apostrophe contractions of d, m and b
        // are removed outright.
        // nAthair -> n-athair, but alone are problematic
        'h-' 'n-' 't-'
        'd{'}' 'm{'}' 'b{'}'
            (delete)

        // Mutated initials, grouped by the consonant they are rewritten to.
        'mb' 'bh'
            (<- 'b')
        'gc' 'ch'
            (<- 'c')
        'nd' 'dh'
            (<- 'd')
        'bhf' 'fh' 'd{'}fh'
            (<- 'f')
        'ng' 'gh'
            (<- 'g')
        'mh'
            (<- 'm')
        'bp' 'ph'
            (<- 'p')
        'ts' 'sh'
            (<- 's')
        'dt' 'th'
            (<- 't')
    )
)
// Suffix routines. Everything in this section runs in backward mode,
// i.e. matching proceeds from the end of the word towards its start.
backwardmode (

    // Region tests: each succeeds when the cursor is at or beyond its mark.
    define R2 as $p2 <= cursor
    define R1 as $p1 <= cursor
    define RV as $pV <= cursor

    // Noun endings: the longest matching ending is deleted, provided it
    // lies inside the region required for its group.
    define noun_sfx as (
        [substring] among (
            'abh' 'eabh' 'ibh' 'aibh'
            'amh' 'eamh' 'imh' 'aimh'
            '{i'}ocht' '{i'}ochta' 'a{i'}ocht' 'a{i'}ochta'
                (R1 delete)

            'ire' 'aire' 'ir{i'}' 'air{i'}'
                (R2 delete)
        )
    )

    // Derivational endings: generic ones are deleted inside R2; the more
    // specific ones are rewritten to a fixed form with no region test.
    define deriv as (
        [substring] among (
            'ach' 'each' 'acht' 'eacht' 'achta' 'eachta'
            'acht{u'}il' 'eacht{u'}il'
                (R2 delete) //siopadóireacht -> siopadóir but not poblacht -> pobl

            'arcacht' 'arcachta' 'arcachta{i'}'
                (<- 'arc') // monarcacht -> monarc

            'ginis' 'gineas' 'gineach'
                (<- 'gin')

            'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}'
                (<- 'graf')

            'paite' 'patach' 'patacha' 'pataigh'
                (<- 'paite')

            '{o'}idigh' '{o'}ideach' '{o'}ideacha'
                (<- '{o'}id')
        )
    )

    // Verb endings: the first group is deleted when inside RV, the
    // second when inside R1.
    define verb_sfx as (
        [substring] among (
            'fidh' 'faidh'
            'imid' 'aimid' '{i'}mid' 'a{i'}mid'
                (RV delete)

            'adh' 'eadh'
            'ain'
            '{a'}il'
            'tar' 'tear'
                (R1 delete)
        )
    )
)
// Exported entry point. Every step is wrapped in do, so a step that
// fails does not make stem itself fail.
define stem as (

    // Normalise the start of the word before computing regions, so the
    // marks are taken on the already-rewritten word.
    do initial_morph
    do mark_regions

    // Suffix handling works from the end of the word.
    backwards (
        do noun_sfx
        do deriv
        do verb_sfx
    )
)
a
ach
ag
agus
an
aon
ar
arna
as
b'
ba
beirt
bhúr
caoga
ceathair
ceathrar
chomh
ochtó
chuig
chun
cois
céad
cúig
cúigear
d'
daichead
dar
de
deich
deichniúr
den
dhá
do
don
dtí
dár
faoi
faoin
faoina
faoinár
fara
fiche
gach
gan
go
gur
haon
hocht
i
iad
idir
in
ina
ins
inár
is
le
leis
lena
lenár
m'
mar
mo
na
nach
naoi
naonúr
níor
nócha
ocht
ochtar
os
roimh
sa
seacht
seachtar
seachtó
seasca
seisear
siad
sibh
sinn
sna
tar
thar
thú
triúr
trí
trína
trínár
tríocha
um
ár
é
éis
í
ó
ón
óna
ónár
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment