#!/usr/bin/env python
'''
Indonesian stemmer ported from ivanlanin's stemmer (PHP script)
https://github.com/ivanlanin/pengakar

Gist by @yudanta, created February 13, 2016
'''
import sys
import re
import json
from os import path

app_path = path.dirname(path.dirname(path.dirname(path.abspath(__file__))))
sys.path.append(app_path)

# lexicon path
LEXICON_PATH = path.join(app_path, 'path to lexicon file')
VOWEL = 'a|i|u|e|o'
CONSONANT = 'b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|y|z'
ANY = ''.join([VOWEL, '|', CONSONANT])

class Stemmer():
    global_roots = {}
    global_words = {}
    instances = 0
    temp = None
    dictionary = {}
    rules = {}
    options = {
        'SORT_INSTANCE': False,
        'NO_NO_MATCH': False,
        'NO_DIGIT_ONLY': False,
        'STRICT_CONFIX': False,
    }

    def __init__(self):
        '''
        read the lexicon file and load each word into the dictionary
        '''
        with open(LEXICON_PATH, 'r') as f:
            words = f.read().splitlines(True)
        #print len(words)
        for word in words:
            # split row: 1st field is the word class, 2nd is the lemma
            attribute = word.lower().split('\t')
            key = attribute[1].replace(' ', '')  # remove spaces if any
            key = key.rstrip('\n')
            # set to dictionary
            self.dictionary[key] = {'class': attribute[0], 'lemma': attribute[1].rstrip('\n')}
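        # Assumed lexicon line format: "class<TAB>lemma", e.g. "n\tbuku",
        # which loads as self.dictionary['buku'] = {'class': 'n', 'lemma': 'buku'}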
        '''
        define rules
        '''
        # affixes: [is_suffix, [affix, ...]]
        self.rules['affixes'] = [
            [1, ['kah', 'lah', 'tah', 'pun']],
            [1, ['mu', 'ku', 'nya']],
            [0, ['ku', 'kau']],
            [1, ['i', 'kan', 'an']]
        ]
        # prefixes: [is_suffix, pattern, variant]
        self.rules['prefixes'] = [
            [0, ''.join(['(di|ke|se)', '(', ANY, ')', '(.+)']), ''],  # 0
            [0, ''.join(['(ber|ter)', '(', ANY, ')', '(.+)']), ''],  # 1, 6 normal
            [0, ''.join(['(be|te)(r)', '(', VOWEL, ')', '(.+)']), ''],  # 1, 6 be-rambut
            [0, ''.join(['(be|te)', '(', CONSONANT, ')', '(', ANY, '?)', '(er)(.+)']), ''],  # 3, 7 te-besit, te-percaya
            [0, '(bel|pel)(ajar|unjur)', ''],  # ajar, unjur
            [0, '(me|pe)(l|m|n|r|w|y)(.+)', ''],  # 10, 20: merawat, pemain
            [0, '(mem|pem)(b|f|v)(.+)', ''],  # 11, 23: membuat, pembuat
            [0, '(men|pen)(c|d|j|z)(.+)', ''],  # 14, 27: mencabut, pencabut
            [0, '(meng|peng)(g|h|q|x)(.+)', ''],  # 16, 29: menggiring, penghasut
            [0, ''.join(['(meng|peng)', '(', VOWEL, ')', '(.+)']), ''],  # 17, 30: meng-anjurkan, peng-anjur
            [0, ''.join(['(mem|pem)', '(', VOWEL, ')', '(.+)']), 'p'],  # 13, 26: memerkosa, pemerkosa
            [0, ''.join(['(men|pen)', '(', VOWEL, ')', '(.+)']), 't'],  # 15, 28: menutup, penutup
            [0, ''.join(['(meng|peng)', '(', VOWEL, ')', '(.+)']), 'k'],  # 17, 30: mengalikan, pengali
            [0, ''.join(['(meny|peny)', '(', VOWEL, ')', '(.+)']), 's'],  # menyucikan, penyucian
            [0, ''.join(['(mem)(p)', '(', CONSONANT, ')', '(.+)']), ''],  # memproklamasikan
            [0, ''.join(['(pem)', '(', CONSONANT, ')', '(.+)']), 'p'],  # pemrogram
            [0, ''.join(['(men|pen)(t)', '(', CONSONANT, ')', '(.+)']), ''],  # mentransmisikan, pentransmisian
            [0, ''.join(['(meng|peng)(k)', '(', CONSONANT, ')', '(.+)']), ''],  # mengkristalkan, pengkristalan
            [0, ''.join(['(men|pen)(s)', '(', CONSONANT, ')', '(.+)']), ''],  # mensyaratkan, pensyaratan
            [0, ''.join(['(menge|penge)', '(', CONSONANT, ')', '(.+)']), ''],  # swarabakti: mengepel
            [0, ''.join(['(mempe)(r)', '(', VOWEL, ')', '(.+)']), ''],  # 21
            [0, ''.join(['(memper)', '(', ANY, ')', '(.+)']), ''],  # 22
            [0, ''.join(['(pe)', '(', ANY, ')', '(.+)']), ''],  # 20
            [0, ''.join(['(per)', '(', ANY, ')', '(.+)']), ''],  # 21
            [0, ''.join(['(pel)', '(', CONSONANT, ')', '(.+)']), ''],  # 32 pelbagai, other?
            [0, '(mem)(punya)', ''],  # exception: mempunya
            [0, '(pen)(yair)', 's'],  # exception: penyair > syair
        ]
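        # Worked example of a prefix rule: [0, '(men|pen)(a|i|u|e|o)(.+)', 't']
        # matches 'menutup' as ('men', 'u', 'tup'); add_root drops the affix
        # group and prepends the variant, giving 't' + 'utup' = 'tutup'.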
        # disallowed confix pairs
        self.rules['disallowed_confixes'] = [
            ['ber-', '-i'],
            ['ke-', '-i'],
            ['pe-', '-kan'],
            ['di-', '-an'],
            ['meng-', '-an'],
            ['ter-', '-an'],
            ['ku-', '-an'],
        ]
        # allomorphs
        self.rules['allomorphs'] = {
            'be': ['be-', 'ber-', 'bel-'],
            'te': ['te-', 'ter-', 'tel-'],
            'pe': ['pe-', 'per-', 'pel-', 'pen-', 'pem-', 'peng-', 'peny-', 'penge-'],
            'me': ['me-', 'men-', 'mem-', 'meng-', 'meny-', 'menge-'],
        }
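        # Each allomorph entry expands a prefix to all of its surface forms,
        # so the disallowed pair ['ber-', '-i'] also covers 'be-' and 'bel-'
        # when confix pairs are validated under STRICT_CONFIX (see stem()).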

    def stemwords(self, word):
        words = {}
        raw_word = re.compile('[^a-zA-Z0-9\-]').split(word)
        # drop digit-only tokens when NO_DIGIT_ONLY is set
        for w in raw_word:
            if self.options['NO_DIGIT_ONLY'] and re.match('^\d+$', w):
                pass
            else:
                key = w.lower()
                # build the words dictionary, counting repeated tokens
                if key not in words:
                    words[key] = {'count': 0}
                words[key]['count'] += 1
        for key in words.keys():
            words[key]['roots'] = self.stem(key)
            if words[key]['roots'] != None:
                # drop tokens with no dictionary match when NO_NO_MATCH is set
                if len(words[key]['roots']) == 0 and self.options['NO_NO_MATCH']:
                    del words[key]
        word_count = len(words)
        # sort-by-instance handling is not ported yet
        '''
        if self.options['SORT_INSTANCE']:
            print 'with sort instance', words
        else:
            print 'after sort', words
        '''
        return words

    def stem(self, word):
        # preprocess: seed the candidate list; give dictionary words an empty affix list
        word = word.replace(' ', '')
        self.global_roots = {word: ''}
        if word in self.dictionary:
            self.global_roots[word] = {}
            self.global_roots[word]['affixes'] = []
        # if the word has a dash, also try each element on its own
        if '-' in word:
            words_with_dash = word.split('-')
            for with_dash in words_with_dash:
                self.global_roots[with_dash] = {}
                self.global_roots[with_dash]['affixes'] = []
        # process: find suffixes, pronoun prefixes, and particles
        for rules in self.rules['affixes']:
            is_suffix = rules[0]
            affixes = rules[1]
            for affix in affixes:
                pattern = ''.join(['(.+)', '(', affix, ')']) if is_suffix else ''.join(['(', affix, ')', '(.+)'])
                self.add_root(self.global_roots, [is_suffix, pattern, ''])
        # run the prefix rules three times to peel nested prefixes
        for x in range(0, 3):
            for rule in self.rules['prefixes']:
                for (lemma, attrib) in self.global_roots.items():
                    self.add_root({lemma: attrib}, rule)
                    #self.add_root({'membaca':{'affixes':['-kan']}}, rule)
        # postprocess 1: keep only candidates found in the dictionary
        for (lemma, attrib) in self.global_roots.items():
            if lemma not in self.dictionary:
                del self.global_roots[lemma]
                continue
            # escape if we don't have to check valid confix pairs
            if not self.options['STRICT_CONFIX']:
                continue
            affixes = attrib['affixes'] if 'affixes' in attrib else []
            for disallow_confix in self.rules['disallowed_confixes']:
                if lemma not in self.global_roots:
                    break  # already removed by an earlier pair
                prefix = disallow_confix[0]
                suffix = disallow_confix[1]
                prefix_key = prefix[:2]
                if prefix_key in self.rules['allomorphs']:
                    for allomorf in self.rules['allomorphs'][prefix_key]:
                        if allomorf in affixes and suffix in affixes:
                            del self.global_roots[lemma]
                            break
                else:
                    if prefix in affixes and suffix in affixes:
                        del self.global_roots[lemma]
        # postprocess 2: divide affixes into suffixes and prefixes
        for (lemma, attrib) in self.global_roots.items():
            affixes = attrib['affixes'] if 'affixes' in attrib else []
            attrib['lemma'] = self.dictionary[lemma]['lemma']
            attrib['class'] = self.dictionary[lemma]['class']
            attrib['suffixes'] = []
            attrib['prefixes'] = []
            for affix in affixes:
                if affix[:1] == '-':
                    attrib['suffixes'].append(affix)
                else:
                    attrib['prefixes'].append(affix)
            # reverse suffix order (innermost affix last)
            attrib['suffixes'].reverse()
            self.global_roots[lemma] = attrib
        return self.global_roots
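
    # Example return shape (assuming a lexicon that lists 'baca' as a verb):
    #   Stemmer().stem('membacakan')
    #   -> {'baca': {'affixes': ['-kan', 'mem-'], 'lemma': 'baca', 'class': 'v',
    #                'suffixes': ['-kan'], 'prefixes': ['mem-']}}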

    def add_root(self, roots, rule):
        is_suffix = rule[0]
        pattern = ''.join(['^', rule[1], '$'])
        variant = rule[2]
        root_regex = re.compile(pattern)
        for (lemma, attrib) in roots.items():
            result = root_regex.findall(lemma)
            # the matches come back as tuples, so flatten them into a list
            matches = []
            if len(result) > 0:
                for res in result:
                    for x in list(res):
                        matches.append(x)
            if len(matches) > 0:
                new_lemma = ''
                affix_index = 1 if is_suffix else 0
                for x in xrange(0, len(matches)):
                    if x != affix_index:
                        new_lemma = ''.join([new_lemma, matches[x]])
                # prepend the variant, if any
                if variant:
                    new_lemma = ''.join([variant, new_lemma])
                # affix: add '-' before a suffix, after a prefix
                suffix_val = '-' if is_suffix else ''
                new_affix = ''.join([suffix_val, matches[affix_index]])
                suffix_val = '' if is_suffix else '-'
                new_affix = ''.join([new_affix, suffix_val])
                # build as list, appended to any affixes already stripped
                new_affix = [new_affix]
                if 'affixes' in attrib:
                    new_affix = attrib['affixes'] + new_affix
                # push to the global root candidates
                self.global_roots[new_lemma] = {'affixes': new_affix}
        return None
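
    # Example: add_root({'membaca': {'affixes': ['-kan']}},
    #                   [0, '(mem|pem)(b|f|v)(.+)', ''])
    # matches 'membaca' as ('mem', 'b', 'aca') and records the new candidate
    # self.global_roots['baca'] = {'affixes': ['-kan', 'mem-']}.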

welcomeMsg = 'usage: python stemmer.py "word"'


def main():
    if len(sys.argv) <= 1:
        print welcomeMsg
        sys.exit(1)
    else:
        words = sys.argv[1]
        stem = Stemmer()
        result = stem.stemwords(words)
        if result:
            # stemwords lowercases its keys
            roots = result[words.lower()]['roots']
            #for key in roots:
            #    print roots[key]['lemma']
            print result
        else:
            print "ouch snap!"


if __name__ == "__main__":
    main()
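
# Example run (assuming LEXICON_PATH points at a real tab-separated lexicon
# containing the lemma 'baca'):
#   $ python stemmer.py membacakan
#   {'membacakan': {'count': 1, 'roots': {'baca': {...}}}}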