#!/usr/bin/env python
'''
Indonesian stemmer, ported from Ivan Lanin's pengakar stemmer (PHP script):
https://github.com/ivanlanin/pengakar
'''
import sys
import re
from os import path

app_path = path.dirname(path.dirname(path.dirname(path.abspath(__file__))))
sys.path.append(app_path)
# lexicon path (placeholder: point this at the actual word-list file)
LEXICON_PATH = path.join(app_path, 'path to lexicon file')

VOWEL = 'a|i|u|e|o'
CONSONANT = 'b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|y|z'
ANY = ''.join([VOWEL, '|', CONSONANT])
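
# e.g. ''.join(['(meng|peng)', '(', VOWEL, ')', '(.+)']) builds the pattern
# '(meng|peng)(a|i|u|e|o)(.+)', so 'mengambil' matches as ('meng', 'a', 'mbil')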


class Stemmer():

    global_roots = {}
    global_words = {}
    instances = 0
    temp = None
    dictionary = {}
    rules = {}
    options = {
        'SORT_INSTANCE': False,
        'NO_NO_MATCH': False,
        'NO_DIGIT_ONLY': False,
        'STRICT_CONFIX': False,
    }
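    # option flags:
    #   NO_DIGIT_ONLY: skip tokens made up only of digits
    #   NO_NO_MATCH:   drop words whose stem is not found in the lexicon
    #   STRICT_CONFIX: reject roots whose affixes form a disallowed confix pair
    #   SORT_INSTANCE: currently unused (see the commented-out block in stemwords)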

    def __init__(self):
        '''
        read the lexicon file (tab-separated class/lemma rows) into the dictionary
        '''
        with open(LEXICON_PATH, 'r') as f:
            words = f.read().splitlines(True)
        for word in words:
            # explode row: 1st field is the word class, 2nd is the lemma
            attribute = word.lower().split('\t')
            key = attribute[1].replace(' ', '')  # remove spaces, if any
            key = key.rstrip('\n')
            # set to dictionary
            self.dictionary[key] = {'class': attribute[0], 'lemma': attribute[1].rstrip('\n')}
        '''
        define rules
        '''
        # affixes: each entry is [is_suffix flag, list of affix strings]
        self.rules['affixes'] = [
            [1, ['kah', 'lah', 'tah', 'pun']],  # particles
            [1, ['mu', 'ku', 'nya']],           # possessive pronouns
            [0, ['ku', 'kau']],                 # proclitic pronouns
            [1, ['i', 'kan', 'an']],            # derivational suffixes
        ]
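        # e.g. the suffix entry 'kan' becomes the anchored pattern '^(.+)(kan)$'
        # in stem(), so 'bacakan' yields candidate 'baca' with affix '-kan';
        # a proclitic entry like 'ku' becomes '^(ku)(.+)$' (kubaca -> baca, 'ku-')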
        # prefixes: each entry is [0 (prefix flag), regex pattern, recoded initial];
        # the third element restores a root-initial consonant dropped by nasal
        # assimilation (e.g. meny- + s, men- + t, meng- + k, mem- + p)
        self.rules['prefixes'] = [
            [0, ''.join(['(di|ke|se)', '(', ANY, ')', '(.+)']), ''],  # 0
            [0, ''.join(['(ber|ter)', '(', ANY, ')', '(.+)']), ''],  # 1, 6 normal
            [0, ''.join(['(be|te)(r)', '(', VOWEL, ')', '(.+)']), ''],  # 1, 6 be-rambut
            [0, ''.join(['(be|te)', '(', CONSONANT, ')', '(', ANY, '?)', '(er)(.+)']), ''],  # 3, 7 te-besit, te-percaya
            [0, '(bel|pel)(ajar|unjur)', ''],  # ajar, unjur
            [0, '(me|pe)(l|m|n|r|w|y)(.+)', ''],  # 10, 20: merawat, pemain
            [0, '(mem|pem)(b|f|v)(.+)', ''],  # 11, 23: membuat, pembuat
            [0, '(men|pen)(c|d|j|z)(.+)', ''],  # 14, 27: mencabut, pencabut
            [0, '(meng|peng)(g|h|q|x)(.+)', ''],  # 16, 29: menggiring, penghasut
            [0, ''.join(['(meng|peng)', '(', VOWEL, ')', '(.+)']), ''],  # 17, 30: meng-anjurkan, peng-anjur
            [0, ''.join(['(mem|pem)', '(', VOWEL, ')', '(.+)']), 'p'],  # 13, 26: memerkosa, pemerkosa
            [0, ''.join(['(men|pen)', '(', VOWEL, ')', '(.+)']), 't'],  # 15, 28: menutup, penutup
            [0, ''.join(['(meng|peng)', '(', VOWEL, ')', '(.+)']), 'k'],  # 17, 30: mengalikan, pengali
            [0, ''.join(['(meny|peny)', '(', VOWEL, ')', '(.+)']), 's'],  # menyucikan, penyucian
            [0, ''.join(['(mem)(p)', '(', CONSONANT, ')', '(.+)']), ''],  # memproklamasikan
            [0, ''.join(['(pem)', '(', CONSONANT, ')', '(.+)']), 'p'],  # pemrogram
            [0, ''.join(['(men|pen)(t)', '(', CONSONANT, ')', '(.+)']), ''],  # mentransmisikan, pentransmisian
            [0, ''.join(['(meng|peng)(k)', '(', CONSONANT, ')', '(.+)']), ''],  # mengkristalkan, pengkristalan
            [0, ''.join(['(men|pen)(s)', '(', CONSONANT, ')', '(.+)']), ''],  # mensyaratkan, pensyaratan
            [0, ''.join(['(menge|penge)', '(', CONSONANT, ')', '(.+)']), ''],  # swarabakti: mengepel
            [0, ''.join(['(mempe)(r)', '(', VOWEL, ')', '(.+)']), ''],  # 21
            [0, ''.join(['(memper)', '(', ANY, ')', '(.+)']), ''],  # 22
            [0, ''.join(['(pe)', '(', ANY, ')', '(.+)']), ''],  # 20
            [0, ''.join(['(per)', '(', ANY, ')', '(.+)']), ''],  # 21
            [0, ''.join(['(pel)', '(', CONSONANT, ')', '(.+)']), ''],  # 32 pelbagai, others?
            [0, '(mem)(punya)', ''],  # exception: mempunya
            [0, '(pen)(yair)', 's'],  # exception: penyair -> syair
        ]
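        # e.g. 'menutup' matches '^(men|pen)(a|i|u|e|o)(.+)$' as ('men', 'u', 'tup');
        # the prefix group becomes the affix 'men-' and the recoded 't' is
        # prepended, giving the candidate root 'tutup'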
        # disallowed confixes: prefix/suffix pairs that may not co-occur
        self.rules['disallowed_confixes'] = [
            ['ber-', '-i'],
            ['ke-', '-i'],
            ['pe-', '-kan'],
            ['di-', '-an'],
            ['meng-', '-an'],
            ['ter-', '-an'],
            ['ku-', '-an'],
        ]
        # allomorphs
        self.rules['allomorphs'] = {
            'be': ['be-', 'ber-', 'bel-'],
            'te': ['te-', 'ter-', 'tel-'],
            'pe': ['pe-', 'per-', 'pel-', 'pen-', 'pem-', 'peng-', 'peny-', 'penge-'],
            'me': ['me-', 'men-', 'mem-', 'meng-', 'meny-', 'menge-'],
        }
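        # the allomorph groups let the confix check treat e.g. 'ber-' and 'bel-'
        # as variants of the same prefix when matching the pairs above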
        return None

    def stemwords(self, word):
        words = {}
        raw_word = re.compile('[^a-zA-Z0-9\-]').split(word)
        # skip digit-only tokens when the option is set
        for w in raw_word:
            if self.options['NO_DIGIT_ONLY'] and re.match(r'^\d+$', w):
                pass
            else:
                key = w.lower()
                # build words dictionary, counting occurrences
                if key not in words:
                    words[key] = {'count': 0}
                words[key]['count'] += 1
        for key in words.keys():
            words[key]['roots'] = self.stem(key)
            if words[key]['roots'] is not None:
                if len(words[key]['roots']) == 0 and self.options['NO_NO_MATCH']:
                    del words[key]
        word_count = len(words)
        # ignore sort instance for now
        '''
        if self.options['SORT_INSTANCE']:
            print 'with sort instance', words
        else:
            print 'after sort', words
        '''
        return words

    def stem(self, word):
        # preprocess: seed the candidate set; give the word an empty affix
        # list if it is already in the dictionary
        word = word.replace(' ', '')
        self.global_roots = {word: ''}
        if word in self.dictionary:
            self.global_roots[word] = {}
            self.global_roots[word]['affixes'] = []
        # if the word has a dash, also try each of its elements
        if '-' in word:
            words_with_dash = word.split('-')
            for with_dash in words_with_dash:
                self.global_roots[with_dash] = {}
                self.global_roots[with_dash]['affixes'] = []
        # process: strip suffixes, pronouns and particles first, then apply
        # the prefix rules in three passes (prefixes can stack)
        for rules in self.rules['affixes']:
            is_suffix = rules[0]
            affixes = rules[1]
            for affix in affixes:
                pattern = ''.join(['(.+)', '(', affix, ')']) if is_suffix else ''.join(['(', affix, ')', '(.+)'])
                self.add_root(self.global_roots, [is_suffix, pattern, ''])
        for x in range(0, 3):
            for rule in self.rules['prefixes']:
                for (lemma, attrib) in self.global_roots.items():
                    self.add_root({lemma: attrib}, rule)
        # postprocess 1: keep only candidates found in the dictionary, then
        # (when STRICT_CONFIX is set) drop roots built from a disallowed confix pair
        for (lemma, attrib) in self.global_roots.items():
            if lemma not in self.dictionary:
                del self.global_roots[lemma]
                continue
            # escape if we don't have to check valid confix pairs
            if not self.options['STRICT_CONFIX']:
                continue
            if 'affixes' in attrib:
                affixes = attrib['affixes']
            else:
                affixes = []
            for disallowed_confix in self.rules['disallowed_confixes']:
                prefix = disallowed_confix[0]
                suffix = disallowed_confix[1]
                prefix_key = prefix[:2]
                # expand the prefix to its allomorphs where a group exists
                if prefix_key in self.rules['allomorphs']:
                    prefix_variants = self.rules['allomorphs'][prefix_key]
                else:
                    prefix_variants = [prefix]
                has_prefix = any(a in prefix_variants for a in affixes)
                if has_prefix and suffix in affixes and lemma in self.global_roots:
                    del self.global_roots[lemma]
        # postprocess 2: attach lemma/class and split affixes into
        # suffixes and prefixes
        for (lemma, attrib) in self.global_roots.items():
            if 'affixes' in attrib:
                affixes = attrib['affixes']
            else:
                affixes = []
            attrib['lemma'] = self.dictionary[lemma]['lemma']
            attrib['class'] = self.dictionary[lemma]['class']
            attrib['suffixes'] = []
            attrib['prefixes'] = []
            # divide affixes: suffixes start with '-', prefixes end with '-'
            for affix in affixes:
                if affix[:1] == '-':
                    attrib['suffixes'].append(affix)
                else:
                    attrib['prefixes'].append(affix)
            # reverse suffix order
            if 'suffixes' in attrib:
                attrib['suffixes'].reverse()
            self.global_roots[lemma] = attrib
        return self.global_roots
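
    # e.g. stem('membacakan') first strips '-kan' (candidate 'membaca'), then a
    # prefix pass yields 'baca'; if 'baca' is in the lexicon, the result includes
    # {'baca': {'affixes': ['-kan', 'mem-'], 'suffixes': ['-kan'], 'prefixes': ['mem-'], ...}}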
    def add_root(self, roots, rule):
        is_suffix = rule[0]
        pattern = ''.join(['^', rule[1], '$'])
        variant = rule[2]
        for (lemma, attrib) in roots.items():
            root_regex = re.compile(pattern)
            result = root_regex.findall(lemma)
            # the outputs are tuples of groups, so flatten them into a list
            matches = []
            if len(result) > 0:
                for res in result:
                    to_list = list(res)
                    for x in to_list:
                        matches.append(x)
            if len(matches) > 0:
                new_lemma = ''
                new_affix = ''
                # affix group index: 1 for suffix patterns '(.+)(affix)',
                # 0 for prefix patterns '(affix)...(.+)'
                affix_index = 1 if is_suffix else 0
                for x in xrange(0, len(matches)):
                    if x != affix_index:
                        new_lemma = ''.join([new_lemma, matches[x]])
                # prepend the recoded initial consonant, if the rule has one
                if variant:
                    new_lemma = ''.join([variant, new_lemma])
                # mark the affix: leading '-' for a suffix, trailing '-' for a prefix
                suffix_val = '-' if is_suffix else ''
                new_affix = ''.join([suffix_val])
                new_affix = ''.join([new_affix, matches[affix_index]])
                suffix_val = '' if is_suffix else '-'
                new_affix = ''.join([new_affix, suffix_val])
                # build as a list, appended after any affixes already stripped
                new_affix = [new_affix]
                if 'affixes' in attrib:
                    new_affix = attrib['affixes'] + new_affix
                # push the new candidate to self.global_roots
                self.global_roots[new_lemma] = {'affixes': new_affix}
        return None
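
    # e.g. add_root({'membaca': {'affixes': ['-kan']}}, [0, '(mem|pem)(b|f|v)(.+)', ''])
    # matches ('mem', 'b', 'aca'), drops the affix group, and sets
    # global_roots['baca'] = {'affixes': ['-kan', 'mem-']}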


welcomeMsg = 'usage: python stemmer.py "word"'


def main():
    if len(sys.argv) <= 1:
        print welcomeMsg
        sys.exit(1)
    else:
        words = sys.argv[1]
        stem = Stemmer()
        result = stem.stemwords(words)
        if result:
            # stemwords lowercases its keys, so look up the lowercased argument
            roots = result[words.lower()]['roots']
            #for key in roots:
            #    print roots[key]['lemma']
            print result
        else:
            print "ouch snap!"


if __name__ == "__main__":
    main()
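
# example run (assuming LEXICON_PATH points at a tab-separated
# "class<TAB>lemma" word list that contains 'baca'):
#   $ python stemmer.py membacakan
#   {'membacakan': {'count': 1, 'roots': {'baca': {...}}}}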