#!/usr/bin/env python
'''
Indonesian stemmer, ported from Ivan Lanin's pengakar stemmer (PHP script):
https://github.com/ivanlanin/pengakar
'''
import sys
import re
from os import path

app_path = path.dirname(path.dirname(path.dirname(path.abspath(__file__))))
sys.path.append(app_path)
# lexicon path (placeholder: point this at the actual word-list file)
LEXICON_PATH = path.join(app_path, 'path to lexicon file')

VOWEL = 'a|i|u|e|o'
CONSONANT = 'b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|y|z'
ANY = ''.join([VOWEL, '|', CONSONANT])
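
# e.g. ''.join(['(meng|peng)', '(', VOWEL, ')', '(.+)']) builds the pattern
# '(meng|peng)(a|i|u|e|o)(.+)', so 'mengambil' matches as ('meng', 'a', 'mbil')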


class Stemmer():

    global_roots = {}
    global_words = {}
    instances = 0
    temp = None
    dictionary = {}
    rules = {}
    options = {
        'SORT_INSTANCE': False,
        'NO_NO_MATCH': False,
        'NO_DIGIT_ONLY': False,
        'STRICT_CONFIX': False,
    }
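    # option flags:
    #   NO_DIGIT_ONLY: skip tokens made up only of digits
    #   NO_NO_MATCH:   drop words whose stem is not found in the lexicon
    #   STRICT_CONFIX: reject roots whose affixes form a disallowed confix pair
    #   SORT_INSTANCE: currently unused (see the commented-out block in stemwords)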

    def __init__(self):
        '''
        read the lexicon file (tab-separated class/lemma rows) into the dictionary
        '''
        with open(LEXICON_PATH, 'r') as f:
            words = f.read().splitlines(True)
        for word in words:
            # explode row: 1st field is the word class, 2nd is the lemma
            attribute = word.lower().split('\t')
            key = attribute[1].replace(' ', '')  # remove spaces, if any
            key = key.rstrip('\n')
            # set to dictionary
            self.dictionary[key] = {'class': attribute[0], 'lemma': attribute[1].rstrip('\n')}
        '''
        define rules
        '''
        # affixes: each entry is [is_suffix flag, list of affix strings]
        self.rules['affixes'] = [
            [1, ['kah', 'lah', 'tah', 'pun']],  # particles
            [1, ['mu', 'ku', 'nya']],           # possessive pronouns
            [0, ['ku', 'kau']],                 # proclitic pronouns
            [1, ['i', 'kan', 'an']],            # derivational suffixes
        ]
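        # e.g. the suffix entry 'kan' becomes the anchored pattern '^(.+)(kan)$'
        # in stem(), so 'bacakan' yields candidate 'baca' with affix '-kan';
        # a proclitic entry like 'ku' becomes '^(ku)(.+)$' (kubaca -> baca, 'ku-')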
        # prefixes: each entry is [0 (prefix flag), regex pattern, recoded initial];
        # the third element restores a root-initial consonant dropped by nasal
        # assimilation (e.g. meny- + s, men- + t, meng- + k, mem- + p)
        self.rules['prefixes'] = [
            [0, ''.join(['(di|ke|se)', '(', ANY, ')', '(.+)']), ''],  # 0
            [0, ''.join(['(ber|ter)', '(', ANY, ')', '(.+)']), ''],  # 1, 6 normal
            [0, ''.join(['(be|te)(r)', '(', VOWEL, ')', '(.+)']), ''],  # 1, 6 be-rambut
            [0, ''.join(['(be|te)', '(', CONSONANT, ')', '(', ANY, '?)', '(er)(.+)']), ''],  # 3, 7 te-besit, te-percaya
            [0, '(bel|pel)(ajar|unjur)', ''],  # ajar, unjur
            [0, '(me|pe)(l|m|n|r|w|y)(.+)', ''],  # 10, 20: merawat, pemain
            [0, '(mem|pem)(b|f|v)(.+)', ''],  # 11, 23: membuat, pembuat
            [0, '(men|pen)(c|d|j|z)(.+)', ''],  # 14, 27: mencabut, pencabut
            [0, '(meng|peng)(g|h|q|x)(.+)', ''],  # 16, 29: menggiring, penghasut
            [0, ''.join(['(meng|peng)', '(', VOWEL, ')', '(.+)']), ''],  # 17, 30: meng-anjurkan, peng-anjur
            [0, ''.join(['(mem|pem)', '(', VOWEL, ')', '(.+)']), 'p'],  # 13, 26: memerkosa, pemerkosa
            [0, ''.join(['(men|pen)', '(', VOWEL, ')', '(.+)']), 't'],  # 15, 28: menutup, penutup
            [0, ''.join(['(meng|peng)', '(', VOWEL, ')', '(.+)']), 'k'],  # 17, 30: mengalikan, pengali
            [0, ''.join(['(meny|peny)', '(', VOWEL, ')', '(.+)']), 's'],  # menyucikan, penyucian
            [0, ''.join(['(mem)(p)', '(', CONSONANT, ')', '(.+)']), ''],  # memproklamasikan
            [0, ''.join(['(pem)', '(', CONSONANT, ')', '(.+)']), 'p'],  # pemrogram
            [0, ''.join(['(men|pen)(t)', '(', CONSONANT, ')', '(.+)']), ''],  # mentransmisikan, pentransmisian
            [0, ''.join(['(meng|peng)(k)', '(', CONSONANT, ')', '(.+)']), ''],  # mengkristalkan, pengkristalan
            [0, ''.join(['(men|pen)(s)', '(', CONSONANT, ')', '(.+)']), ''],  # mensyaratkan, pensyaratan
            [0, ''.join(['(menge|penge)', '(', CONSONANT, ')', '(.+)']), ''],  # swarabakti: mengepel
            [0, ''.join(['(mempe)(r)', '(', VOWEL, ')', '(.+)']), ''],  # 21
            [0, ''.join(['(memper)', '(', ANY, ')', '(.+)']), ''],  # 22
            [0, ''.join(['(pe)', '(', ANY, ')', '(.+)']), ''],  # 20
            [0, ''.join(['(per)', '(', ANY, ')', '(.+)']), ''],  # 21
            [0, ''.join(['(pel)', '(', CONSONANT, ')', '(.+)']), ''],  # 32 pelbagai, others?
            [0, '(mem)(punya)', ''],  # exception: mempunya
            [0, '(pen)(yair)', 's'],  # exception: penyair -> syair
        ]
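        # e.g. 'menutup' matches '^(men|pen)(a|i|u|e|o)(.+)$' as ('men', 'u', 'tup');
        # the prefix group becomes the affix 'men-' and the recoded 't' is
        # prepended, giving the candidate root 'tutup'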
        # disallowed confixes: prefix/suffix pairs that may not co-occur
        self.rules['disallowed_confixes'] = [
            ['ber-', '-i'],
            ['ke-', '-i'],
            ['pe-', '-kan'],
            ['di-', '-an'],
            ['meng-', '-an'],
            ['ter-', '-an'],
            ['ku-', '-an'],
        ]
        # allomorphs
        self.rules['allomorphs'] = {
            'be': ['be-', 'ber-', 'bel-'],
            'te': ['te-', 'ter-', 'tel-'],
            'pe': ['pe-', 'per-', 'pel-', 'pen-', 'pem-', 'peng-', 'peny-', 'penge-'],
            'me': ['me-', 'men-', 'mem-', 'meng-', 'meny-', 'menge-'],
        }
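        # the allomorph groups let the confix check treat e.g. 'ber-' and 'bel-'
        # as variants of the same prefix when matching the pairs above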
        return None

    def stemwords(self, word):
        words = {}
        raw_word = re.compile('[^a-zA-Z0-9\-]').split(word)
        # skip digit-only tokens when the option is set
        for w in raw_word:
            if self.options['NO_DIGIT_ONLY'] and re.match(r'^\d+$', w):
                pass
            else:
                key = w.lower()
                # build words dictionary, counting occurrences
                if key not in words:
                    words[key] = {'count': 0}
                words[key]['count'] += 1
        for key in words.keys():
            words[key]['roots'] = self.stem(key)
            if words[key]['roots'] is not None:
                if len(words[key]['roots']) == 0 and self.options['NO_NO_MATCH']:
                    del words[key]
        word_count = len(words)
        # ignore sort instance for now
        '''
        if self.options['SORT_INSTANCE']:
            print 'with sort instance', words
        else:
            print 'after sort', words
        '''
        return words

    def stem(self, word):
        # preprocess: seed the candidate set; give the word an empty affix
        # list if it is already in the dictionary
        word = word.replace(' ', '')
        self.global_roots = {word: ''}
        if word in self.dictionary:
            self.global_roots[word] = {}
            self.global_roots[word]['affixes'] = []
        # if the word has a dash, also try each of its elements
        if '-' in word:
            words_with_dash = word.split('-')
            for with_dash in words_with_dash:
                self.global_roots[with_dash] = {}
                self.global_roots[with_dash]['affixes'] = []
        # process: strip suffixes, pronouns and particles first, then apply
        # the prefix rules in three passes (prefixes can stack)
        for rules in self.rules['affixes']:
            is_suffix = rules[0]
            affixes = rules[1]
            for affix in affixes:
                pattern = ''.join(['(.+)', '(', affix, ')']) if is_suffix else ''.join(['(', affix, ')', '(.+)'])
                self.add_root(self.global_roots, [is_suffix, pattern, ''])
        for x in range(0, 3):
            for rule in self.rules['prefixes']:
                for (lemma, attrib) in self.global_roots.items():
                    self.add_root({lemma: attrib}, rule)
        # postprocess 1: keep only candidates found in the dictionary, then
        # (when STRICT_CONFIX is set) drop roots built from a disallowed confix pair
        for (lemma, attrib) in self.global_roots.items():
            if lemma not in self.dictionary:
                del self.global_roots[lemma]
                continue
            # escape if we don't have to check valid confix pairs
            if not self.options['STRICT_CONFIX']:
                continue
            if 'affixes' in attrib:
                affixes = attrib['affixes']
            else:
                affixes = []
            for disallowed_confix in self.rules['disallowed_confixes']:
                prefix = disallowed_confix[0]
                suffix = disallowed_confix[1]
                prefix_key = prefix[:2]
                # expand the prefix to its allomorphs where a group exists
                if prefix_key in self.rules['allomorphs']:
                    prefix_variants = self.rules['allomorphs'][prefix_key]
                else:
                    prefix_variants = [prefix]
                has_prefix = any(a in prefix_variants for a in affixes)
                if has_prefix and suffix in affixes and lemma in self.global_roots:
                    del self.global_roots[lemma]
        # postprocess 2: attach lemma/class and split affixes into
        # suffixes and prefixes
        for (lemma, attrib) in self.global_roots.items():
            if 'affixes' in attrib:
                affixes = attrib['affixes']
            else:
                affixes = []
            attrib['lemma'] = self.dictionary[lemma]['lemma']
            attrib['class'] = self.dictionary[lemma]['class']
            attrib['suffixes'] = []
            attrib['prefixes'] = []
            # divide affixes: suffixes start with '-', prefixes end with '-'
            for affix in affixes:
                if affix[:1] == '-':
                    attrib['suffixes'].append(affix)
                else:
                    attrib['prefixes'].append(affix)
            # reverse suffix order
            if 'suffixes' in attrib:
                attrib['suffixes'].reverse()
            self.global_roots[lemma] = attrib
        return self.global_roots
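
    # e.g. stem('membacakan') first strips '-kan' (candidate 'membaca'), then a
    # prefix pass yields 'baca'; if 'baca' is in the lexicon, the result includes
    # {'baca': {'affixes': ['-kan', 'mem-'], 'suffixes': ['-kan'], 'prefixes': ['mem-'], ...}}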
    def add_root(self, roots, rule):
        is_suffix = rule[0]
        pattern = ''.join(['^', rule[1], '$'])
        variant = rule[2]
        for (lemma, attrib) in roots.items():
            root_regex = re.compile(pattern)
            result = root_regex.findall(lemma)
            # the outputs are tuples of groups, so flatten them into a list
            matches = []
            if len(result) > 0:
                for res in result:
                    to_list = list(res)
                    for x in to_list:
                        matches.append(x)
            if len(matches) > 0:
                new_lemma = ''
                new_affix = ''
                # affix group index: 1 for suffix patterns '(.+)(affix)',
                # 0 for prefix patterns '(affix)...(.+)'
                affix_index = 1 if is_suffix else 0
                for x in xrange(0, len(matches)):
                    if x != affix_index:
                        new_lemma = ''.join([new_lemma, matches[x]])
                # prepend the recoded initial consonant, if the rule has one
                if variant:
                    new_lemma = ''.join([variant, new_lemma])
                # mark the affix: leading '-' for a suffix, trailing '-' for a prefix
                suffix_val = '-' if is_suffix else ''
                new_affix = ''.join([suffix_val])
                new_affix = ''.join([new_affix, matches[affix_index]])
                suffix_val = '' if is_suffix else '-'
                new_affix = ''.join([new_affix, suffix_val])
                # build as a list, appended after any affixes already stripped
                new_affix = [new_affix]
                if 'affixes' in attrib:
                    new_affix = attrib['affixes'] + new_affix
                # push the new candidate to self.global_roots
                self.global_roots[new_lemma] = {'affixes': new_affix}
        return None
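
    # e.g. add_root({'membaca': {'affixes': ['-kan']}}, [0, '(mem|pem)(b|f|v)(.+)', ''])
    # matches ('mem', 'b', 'aca'), drops the affix group, and sets
    # global_roots['baca'] = {'affixes': ['-kan', 'mem-']}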


welcomeMsg = 'usage: python stemmer.py "word"'


def main():
    if len(sys.argv) <= 1:
        print welcomeMsg
        sys.exit(1)
    else:
        words = sys.argv[1]
        stem = Stemmer()
        result = stem.stemwords(words)
        if result:
            # stemwords lowercases its keys, so look up the lowercased argument
            roots = result[words.lower()]['roots']
            #for key in roots:
            #    print roots[key]['lemma']
            print result
        else:
            print "ouch snap!"


if __name__ == "__main__":
    main()
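
# example run (assuming LEXICON_PATH points at a tab-separated
# "class<TAB>lemma" word list that contains 'baca'):
#   $ python stemmer.py membacakan
#   {'membacakan': {'count': 1, 'roots': {'baca': {...}}}}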