Skip to content

Instantly share code, notes, and snippets.

Last active April 16, 2021 19:18
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brendano/6008945 to your computer and use it in GitHub Desktop.
Save brendano/6008945 to your computer and use it in GitHub Desktop.
"""
Python wrapper for morpha (English lemmatizer)

Wrapper around the external morpha program.
Vaguely follows edu.stanford.nlp.Morphology, except that we talk to morpha
through a pipe. This is hacky: it would be nicer to use cython/swig/ctypes to
embed morpha.yy.c directly as a Python extension.

TODO: compare linguistic quality to the lemmatizer in Python's "pattern" package.

By Brendan O'Connor, at https://gist.github.com/brendano/6008945
"""
import os,subprocess
# Directory holding the morpha executable and its 'verbstem.list' data file.
# Set the MORPHA_DIR environment variable to point at your own install; the
# fallback is the original author's local checkout.  An alternative is to
# keep a 'morph' directory next to this module:
#   MorphaDir = os.path.join(os.path.dirname(__file__), 'morph')
MorphaDir = os.environ.get('MORPHA_DIR', '/Users/brendano/sw/nlp/morpha/morph')

# Executable that is launched as the lemmatizer subprocess.
MorphaCmd = os.path.join(MorphaDir, 'morpha.ix86_darwin.for_pipe')

# Command-line arguments: '-f' plus the path to 'verbstem.list'.
MorphaArgs = ['-f', os.path.join(MorphaDir, 'verbstem.list')]

# Shared subprocess handle; None until the pipe is first opened.
_pipe = None
def get_pipe():
    """Return the shared morpha subprocess, (re)starting it when necessary.

    The pipe is opened on first use.  If the previously opened child has
    exited, a message is printed and a fresh one is started.

    Returns:
        The module-level ``_pipe`` object after ``open_pipe()`` has had the
        chance to (re)create it.
    """
    global _pipe
    if _pipe is None:
        open_pipe()
    elif _pipe.poll() is not None:
        # poll() is what refreshes the child's exit status; reading
        # `returncode` alone stays None until poll()/wait() has been called,
        # so a dead child would otherwise go unnoticed.
        print("Pipe seems to have died, restarting")
        open_pipe()
    return _pipe
def open_pipe():
    """Launch a new morpha subprocess and store it in the module-level ``_pipe``.

    The child is started from ``MorphaCmd`` followed by ``MorphaArgs``, with
    both its stdin and stdout connected to pipes.  A notice is printed first.
    """
    global _pipe
    print("Opening morpha pipe")
    command = [MorphaCmd] + MorphaArgs
    _pipe = subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
def process(input):
    """Send one line to the morpha pipe and return the one line it answers.

    Args:
        input: the text to send; surrounding whitespace is stripped first.
            lemmatize_seq() in this file passes UTF-8 encoded bytes.

    Returns:
        The reply line with its trailing newline removed.

    Raises:
        IOError: if writing to or reading from the pipe still fails on the
            third attempt.
    """
    input = input.strip()
    for retry in range(3):
        pipe = get_pipe()
        try:
            # Bytes literal: the pipe carries encoded bytes, not text.
            pipe.stdin.write(input + b'\n')
            # Make sure the line actually reaches the child before we block
            # waiting for its answer.
            pipe.stdin.flush()
            output = pipe.stdout.readline()
            if not output:
                # readline() only returns an empty value at EOF, i.e. the
                # child closed its stdout; treat that like a broken pipe.
                raise IOError("morpha pipe closed unexpectedly")
        except IOError:
            # Refresh the child's exit status so the next get_pipe() call can
            # see that it died and restart it.
            pipe.poll()
            if retry == 2:
                raise
            print("Retry on pipe breakage")
        else:
            # Success: stop retrying so the input is sent only once.
            return output.rstrip(b'\n')
## From morph/doc.txt....
#Where the -u option is not used, each input token is expected to be of
#the form <word>_<tag>. For example:
# A_AT1 move_NN1 to_TO stop_VV0 Mr._NNS Gaitskell_NP1 from_II nominating_VVG
#Contractions and punctuation must have been separated out into separate
#tokens. The tagset is assumed to resemble CLAWS-2 in the following respects:
# V... all verbs
# NP... all proper names
# N[^P]... all common nouns
#and for specific cases of ambiguous lexical items:
# 'd_VH... root is 'have'
# 'd_VM... root is 'would'
# 's_VBZ... root is 'be'
# 's_VHZ... root is 'have'
# 's_$... possessive morpheme (also _POS for CLAWS-5)
# ai_VB... root is 'be'
# ai_VH... root is 'have'
# ca_VM... root is 'can'
# sha_VM... root is 'shall'
# wo_VM... root is 'will'
# n't_XX... root is 'not'
def ptb_is_proper(ptb):
    """Tell whether *ptb* is one of the proper-noun tags NP, NNP or NNPS.

    The comparison is exact, so it is case-sensitive.
    """
    proper_tags = ('NP', 'NNP', 'NNPS')
    return ptb in proper_tags
def ptb2morphtag(ptb):
    """Map a Penn Treebank POS tag to the coarse tag appended for morpha.

    The tag is upper-cased before matching.  Results:
        'V'   for tags starting with 'V', and for 'MD'
        'NP'  for proper-noun tags (as decided by ptb_is_proper)
        'N'   for any other tag starting with 'N'
        '$'   for 'POS'
        ''    for everything else
    """
    tag = ptb.upper()
    # Treating 'MD' as a verb is the original author's guess ("um is this
    # right? it looks like it can take incomplete versions...").
    if tag.startswith('V') or tag == 'MD':
        return 'V'
    if ptb_is_proper(tag):
        return 'NP'
    if tag.startswith('N'):
        return 'N'
    return '$' if tag == 'POS' else ''
def lemmatize_seq(words_and_pos, tagset='PENN'):
    """Lemmatize a sequence of (word, pos) pairs with morpha.

    Args:
        words_and_pos: list of (word, pos) pairs.  Words are Unicode strings
            and must not contain spaces.  Pairs whose word is empty are not
            sent to morpha and yield '' in the result.
        tagset: only 'PENN' is supported.

    Returns:
        A list of lemma strings, one per input pair and in the same order.
        Lemmas are lower-cased unless the pair's tag is a proper-noun tag.
    """
    assert tagset=='PENN', "don't support different tagsets yet"

    final_results = [''] * len(words_and_pos)
    # Positions of the pairs that actually carry a word.
    goods = [i for i, pair in enumerate(words_and_pos) if pair[0]]
    if not goods:
        # Nothing to lemmatize: skip the round trip to the subprocess.
        return final_results

    # Decorate the input pairs into one big string that morpha wants,
    # run morpha, then undecorate the output.
    # morpha uses '_' to separate word and tag, so literal underscores in a
    # word are swapped for a placeholder and restored afterwards.
    escape_str = '..axsxdxfxqxwxexr..'
    decorated = []
    for i in goods:
        word, pos = words_and_pos[i]
        assert ' ' not in word
        word = word.replace('_', escape_str)
        morph_tag = ptb2morphtag(pos)
        decorated.append(u'{}_{}'.format(word, morph_tag) if morph_tag else word)
    decorated_input = u' '.join(decorated).encode('utf8')  # TODO is morpha utf8 safe?

    result_tokens = process(decorated_input).split()
    assert len(result_tokens) == len(goods)

    # Pair every output token with the ORIGINAL position it came from, so the
    # proper-noun check below reads the tag of the right input pair even when
    # empty words were filtered out ahead of it.
    for i, lemma in zip(goods, result_tokens):
        lemma = lemma.split(b'_')[0]  # Rare.  I think this is a bug in morpha
        lemma = lemma.decode('utf-8', 'replace')  # TODO is morpha utf8 safe?
        lemma = lemma.replace(escape_str, '_')
        if not ptb_is_proper(words_and_pos[i][1]):
            lemma = lemma.lower()
        final_results[i] = lemma
    return final_results
def lemmatize(word, pos, tagset='PENN'):
    """Lemmatize a single word with the given POS tag.

    Wraps the pair in a one-element list, hands it to lemmatize_seq() and
    returns the single lemma that comes back.
    """
    lemmas = lemmatize_seq([(word, pos)], tagset=tagset)
    return lemmas[0]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment