Skip to content

Instantly share code, notes, and snippets.

@fnl
Created September 25, 2014 10:03
Show Gist options
  • Save fnl/21116fa57527946c5dbe to your computer and use it in GitHub Desktop.
Save fnl/21116fa57527946c5dbe to your computer and use it in GitHub Desktop.
A simpler feature extractor for PoS tagging with CRFsuite. Input now only needs to be 'w y', meaning the word itself and the PoS tag, separated by a space.
#!/usr/bin/env python
"""
An example for part-of-speech tagging.
Copyright 2010,2011 Naoaki Okazaki.
"""
import crfutils
# Character that separates the field values on each input line.
separator = ' '

# Names of the input fields, in order: the word ('w') and its PoS tag ('y').
fields = 'w y'

# Feature templates, written as a Python object.  Each template is a tuple of
# (attribute name, relative token offset) pairs.  (The original author notes
# that the template follows the one bundled in the CRF++ distribution.)
# The tuple is assembled group by group; the resulting order is significant
# only in that it reproduces the original hand-written listing exactly.
templates = (
    # Token-shape, prefix and suffix attributes of the current token.
    tuple(
        ((attr, 0),)
        for attr in ('num', 'cap', 'sym',
                     'p1', 'p2', 'p3', 'p4',
                     's1', 's2', 's3', 's4')
    )
    # Single words within a +/-2 window.
    + tuple((('w', offset),) for offset in (0, -1, 1, -2, 2))
    # Contiguous word n-grams (n = 2..5) inside the +/-2 window, given as
    # inclusive (first offset, last offset) ranges.
    + tuple(
        tuple(('w', offset) for offset in range(first, last + 1))
        for first, last in ((-2, -1), (-1, 0), (0, 1), (1, 2),
                            (-2, 0), (-1, 1), (0, 2),
                            (-2, 1), (-1, 2),
                            (-2, 2))
    )
    # Current word paired with each of the nine preceding words.
    + tuple((('w', 0), ('w', -dist)) for dist in range(1, 10))
    # Current word paired with each of the nine following words.
    # NOTE: (('w', 0), ('w', 1)) therefore occurs twice (also among the
    # n-grams above), exactly as in the original listing.
    + tuple((('w', 0), ('w', dist)) for dist in range(1, 10))
)
def observation(v, defval=''):
    """Add token-shape, prefix and suffix attributes to the item *v* in place.

    *v* is a mapping that must already hold the word under the key ``'w'``.
    The following string-valued keys are written:

    * ``'num'`` -- ``'True'`` if the word consists only of digits.
    * ``'cap'`` -- ``'True'`` if the word is title-cased.
    * ``'sym'`` -- ``'True'`` if no character of the word is alphanumeric
      (this is also ``'True'`` for an empty word, since there is no
      character to violate the condition).
    * ``'p1'`` .. ``'p4'`` -- the first 1..4 characters of the word.
    * ``'s1'`` .. ``'s4'`` -- the last 1..4 characters of the word.

    A prefix/suffix attribute is set to *defval* when the word is shorter
    than the requested length.  Nothing is returned.
    """
    word = v['w']

    # Boolean shape attributes are stored as the strings 'True' / 'False'.
    v['num'] = str(word.isdigit())
    v['cap'] = str(word.istitle())
    v['sym'] = str(all(not c.isalnum() for c in word))

    # Prefixes (length between one and four).
    for size in range(1, 5):
        v['p%d' % size] = word[:size] if len(word) >= size else defval

    # Suffixes (length between one and four).
    for size in range(1, 5):
        v['s%d' % size] = word[-size:] if len(word) >= size else defval
def feature_extractor(X):
    """Generate the features for every item of the sequence *X* in place.

    Each item of *X* is first enriched by ``observation`` and the module's
    ``templates`` are then applied through ``crfutils.apply_templates``.
    Finally, for a non-empty sequence, the markers ``'__BOS__'`` and
    ``'__EOS__'`` are appended to the ``'F'`` list of the first and the last
    item respectively (for a one-item sequence both go to the same item).

    NOTE(review): the ``'F'`` list on each item is presumably created by
    ``crfutils`` -- it is not set up anywhere in this function; confirm.
    Nothing is returned.
    """
    # Append observations.
    for x in X:
        observation(x)

    # Apply the feature templates.
    crfutils.apply_templates(X, templates)

    # Append BOS and EOS features manually; an empty sequence has neither.
    if X:
        X[0]['F'].append('__BOS__')   # BOS feature
        X[-1]['F'].append('__EOS__')  # EOS feature
if __name__ == '__main__':
    # Script entry point: hand the extractor, the field names and the field
    # separator defined in this module over to crfutils' driver.
    crfutils.main(feature_extractor, fields=fields, sep=separator)
@jamal833
Copy link

Hey,
Could you please explain the steps for using it?

@jamal833
Copy link

Could you please provide its output file for a small dataset?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment