Skip to content

Instantly share code, notes, and snippets.

@nschneid
Last active June 15, 2017 22:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save nschneid/beed0bcda5b42e530011 to your computer and use it in GitHub Desktop.
Save nschneid/beed0bcda5b42e530011 to your computer and use it in GitHub Desktop.
Given a new-style Penn Treebank English tree, produce the part-of-speech tags according to the Universal Dependencies project.
#!/usr/bin/env python2.7
'''
Converts new-style PTB POS tags to the English tagset from the Universal Dependencies project
(see universal-pos-en.html, from http://universaldependencies.github.io/docs/en/pos/all.html).
There are 17 such tags, expanded from the original 12 Universal POS tags of Petrov et al. 2011.
See "limitations" comment below for some details on our interpretation of the difficult-to-map
categories.
In new-style PTB, TO only applies to prepositional (not infinitival) "to".
Options: -i to include the token inline; -m to enable unofficial tag refinement: AUX.MD for modal auxiliaries
Input: Parse trees, one per line, prefixed with a tab-separated sentence ID.
@author: Nathan Schneider (nschneid@inf.ed.ac.uk)
@since: 2015-08-23
'''
from __future__ import print_function
import sys, fileinput, re
from collections import Counter
# Command-line flag handling: -i prints each token inline with its tag,
# -m refines modal auxiliaries to AUX.MD. Consumed flags are removed from
# sys.argv so that fileinput later sees only input file names.
INLINE_TOK = DISTINGUISH_MD = False
while len(sys.argv) > 1 and sys.argv[1].startswith('-'):
    flag = sys.argv.pop(1)
    assert flag in ('-i','-m'),'Unrecognized flag: '+flag
    if flag == '-m':
        DISTINGUISH_MD = True
    else:  # '-i'
        INLINE_TOK = True
# PTB tags whose UD mapping is context-free: resolved by direct table lookup.
# Context-sensitive tags (NN/NNS, RB/RBR/RBS/WRB, IN, and the verbal tags
# VB/VBP/VBG/VBN/VBD/VBZ) are deliberately absent and disambiguated in
# unitags() instead.
EASY_MAP = {
    'JJ': 'ADJ', 'JJR': 'ADJ', 'JJS': 'ADJ',
    'RP': 'ADP',
    # Modals become AUX; with the -m flag they get the finer-grained AUX.MD.
    'MD': 'AUX.MD' if DISTINGUISH_MD else 'AUX',
    'CC': 'CONJ',
    'DT': 'DET', 'PDT': 'DET', 'WDT': 'DET',
    'UH': 'INTJ',
    'CD': 'NUM',
    'POS': 'PART', 'TO': 'PART',
    'PRP': 'PRON', 'PRP$': 'PRON', 'WP': 'PRON', 'WP$': 'PRON', 'EX': 'PRON',
    'NNP': 'PROPN', 'NNPS': 'PROPN',
    '``': 'PUNCT', "''": 'PUNCT', '-LRB-': 'PUNCT', '-RRB-': 'PUNCT', ',': 'PUNCT', '.': 'PUNCT', ':': 'PUNCT', 'HYPH': 'PUNCT',
    '#': 'SYM', '$': 'SYM', 'SYM': 'SYM', 'NFP': 'SYM', # see note below about NFP
    'FW': 'X', 'LS': 'X', 'XX': 'X', 'ADD': 'X', 'AFX': 'X', 'GW': 'X'
}
# Lowercased word forms of be/have/do/get that may act as auxiliaries,
# including contracted, reduced, and nonstandard spellings. A verbal token is
# only retagged AUX when it is one of these forms AND has a VP right sister
# (checked in unitags()).
AUX_FORMS = ('am', "'m", 'is', "'s", 'are', "'re", 'was', 'were', 'be', 'been', 'being',
             'm', 'r', 're', 's', 'v', 've', 'd',
             'ai', 'du', 'of', # ai n't, du n no, would of [have]
             'have', "'ve", 'has', 'had', "'d", 'having',
             'do', 'does', 'did', 'done', 'doing',
             'get', 'gets', 'got', 'gotten', 'getting')
"""
Limitations of this script:
(1) The guidelines state:
Some uses of NFP (for lines of hyphens, asterisks or tildes) -> PUNCT
NFP (except for lines of separators, which become PUNCT) -> SYM
However, we ignore the technicality about lines of separators, presuming they will not
occur in the input.
(2) The guidelines prescribe AUX for
verbal tags (VB, VBP, VBG, VBN, VBD, VBZ) when they are forms of be, have, do, and get
when used as an auxiliary (we count passive get as an auxiliary)
We operationalize this by looking for these verb forms with a subsequent VP sister.
This is a fairly robust heuristic, even capturing fronted (topicalized) VPs
thanks to the treebanking convention of including a second VP with a trace:
(SINV (VP-TPC-1 (VBG Sailing)
(PP (IN with)
(NP (DT the) (NNP Roosevelt))))
(VP (VBZ is)
(VP-1 (-NONE- *T*)))
(NP-SBJ (NP (DT the) (NNP Tarawa) (NNP Expeditionary) (NNP Strike) (NNP Force))))))
Depending on what was intended by the guidelines, this rule may not capture everything.
Here is a tree fragment from the English Web Treebank where "have" and "get"
could arguably be considered auxiliaries, but they are followed by an S constituent:
you 'll have * to fight *PRO* to get it resolved *
(NP-SBJ-2 (PRP you))
(VP (MD 'll)
(VP (VB have)
(S (NP-SBJ-2 (-NONE- *))
(VP (TO to)
(VP (VB fight)
(S-PRP (NP-SBJ-2 (-NONE- *PRO*))
(VP (TO to)
(VP (VB get)
(S (NP-SBJ-3 (PRP it))
(VP (VBN resolved)
(NP-3 (-NONE- *))))))))))))
"""
def unitags(tree):
    '''
    Map each terminal of one bracketed PTB-style parse `tree` (a single line
    in which every token and close paren is separated by single spaces, with
    a trailing space) to its Universal Dependencies POS tag.
    Returns a list of (word, penn_tag, universal_tag) triples, skipping
    -NONE- empty elements.
    '''
    # Every "(TAG word)" terminal with its character span (start, end) in
    # `tree`; the spans drive the context-sensitive rules below.
    penntags = [(m.group(2), m.group(1), (m.start(0), m.end(0))) \
        for m in re.finditer(r'\(([^\s\(\)]+) ([^\s\(\)]+)\)', tree) if m.group(1)!='-NONE-']
    result = []
    for (w, t, (i,j)) in penntags:
        u = EASY_MAP.get(t)  # context-free cases first
        if not u:
            if t in ('NN','NNS'): # NOUN : all cases of PTB NN and NNS, except for %, which we retag as SYM.
                if w=='%': u = 'SYM'
                else: u = 'NOUN'
            elif t in ('RB','RBR','RBS','WRB'): # ADV : all uses of PTB tags RB, RBR, RBS, and WRB except the clausal negation not and reduced forms of it, which become PART.
                if w.lower() in ('not', "n't"): u = 'PART'
                else: u = 'ADV'
            elif t=='IN': # SCONJ if complementizer or subordinating conjunction, ADP o.w.
                # Diagnostic for SCONJ: look for an S* constituent starting immediately
                # after this word. (May not be perfect, but should be good enough)
                if tree[j+1:].strip().startswith('(S'): u = 'SCONJ'
                else: u = 'ADP'
            else:
                assert t in ('VB','VBP','VBG','VBN','VBD','VBZ'),(w,t)
                # Verbal tag: AUX if this is a be/have/do/get form with a VP
                # right sister (see the "limitations" notes above); VERB o.w.
                # read the sentence in as a stack (ignoring terminals)
                s = tree                    # unconsumed remainder of the tree string
                k = 0                       # characters consumed so far (each token + its trailing space)
                target_stack_depth = None   # stack depth at which the target word's terminal sits
                right_sister_vp = False     # saw a VP sister to the right of the target
                right_sister_s = False      # saw an IMMEDIATE S right sister (recorded but currently unused)
                stack = []
                while s:
                    first, s = s.split(' ', 1)
                    k += len(first)+1
                    if first.startswith('('): # push
                        stack.append(first)
                        if target_stack_depth is not None and len(stack)==target_stack_depth:
                            # pushed the target word or one of its right sisters
                            if first.startswith('(VP'):
                                right_sister_vp = True
                            elif first.startswith('(S') and k-len(first)-1==j+1:
                                # this open paren starts at offset j+1, i.e. directly after the target terminal
                                right_sister_s = True # IMMEDIATE right sister: e.g., (VBD had) (S ... (VP (TO to) ...
                    else: #terminal or close paren. pop
                        stack = stack[:-1]
                        if target_stack_depth is not None and len(stack)<target_stack_depth-1:
                            # we've popped the parent of the target constit
                            break
                    if i==k:
                        # next unconsumed char is the target terminal's open paren
                        target_stack_depth = len(stack)+1
                        # our target word is ready to be pushed
                assert target_stack_depth is not None
                u = 'VERB'
                if w.lower() in AUX_FORMS:
                    if right_sister_vp:
                        u = 'AUX'
                    #elif right_sister_s:
                    #    u += '~S'
                #u += '^' + str(target_stack_depth)
        result.append((w,t,u))
    return result
# Driver: read "sentid<TAB>tree" lines, convert, print one tagged line per
# sentence, and dump (PTB tag, UD tag) pair counts to stderr at the end.
c = Counter()  # counts of (penn_tag, universal_tag) pairs, for the stderr summary
for ln in fileinput.input():
    if not ln.strip(): continue  # skip blank lines
    # maxsplit=1 tolerates stray tabs inside the tree portion itself.
    sentid, tree = ln.split('\t', 1)
    # Space out the close parens so every token and bracket is a separate
    # space-delimited unit, then collapse all whitespace runs to single
    # spaces: unitags() splits on single spaces and needs a trailing space.
    # (The original chained a second .replace that, as transmitted, was a
    # no-op double-space fixup; join/split normalizes robustly.)
    tree = ' '.join(tree.replace(')', ') ').split()) + ' '
    uu = unitags(tree)
    for w,t,u in uu:
        c[t,u] += 1
    print(sentid, ' '.join((w+'|'+u if INLINE_TOK else u) for w,t,u in uu), sep='\t')
print(c, file=sys.stderr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment