Skip to content

Instantly share code, notes, and snippets.

@nschneid
Last active June 15, 2017 22:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save nschneid/beed0bcda5b42e530011 to your computer and use it in GitHub Desktop.
Save nschneid/beed0bcda5b42e530011 to your computer and use it in GitHub Desktop.
Given a new-style Penn Treebank English tree, produce the part-of-speech tags according to the Universal Dependencies project.
#!/usr/bin/env python2.7
'''
Converts new-style PTB POS tags to the English tagset from the Universal Dependencies project
(see universal-pos-en.html, from http://universaldependencies.github.io/docs/en/pos/all.html).
There are 17 such tags, expanded from the original 12 Universal POS tags of Petrov et al. 2011.
See "limitations" comment below for some details on our interpretation of the difficult-to-map
categories.
In new-style PTB, TO only applies to prepositional (not infinitival) "to".
Options: -i to include the token inline; -m to enable unofficial tag refinement: AUX.MD for modal auxiliaries
Input: Parse trees, one per line, prefixed with a tab-separated sentence ID.
@author: Nathan Schneider (nschneid@inf.ed.ac.uk)
@since: 2015-08-23
'''
from __future__ import print_function
import sys, fileinput, re
from collections import Counter
# Command-line flag handling: -i prints each token inline with its tag,
# -m refines modal auxiliaries to AUX.MD. Consumed flags are removed from
# sys.argv so that fileinput later sees only input file names.
INLINE_TOK = DISTINGUISH_MD = False
while len(sys.argv) > 1 and sys.argv[1].startswith('-'):
    flag = sys.argv.pop(1)
    assert flag in ('-i','-m'),'Unrecognized flag: '+flag
    if flag == '-m':
        DISTINGUISH_MD = True
    else:  # '-i'
        INLINE_TOK = True
# PTB tags whose UD mapping is context-free: resolved by direct table lookup.
# Context-sensitive tags (NN/NNS, RB/RBR/RBS/WRB, IN, and the verbal tags
# VB/VBP/VBG/VBN/VBD/VBZ) are deliberately absent and disambiguated in
# unitags() instead.
EASY_MAP = {
    'JJ': 'ADJ', 'JJR': 'ADJ', 'JJS': 'ADJ',
    'RP': 'ADP',
    # Modals become AUX; with the -m flag they get the finer-grained AUX.MD.
    'MD': 'AUX.MD' if DISTINGUISH_MD else 'AUX',
    'CC': 'CONJ',
    'DT': 'DET', 'PDT': 'DET', 'WDT': 'DET',
    'UH': 'INTJ',
    'CD': 'NUM',
    'POS': 'PART', 'TO': 'PART',
    'PRP': 'PRON', 'PRP$': 'PRON', 'WP': 'PRON', 'WP$': 'PRON', 'EX': 'PRON',
    'NNP': 'PROPN', 'NNPS': 'PROPN',
    '``': 'PUNCT', "''": 'PUNCT', '-LRB-': 'PUNCT', '-RRB-': 'PUNCT', ',': 'PUNCT', '.': 'PUNCT', ':': 'PUNCT', 'HYPH': 'PUNCT',
    '#': 'SYM', '$': 'SYM', 'SYM': 'SYM', 'NFP': 'SYM', # see note below about NFP
    'FW': 'X', 'LS': 'X', 'XX': 'X', 'ADD': 'X', 'AFX': 'X', 'GW': 'X'
}
# Lowercased word forms of be/have/do/get that may act as auxiliaries,
# including contracted, reduced, and nonstandard spellings. A verbal token is
# only retagged AUX when it is one of these forms AND has a VP right sister
# (checked in unitags()).
AUX_FORMS = ('am', "'m", 'is', "'s", 'are', "'re", 'was', 'were', 'be', 'been', 'being',
             'm', 'r', 're', 's', 'v', 've', 'd',
             'ai', 'du', 'of', # ai n't, du n no, would of [have]
             'have', "'ve", 'has', 'had', "'d", 'having',
             'do', 'does', 'did', 'done', 'doing',
             'get', 'gets', 'got', 'gotten', 'getting')
"""
Limitations of this script:
(1) The guidelines state:
Some uses of NFP (for lines of hyphens, asterisks or tildes) -> PUNCT
NFP (except for lines of separators, which become PUNCT) -> SYM
However, we ignore the technicality about lines of separators, presuming they will not
occur in the input.
(2) The guidelines prescribe AUX for
verbal tags (VB, VBP, VBG, VBN, VBD, VBZ) when they are forms of be, have, do, and get
when used as an auxiliary (we count passive get as an auxiliary)
We operationalize this by looking for these verb forms with a subsequent VP sister.
This is a fairly robust heuristic, even capturing fronted (topicalized) VPs
thanks to the treebanking convention of including a second VP with a trace:
(SINV (VP-TPC-1 (VBG Sailing)
(PP (IN with)
(NP (DT the) (NNP Roosevelt))))
(VP (VBZ is)
(VP-1 (-NONE- *T*)))
(NP-SBJ (NP (DT the) (NNP Tarawa) (NNP Expeditionary) (NNP Strike) (NNP Force))))))
Depending on what was intended by the guidelines, this rule may not capture everything.
Here is a tree fragment from the English Web Treebank where "have" and "get"
could arguably be considered auxiliaries, but they are followed by an S constituent:
you 'll have * to fight *PRO* to get it resolved *
(NP-SBJ-2 (PRP you))
(VP (MD 'll)
(VP (VB have)
(S (NP-SBJ-2 (-NONE- *))
(VP (TO to)
(VP (VB fight)
(S-PRP (NP-SBJ-2 (-NONE- *PRO*))
(VP (TO to)
(VP (VB get)
(S (NP-SBJ-3 (PRP it))
(VP (VBN resolved)
(NP-3 (-NONE- *))))))))))))
"""
def unitags(tree):
    '''
    Map each terminal of one bracketed PTB-style parse `tree` (a single line
    in which every token and close paren is separated by single spaces, with
    a trailing space) to its Universal Dependencies POS tag.
    Returns a list of (word, penn_tag, universal_tag) triples, skipping
    -NONE- empty elements.
    '''
    # Every "(TAG word)" terminal with its character span (start, end) in
    # `tree`; the spans drive the context-sensitive rules below.
    penntags = [(m.group(2), m.group(1), (m.start(0), m.end(0))) \
        for m in re.finditer(r'\(([^\s\(\)]+) ([^\s\(\)]+)\)', tree) if m.group(1)!='-NONE-']
    result = []
    for (w, t, (i,j)) in penntags:
        u = EASY_MAP.get(t)  # context-free cases first
        if not u:
            if t in ('NN','NNS'): # NOUN : all cases of PTB NN and NNS, except for %, which we retag as SYM.
                if w=='%': u = 'SYM'
                else: u = 'NOUN'
            elif t in ('RB','RBR','RBS','WRB'): # ADV : all uses of PTB tags RB, RBR, RBS, and WRB except the clausal negation not and reduced forms of it, which become PART.
                if w.lower() in ('not', "n't"): u = 'PART'
                else: u = 'ADV'
            elif t=='IN': # SCONJ if complementizer or subordinating conjunction, ADP o.w.
                # Diagnostic for SCONJ: look for an S* constituent starting immediately
                # after this word. (May not be perfect, but should be good enough)
                if tree[j+1:].strip().startswith('(S'): u = 'SCONJ'
                else: u = 'ADP'
            else:
                assert t in ('VB','VBP','VBG','VBN','VBD','VBZ'),(w,t)
                # Verbal tag: AUX if this is a be/have/do/get form with a VP
                # right sister (see the "limitations" notes above); VERB o.w.
                # read the sentence in as a stack (ignoring terminals)
                s = tree                    # unconsumed remainder of the tree string
                k = 0                       # characters consumed so far (each token + its trailing space)
                target_stack_depth = None   # stack depth at which the target word's terminal sits
                right_sister_vp = False     # saw a VP sister to the right of the target
                right_sister_s = False      # saw an IMMEDIATE S right sister (recorded but currently unused)
                stack = []
                while s:
                    first, s = s.split(' ', 1)
                    k += len(first)+1
                    if first.startswith('('): # push
                        stack.append(first)
                        if target_stack_depth is not None and len(stack)==target_stack_depth:
                            # pushed the target word or one of its right sisters
                            if first.startswith('(VP'):
                                right_sister_vp = True
                            elif first.startswith('(S') and k-len(first)-1==j+1:
                                # this open paren starts at offset j+1, i.e. directly after the target terminal
                                right_sister_s = True # IMMEDIATE right sister: e.g., (VBD had) (S ... (VP (TO to) ...
                    else: #terminal or close paren. pop
                        stack = stack[:-1]
                        if target_stack_depth is not None and len(stack)<target_stack_depth-1:
                            # we've popped the parent of the target constit
                            break
                    if i==k:
                        # next unconsumed char is the target terminal's open paren
                        target_stack_depth = len(stack)+1
                        # our target word is ready to be pushed
                assert target_stack_depth is not None
                u = 'VERB'
                if w.lower() in AUX_FORMS:
                    if right_sister_vp:
                        u = 'AUX'
                    #elif right_sister_s:
                    #    u += '~S'
                #u += '^' + str(target_stack_depth)
        result.append((w,t,u))
    return result
# Driver: read "sentid<TAB>tree" lines, convert, print one tagged line per
# sentence, and dump (PTB tag, UD tag) pair counts to stderr at the end.
c = Counter()  # counts of (penn_tag, universal_tag) pairs, for the stderr summary
for ln in fileinput.input():
    if not ln.strip(): continue  # skip blank lines
    # maxsplit=1 tolerates stray tabs inside the tree portion itself.
    sentid, tree = ln.split('\t', 1)
    # Space out the close parens so every token and bracket is a separate
    # space-delimited unit, then collapse all whitespace runs to single
    # spaces: unitags() splits on single spaces and needs a trailing space.
    # (The original chained a second .replace that, as transmitted, was a
    # no-op double-space fixup; join/split normalizes robustly.)
    tree = ' '.join(tree.replace(')', ') ').split()) + ' '
    uu = unitags(tree)
    for w,t,u in uu:
        c[t,u] += 1
    print(sentid, ' '.join((w+'|'+u if INLINE_TOK else u) for w,t,u in uu), sep='\t')
print(c, file=sys.stderr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment