Script used to load Arabic supersense lexicons (from Arabic WordNet and OntoNotes) and list the possible matches for each token of an input text. One of the imports depends on code in https://github.com/nschneid/pyutil.
#coding=UTF-8
'''
to run the code:
METHOD 1: .stem_pos files
$ export PYTHONPATH=/path/to/AQMAR
$ python2.7 supersenseDefaults.py [mode] ar.stem_pos > ar.lexiconsst
METHOD 2: parallel .tok and .wd_pos_ne.txt files
$ export PYTHONPATH=/path/to/AQMAR
$ python2.7 supersenseDefaults.py [mode] ../tokanFiles/dev/*.tok > ../tokanFiles/dev/dev.lexiconsst
where 'mode' is a comma-separated list of the parts of speech to be considered,
drawn from 'noun', 'verb', 'adj', and 'adv'. The default is "noun,verb".
Compute coverage statistics for Arabic Wikipedia articles given named entity annotations and Arabic WordNet.
Output for each nominal token a supersense selected from AWN and the class corresponding to the NE annotation.
The ultimate goal is to provide defaults for our annotators where possible.
TODO: OntoNotes entities with digits
@author: Nathan Schneider (nschneid)
@since: 2011-10-16
'''
from __future__ import print_function, division
import sys, codecs, fileinput, re, json, os
from glob import glob
from collections import Counter, defaultdict
from itertools import izip_longest
from edu.cmu.cs.lti.ark.pyutil.corpus.patb.patb import romanize
def czip(*iterables):
'''
Checked version of izip() that requires all arguments be of the same length.
>>> list(czip('ABC','xyz'))
[('A', 'x'), ('B', 'y'), ('C', 'z')]
>>> list(czip('ABC','xy'))
Traceback (most recent call last):
...
ValueError: czip() arguments have unequal length
>>> list(czip('AB','xyz'))
Traceback (most recent call last):
...
ValueError: czip() arguments have unequal length
>>> list(czip('AB',[],'xy'))
Traceback (most recent call last):
...
ValueError: czip() arguments have unequal length
'''
SHORT=object()
for y in izip_longest(*iterables, fillvalue=SHORT):
if SHORT in y:
raise ValueError('czip() arguments have unequal length')
else:
yield y
def normalize(d, by=None):
'''
Returns a copy of 'd', but with all values divided by the provided value.
If that value is None, their sum is used.
>>> c = normalize(Counter({9: 3, 's': 1, 8: 6}))
>>> c==Counter({9: 0.3, 's': 0.1, 8: 0.6})
True
>>> c = normalize(Counter({9: 3, 's': 12, 8: 6}), 3)
>>> c==Counter({9: 1, 's': 4, 8: 2})
True
'''
if by is None:
by = sum(d.values())
    # build a scaled copy directly, preserving the input type (e.g. Counter)
    return type(d)({k: v/by for k, v in d.items()})
def prop(x, y):
return '{}/{} = {:.2%}'.format(x,y,x/y)
# Map from OntoNotes entity classes to supersenses
# (some of these mappings are imperfect, but they're good enough for defaults).
ONTONOTES_ENTITY_S = '''
CARDINAL noun.QUANTITY
DATE noun.TIME
EVENT noun.EVENT
FAC noun.LOCATION
GPE noun.LOCATION
LANGUAGE noun.COMMUNICATION
LAW noun.COMMUNICATION
LOC noun.LOCATION
MONEY noun.QUANTITY
NORP noun.GROUP
ORDINAL noun.QUANTITY
ORG noun.GROUP
PERCENT noun.QUANTITY
PERSON noun.PERSON
PRODUCT noun.ARTIFACT
QUANTITY noun.QUANTITY
TIME noun.TIME
WORK_OF_ART noun.COMMUNICATION
'''.strip()
ONTONOTES_ENTITY_MAP = dict(ln.strip().split() for ln in ONTONOTES_ENTITY_S.split('\n'))
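# For reference, example lookups into the map just built:
#   ONTONOTES_ENTITY_MAP['GPE']         -> 'noun.LOCATION'
#   ONTONOTES_ENTITY_MAP['WORK_OF_ART'] -> 'noun.COMMUNICATION'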
MODE = {'noun', 'verb'} # default
if len(sys.argv) > 1 and set(sys.argv[1].split(',')) <= {'noun', 'verb', 'adj', 'adv'}:
    MODE = set(sys.argv[1].split(','))
    del sys.argv[1]
# load lexicon
arsenses = defaultdict(lambda: defaultdict(set))
nMultiwordAWNLexemes = 0
with codecs.open('awnLexicon4SS.txt', 'r', 'utf-8') as lexF:
for ln in lexF:
        info = ln[:-1].split()
        if len(info) < 7:  # entry is missing a supersense tag
            print('No SST found for lexical entry:', info[-1], file=sys.stderr)
            continue
        wU, awnSynset, wnSynset, sst = info[:4]
assert sst.startswith('noun.') or sst.startswith('verb.') or sst.startswith('adj.') or sst.startswith('adv.'),'Invalid SST: {}'.format(sst)
if sst.startswith('adj.') or sst.startswith('adv.'):
continue
if re.match(r'.+_([vnar])\d+AR$', awnSynset).group(1) not in ''.join(m[0] for m in MODE):
continue
wR = romanize(wU)
if wR.count('_')>0: nMultiwordAWNLexemes += 1
arsenses[wR][sst].add(awnSynset)
print('Multiword lexemes from AWN:',prop(nMultiwordAWNLexemes, len(arsenses)), file=sys.stderr)
if 'noun' in MODE:
onEntities = defaultdict(Counter)
nMultiwordONLexemes = 0
with codecs.open('ontonotes-4.0-entities.txt', 'r', 'utf-8') as neF:
for ln in neF:
wU, onNE = ln[:-1].replace(' ','_').split('\t')
if re.search(r'\d', wU) is not None: continue # for now, ignore entities with digits
wR = romanize(wU)
sst = ONTONOTES_ENTITY_MAP[onNE]
onEntities[wR][sst] += 1
if '_' in wR:
                onEntities[wR.split('_')[0]][sst] += 1 # for multiword NEs, also store the SST under the first word of the entity, in case that word occurs in other, similar names
for wR in onEntities:
if '_' in wR: nMultiwordONLexemes += 1
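        # Each OntoNotes-derived sense is recorded under a pseudo-synset ID of the form
        # '**ON**_n<rank>AR' (see below), where <rank> is the 1-based frequency rank of the
        # supersense for this word; because that ID matches the AWN synset-ID pattern,
        # rank() and voting() treat OntoNotes evidence just like AWN synsets.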
for r,(sst,c) in enumerate(onEntities[wR].most_common()):
arsenses[wR][sst].add('**ON**_n{}AR'.format(r+1))
print('Multiword lexemes from OntoNotes:',prop(nMultiwordONLexemes, len(onEntities)), file=sys.stderr)
suffixes = defaultdict(set) # word mapped to set of full lexemes starting with that word
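# e.g. (hypothetical romanized forms) if arsenses had entries 'wlAyp' and 'wlAyp_mtHdp',
# the loop below yields suffixes['wlAyp'] == {'wlAyp', 'wlAyp_mtHdp'}, so a one-word prefix
# can later be expanded to the longest matching multiword lexeme during lookup.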
for entry in arsenses:
words = entry.split('_')
suffixes[words[0]].add(entry)
def rank(synsetId):
return int(re.match(r'.+_[vnar](\d+)AR$', synsetId).group(1))
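# e.g. rank('kitAb_n2AR') == 2 for an AWN-style synset ID (hypothetical ID shown);
# for the OntoNotes pseudo-IDs added above ('**ON**_n1AR', ...), the rank reflects
# corpus frequency rather than AWN sense ordering.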
def voting(w):
# voting procedure for ambiguous entries:
# known supersenses are scored based on the number and rank of supporting synsets
# normalized by the total number of synsets corresponding to this word
result = Counter({sst: (len(syns)/min(rank(syn) for syn in syns)) for sst,syns in arsenses[w].items()})
return normalize(result)
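# Worked example (hypothetical word and synsets): if arsenses[w] were
#   {'noun.COMMUNICATION': {'ktAb_n1AR', 'ktAb_n3AR'}, 'noun.ARTIFACT': {'ktAb_n2AR'}},
# noun.COMMUNICATION scores 2/1 = 2.0 and noun.ARTIFACT scores 1/2 = 0.5,
# which normalize() rescales to 0.8 and 0.2 respectively.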
# load Wikipedia article tokens
ww = [] # vocalized, romanized stem from MADA (wa+ words discarded) or None if a Latin word, as well as newlines separating sentences
poses = []
tkns = []
nes = []
if len(sys.argv) > 1 and sys.argv[1].endswith('.stem_pos'): # METHOD 1
for ln in fileinput.input():
ln = ln.decode('utf-8')[:-1]
if not ln.strip():
ww.append('\n')
poses.append('\n')
tkns.append('\n')
nes.append('\n')
else:
stem, pos = ln.split('\t')
ww.append(None if stem.startswith('@@LAT@@') else stem)
poses.append(pos)
tkns.append(None)
nes.append(None)
else: # METHOD 2
skipping = False
for ln in fileinput.input(None if len(sys.argv)>1 else glob('4Articles-MADA/*.mada.tok')):
# 'ln' contains a romanized sentence, with three versions of each token (the third of which is the vocalized lemma/stem): e.g. Al<ydrwjyn·AlAydrwjyn·<iydruwjiyn
if fileinput.isfirstline():
skipping = False
postaggedFP = '../'+os.path.basename(fileinput.filename()).replace('Sent.','.').replace('.cleaned.txt.sent.bw.mada.tok','.wd_pos_ne.txt').replace('.txt.bw.mada.tok','.wd_pos_ne.txt')
if not os.path.exists(postaggedFP):
print('File not found: ', postaggedFP, file=sys.stderr)
skipping = True
continue
postaggedF = codecs.open(postaggedFP, 'r', 'utf-8')
elif skipping:
continue # corresponding file not found
if ln=='\n': continue
posLn = next(postaggedF)
# 'posLn' contains word, POS, and NE annotation for the sentence
posLn = posLn.replace(u'\u00a0','') # remove nonbreaking space sometimes inserted by POS tagger
posTkns = iter(posLn.strip().split())
ln = ln.decode('utf-8')
for tkn in ln.strip().split():
if tkn.endswith('+'): # e.g., conjunction (separated in MADA/TOKAN output only)
continue
tkns.append(tkn)
posTkn = next(posTkns)
w0, pos, ne = posTkn.split('___')
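            # Collapse the NE field to a single character: 'O' (outside any mention) becomes '';
            # otherwise the 'B-' prefix is stripped and the first letter of the class is kept,
            # with an initial 'O' (as in ORG) remapped to 'G' so it cannot be confused with the
            # outside tag, and 'I' (inside a mention) remapped to '<' as a continuation marker.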
ne = '' if ne=='O' else ne.replace('B-','')[0].replace('O','G').replace('I','<')
poses.append(pos)
nes.append(ne)
if tkn.startswith('@@LAT@@'):
ww.append(None)
continue # ignore Latin characters in input
assert tkn.count(u'·')==2,tkn.encode('utf-8')
x, y, w = tkn.split(u'·')
ww.append(w) # lemma/stem from MADA
ww.append('\n')
poses.append('\n')
tkns.append('\n')
nes.append('\n')
# process Wikipedia articles
nMultiwordInstances = 0
nAmbigTkns = 0
nOOV = 0
nCoveredNE = 0
nCovered = 0
n = 0
waysAmbiguous = Counter()
#awnSSTs = []
#sstScores = {} # index of first word matching a lexicon entry -> Counter with scores for each SST
#minRankSSTs = {}
LEGAL_POSES = set()
if 'noun' in MODE:
LEGAL_POSES.update({'noun', 'pron', 'abbrev'})
if 'verb' in MODE:
LEGAL_POSES.update({'verb'})
def sstLookup():
global n, nCovered, nCoveredNE, nMultiwordInstances, nAmbigTkns, nOOV, waysAmbiguous
i = 0
    while i<len(ww): # yield one supersense decision per token, preferring longer lexemes over shorter ones
#assert len(awnSSTs)==i,(i,len(awnSSTs))
w = ww[i]
#print(w, end=' ')
if w=='\n':
yield '\n',None,None
i += 1
continue
if w is None: # latin
yield '-',None,None
i += 1
continue
pos = poses[i]
ne = nes[i]
relevant = False
for relevantPOS in LEGAL_POSES:
if relevantPOS in pos:
relevant = True
break
if not relevant:
yield '-',None,None
i += 1
continue
n += 1
if w not in suffixes:
nOOV += 1
waysAmbiguous[0] += 1
nCovered += int(ne!='')
nCoveredNE += int(ne!='')
yield '_',None,None
i += 1
continue
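        # Longest-match-first lookup: candidate lexicon entries beginning with this word are
        # tried from most to fewest underscore-joined words, e.g. (hypothetical entries) with
        # both 'wlAyp' and 'wlAyp_mtHdp' in the lexicon and 'mtHdp' as the next token, the
        # two-word entry wins and the cursor advances past all k matched tokens at once.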
matched = False
for entry in sorted(suffixes[w], key=lambda ent: ent.count('_'), reverse=True):
entryWords = entry.split('_')
k = len(entryWords)
#print(entryWords,ww[i:i+len(entryWords)])
if entryWords[1:]==ww[i+1:i+k]: # match!
sstOpts = voting(entry) # score the possible tags
# nschneid 2012-04-30: the following line was looking up arsenses[w] and arsenses[w][sst], which I think was a bug
minRankSST = min(((sst,rank(syn)) for sst in arsenses[entry] for syn in arsenses[entry][sst]), key=lambda x:x[1])
sstChoice = '_'
waysAmbiguous[len(sstOpts)] += 1
if len(sstOpts)>1:
nAmbigTkns += 1
(a,ascore),(b,bscore) = sstOpts.most_common(2)
if ascore>bscore: # otherwise (a tie), punt
sstChoice = a
else:
sstChoice = sstOpts.most_common(1)[0][0]
if k>1: nMultiwordInstances += 1
for x in [sstChoice]+list('<'*(k-1)):
yield x,sstOpts,minRankSST
if sstChoice!='_':
nCovered += len(entryWords) # (though some words might not be nouns or verbs)
else:
nCovered += sum(1 for ne in nes[i:i+k] if ne!='')
nCoveredNE += sum(1 for ne in nes[i:i+k] if ne!='')
i += k
n += k-1
matched = True
break
        if not matched: # some multiword entries exist such that the first word is not a separate entry
yield '_',None,None
i += 1
iSent = 0
for i,(w,(sst,sstScores,minRankSST),ne) in enumerate(czip(ww,sstLookup(),nes)):
if w=='\n':
print()
if iSent%10000==0:
print('sentence',iSent, file=sys.stderr)
iSent += 1
else:
#print(sst+(ne and ','+ne), end=' ') # heuristically-selected SST and gold NE
mentionPosition = 'I' if sst=='<' else ('O' if sstScores is None else 'B')
print(json.dumps({k.lower():v for k,v in sstScores.items()})+'\t'+minRankSST[0].lower()+'\t'+mentionPosition if sstScores is not None else sst) # normalized SST scores for the first token of in-vocabulary items; -, _, or < for other tokens
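        # A token covered by a lexicon match thus prints something like (hypothetical values):
        #   {"noun.communication": 0.8, "noun.artifact": 0.2}<TAB>noun.communication<TAB>B
        # ('B' for the first token of the match, 'I' for its continuation tokens), while
        # uncovered tokens print just '-' or '_'.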
# Summary information
print('Multiword:', nMultiwordInstances, file=sys.stderr)
print('OOV:',prop(nOOV,n), file=sys.stderr)
print('Ways ambiguous (fraction of tokens):',normalize(waysAmbiguous,n), file=sys.stderr)
print('{}s covered by NE annotations:'.format('/'.join(sorted(MODE))),prop(nCoveredNE,n), file=sys.stderr)
print('{}s covered by NE annotations and/or AWN/OntoNotes NEs:'.format('/'.join(sorted(MODE))),prop(nCovered,n), file=sys.stderr)
'''Summary output:
(original)
10510
OOV: 3207/5368 = 59.74%
Ways ambiguous (fraction of tokens): Counter({0: 0.5974292101341282, 1: 0.19411326378539492, 2: 0.0868107302533532, 3: 0.05048435171385991, 4: 0.042846497764530554, 5: 0.010432190760059613, 6: 0.009314456035767511, 7: 0.005216095380029807, 9: 0.0027943368107302535, 8: 0.0005588673621460507})
nouns covered by NE annotations: 1178/5368 = 21.94%
nouns covered by NE annotations and/or AWN: 2307/5368 = 42.98%
3216
OOV: 1011/1106 = 91.41%
Ways ambiguous (fraction of tokens): Counter({0: 0.9141048824593129, 1: 0.045207956600361664, 2: 0.024412296564195298, 3: 0.007233273056057866, 4: 0.0054249547920434, 7: 0.003616636528028933})
verbs covered by NE annotations: 82/1106 = 7.41%
verbs covered by NE annotations and/or AWN: 144/1106 = 13.02%
(with MW entries)
Multiword: 16
OOV: 3062/5370 = 57.02%
Ways ambiguous (fraction of tokens): Counter({0: 0.5702048417132216, 1: 0.19646182495344505, 2: 0.08696461824953446, 3: 0.049720670391061456, 4: 0.03910614525139665, 5: 0.010428305400372439, 6: 0.00931098696461825, 7: 0.0052141527001862194, 9: 0.002793296089385475, 8: 0.00018621973929236498})
nouns covered by NE annotations: 1171/5370 = 21.81%
nouns covered by NE annotations and/or AWN: 2879/5370 = 53.61%
(with OntoNotes entities)
Multiword lexemes from AWN: 3681/10510 = 35.02%
Multiword lexemes from OntoNotes: 6890/12604 = 54.67%
OOV: 2610/5370 = 48.60%
Ways ambiguous (fraction of tokens): Counter({0: 0.4860335195530726, 1: 0.2595903165735568, 2: 0.09757914338919925, 3: 0.05176908752327747, 4: 0.03910614525139665, 5: 0.013221601489757914, 6: 0.00931098696461825, 7: 0.0052141527001862194, 9: 0.002793296089385475, 8: 0.00018621973929236498})
nouns covered by NE annotations: 1172/5370 = 21.82%
nouns covered by NE annotations and/or AWN/OntoNotes NEs: 2983/5370 = 55.55%
'''