Script used to load Arabic supersense lexicons (from Arabic WordNet and OntoNotes) and list the possible matches for each token of an input text. One of the imports depends on code in https://github.com/nschneid/pyutil.
#coding=UTF-8
'''
to run the code:
METHOD 1: .stem_pos files
$ export PYTHONPATH=/path/to/AQMAR
$ python2.7 supersenseDefaults.py [mode] ar.stem_pos > ar.lexiconsst
METHOD 2: parallel .tok and .wd_pos_ne.txt files
$ export PYTHONPATH=/path/to/AQMAR
$ python2.7 supersenseDefaults.py [mode] ../tokanFiles/dev/*.tok > ../tokanFiles/dev/dev.lexiconsst
where 'mode' is a comma-separated list of the parts of speech to be considered,
drawn from 'noun', 'verb', 'adj', and 'adv'. The default is "noun,verb".
Compute coverage statistics for Arabic Wikipedia articles given named entity annotations and Arabic WordNet.
Output for each nominal token a supersense selected from AWN and the class corresponding to the NE annotation.
The ultimate goal is to provide defaults for our annotators where possible.
TODO: OntoNotes entities with digits
@author: Nathan Schneider (nschneid)
@since: 2011-10-16
'''
from __future__ import print_function, division
import sys, codecs, fileinput, re, json, os
from glob import glob
from collections import Counter, defaultdict
from itertools import izip_longest
from edu.cmu.cs.lti.ark.pyutil.corpus.patb.patb import romanize
def czip(*iterables):
'''
Checked version of izip() that requires all arguments be of the same length.
>>> list(czip('ABC','xyz'))
[('A', 'x'), ('B', 'y'), ('C', 'z')]
>>> list(czip('ABC','xy'))
Traceback (most recent call last):
...
ValueError: czip() arguments have unequal length
>>> list(czip('AB','xyz'))
Traceback (most recent call last):
...
ValueError: czip() arguments have unequal length
>>> list(czip('AB',[],'xy'))
Traceback (most recent call last):
...
ValueError: czip() arguments have unequal length
'''
SHORT=object()
for y in izip_longest(*iterables, fillvalue=SHORT):
if SHORT in y:
raise ValueError('czip() arguments have unequal length')
else:
yield y
def normalize(d, by=None):
'''
Returns a copy of 'd', but with all values divided by the provided value.
If that value is None, their sum is used.
>>> c = normalize(Counter({9: 3, 's': 1, 8: 6}))
>>> c==Counter({9: 0.3, 's': 0.1, 8: 0.6})
True
>>> c = normalize(Counter({9: 3, 's': 12, 8: 6}), 3)
>>> c==Counter({9: 1, 's': 4, 8: 2})
True
'''
if by is None:
by = sum(d.values())
    # build a scaled copy directly, preserving the input type (e.g. Counter)
    return type(d)({k: v/by for k, v in d.items()})
def prop(x, y):
return '{}/{} = {:.2%}'.format(x,y,x/y)
# Map from OntoNotes entity classes to supersenses
# (some of these mappings are imperfect, but they're good enough for defaults).
ONTONOTES_ENTITY_S = '''
CARDINAL noun.QUANTITY
DATE noun.TIME
EVENT noun.EVENT
FAC noun.LOCATION
GPE noun.LOCATION
LANGUAGE noun.COMMUNICATION
LAW noun.COMMUNICATION
LOC noun.LOCATION
MONEY noun.QUANTITY
NORP noun.GROUP
ORDINAL noun.QUANTITY
ORG noun.GROUP
PERCENT noun.QUANTITY
PERSON noun.PERSON
PRODUCT noun.ARTIFACT
QUANTITY noun.QUANTITY
TIME noun.TIME
WORK_OF_ART noun.COMMUNICATION
'''.strip()
ONTONOTES_ENTITY_MAP = dict(ln.strip().split() for ln in ONTONOTES_ENTITY_S.split('\n'))
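# For reference, example lookups into the map just built:
#   ONTONOTES_ENTITY_MAP['GPE']         -> 'noun.LOCATION'
#   ONTONOTES_ENTITY_MAP['WORK_OF_ART'] -> 'noun.COMMUNICATION'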
MODE = {'noun', 'verb'} # default
if len(sys.argv) > 1 and set(sys.argv[1].split(',')) <= {'noun', 'verb', 'adj', 'adv'}:
    MODE = set(sys.argv[1].split(','))
    del sys.argv[1]
# load lexicon
arsenses = defaultdict(lambda: defaultdict(set))
nMultiwordAWNLexemes = 0
with codecs.open('awnLexicon4SS.txt', 'r', 'utf-8') as lexF:
for ln in lexF:
        info = ln[:-1].split()
        if len(info) < 7:  # entry is missing a supersense tag
            print('No SST found for lexical entry:', info[-1], file=sys.stderr)
            continue
        wU, awnSynset, wnSynset, sst = info[:4]
assert sst.startswith('noun.') or sst.startswith('verb.') or sst.startswith('adj.') or sst.startswith('adv.'),'Invalid SST: {}'.format(sst)
if sst.startswith('adj.') or sst.startswith('adv.'):
continue
if re.match(r'.+_([vnar])\d+AR$', awnSynset).group(1) not in ''.join(m[0] for m in MODE):
continue
wR = romanize(wU)
if wR.count('_')>0: nMultiwordAWNLexemes += 1
arsenses[wR][sst].add(awnSynset)
print('Multiword lexemes from AWN:',prop(nMultiwordAWNLexemes, len(arsenses)), file=sys.stderr)
if 'noun' in MODE:
onEntities = defaultdict(Counter)
nMultiwordONLexemes = 0
with codecs.open('ontonotes-4.0-entities.txt', 'r', 'utf-8') as neF:
for ln in neF:
wU, onNE = ln[:-1].replace(' ','_').split('\t')
if re.search(r'\d', wU) is not None: continue # for now, ignore entities with digits
wR = romanize(wU)
sst = ONTONOTES_ENTITY_MAP[onNE]
onEntities[wR][sst] += 1
if '_' in wR:
                onEntities[wR.split('_')[0]][sst] += 1 # for multiword NEs, also store the SST under the first word of the entity, in case that word occurs in other, similar names
for wR in onEntities:
if '_' in wR: nMultiwordONLexemes += 1
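        # Each OntoNotes-derived sense is recorded under a pseudo-synset ID of the form
        # '**ON**_n<rank>AR' (see below), where <rank> is the 1-based frequency rank of the
        # supersense for this word; because that ID matches the AWN synset-ID pattern,
        # rank() and voting() treat OntoNotes evidence just like AWN synsets.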
for r,(sst,c) in enumerate(onEntities[wR].most_common()):
arsenses[wR][sst].add('**ON**_n{}AR'.format(r+1))
print('Multiword lexemes from OntoNotes:',prop(nMultiwordONLexemes, len(onEntities)), file=sys.stderr)
suffixes = defaultdict(set) # word mapped to set of full lexemes starting with that word
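# e.g. (hypothetical romanized forms) if arsenses had entries 'wlAyp' and 'wlAyp_mtHdp',
# the loop below yields suffixes['wlAyp'] == {'wlAyp', 'wlAyp_mtHdp'}, so a one-word prefix
# can later be expanded to the longest matching multiword lexeme during lookup.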
for entry in arsenses:
words = entry.split('_')
suffixes[words[0]].add(entry)
def rank(synsetId):
return int(re.match(r'.+_[vnar](\d+)AR$', synsetId).group(1))
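# e.g. rank('kitAb_n2AR') == 2 for an AWN-style synset ID (hypothetical ID shown);
# for the OntoNotes pseudo-IDs added above ('**ON**_n1AR', ...), the rank reflects
# corpus frequency rather than AWN sense ordering.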
def voting(w):
# voting procedure for ambiguous entries:
# known supersenses are scored based on the number and rank of supporting synsets
# normalized by the total number of synsets corresponding to this word
result = Counter({sst: (len(syns)/min(rank(syn) for syn in syns)) for sst,syns in arsenses[w].items()})
return normalize(result)
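# Worked example (hypothetical word and synsets): if arsenses[w] were
#   {'noun.COMMUNICATION': {'ktAb_n1AR', 'ktAb_n3AR'}, 'noun.ARTIFACT': {'ktAb_n2AR'}},
# noun.COMMUNICATION scores 2/1 = 2.0 and noun.ARTIFACT scores 1/2 = 0.5,
# which normalize() rescales to 0.8 and 0.2 respectively.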
# load Wikipedia article tokens
ww = [] # vocalized, romanized stem from MADA (wa+ words discarded) or None if a Latin word, as well as newlines separating sentences
poses = []
tkns = []
nes = []
if len(sys.argv) > 1 and sys.argv[1].endswith('.stem_pos'): # METHOD 1
for ln in fileinput.input():
ln = ln.decode('utf-8')[:-1]
if not ln.strip():
ww.append('\n')
poses.append('\n')
tkns.append('\n')
nes.append('\n')
else:
stem, pos = ln.split('\t')
ww.append(None if stem.startswith('@@LAT@@') else stem)
poses.append(pos)
tkns.append(None)
nes.append(None)
else: # METHOD 2
skipping = False
for ln in fileinput.input(None if len(sys.argv)>1 else glob('4Articles-MADA/*.mada.tok')):
# 'ln' contains a romanized sentence, with three versions of each token (the third of which is the vocalized lemma/stem): e.g. Al<ydrwjyn·AlAydrwjyn·<iydruwjiyn
if fileinput.isfirstline():
skipping = False
postaggedFP = '../'+os.path.basename(fileinput.filename()).replace('Sent.','.').replace('.cleaned.txt.sent.bw.mada.tok','.wd_pos_ne.txt').replace('.txt.bw.mada.tok','.wd_pos_ne.txt')
if not os.path.exists(postaggedFP):
print('File not found: ', postaggedFP, file=sys.stderr)
skipping = True
continue
postaggedF = codecs.open(postaggedFP, 'r', 'utf-8')
elif skipping:
continue # corresponding file not found
if ln=='\n': continue
posLn = next(postaggedF)
# 'posLn' contains word, POS, and NE annotation for the sentence
posLn = posLn.replace(u'\u00a0','') # remove nonbreaking space sometimes inserted by POS tagger
posTkns = iter(posLn.strip().split())
ln = ln.decode('utf-8')
for tkn in ln.strip().split():
if tkn.endswith('+'): # e.g., conjunction (separated in MADA/TOKAN output only)
continue
tkns.append(tkn)
posTkn = next(posTkns)
w0, pos, ne = posTkn.split('___')
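            # Collapse the NE field to a single character: 'O' (outside any mention) becomes '';
            # otherwise the 'B-' prefix is stripped and the first letter of the class is kept,
            # with an initial 'O' (as in ORG) remapped to 'G' so it cannot be confused with the
            # outside tag, and 'I' (inside a mention) remapped to '<' as a continuation marker.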
ne = '' if ne=='O' else ne.replace('B-','')[0].replace('O','G').replace('I','<')
poses.append(pos)
nes.append(ne)
if tkn.startswith('@@LAT@@'):
ww.append(None)
continue # ignore Latin characters in input
assert tkn.count(u'·')==2,tkn.encode('utf-8')
x, y, w = tkn.split(u'·')
ww.append(w) # lemma/stem from MADA
ww.append('\n')
poses.append('\n')
tkns.append('\n')
nes.append('\n')
# process Wikipedia articles
nMultiwordInstances = 0
nAmbigTkns = 0
nOOV = 0
nCoveredNE = 0
nCovered = 0
n = 0
waysAmbiguous = Counter()
#awnSSTs = []
#sstScores = {} # index of first word matching a lexicon entry -> Counter with scores for each SST
#minRankSSTs = {}
LEGAL_POSES = set()
if 'noun' in MODE:
LEGAL_POSES.update({'noun', 'pron', 'abbrev'})
if 'verb' in MODE:
LEGAL_POSES.update({'verb'})
def sstLookup():
global n, nCovered, nCoveredNE, nMultiwordInstances, nAmbigTkns, nOOV, waysAmbiguous
i = 0
    while i<len(ww): # yield one supersense decision per token, preferring longer lexemes over shorter ones
#assert len(awnSSTs)==i,(i,len(awnSSTs))
w = ww[i]
#print(w, end=' ')
if w=='\n':
yield '\n',None,None
i += 1
continue
if w is None: # latin
yield '-',None,None
i += 1
continue
pos = poses[i]
ne = nes[i]
relevant = False
for relevantPOS in LEGAL_POSES:
if relevantPOS in pos:
relevant = True
break
if not relevant:
yield '-',None,None
i += 1
continue
n += 1
if w not in suffixes:
nOOV += 1
waysAmbiguous[0] += 1
nCovered += int(ne!='')
nCoveredNE += int(ne!='')
yield '_',None,None
i += 1
continue
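        # Longest-match-first lookup: candidate lexicon entries beginning with this word are
        # tried from most to fewest underscore-joined words, e.g. (hypothetical entries) with
        # both 'wlAyp' and 'wlAyp_mtHdp' in the lexicon and 'mtHdp' as the next token, the
        # two-word entry wins and the cursor advances past all k matched tokens at once.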
matched = False
for entry in sorted(suffixes[w], key=lambda ent: ent.count('_'), reverse=True):
entryWords = entry.split('_')
k = len(entryWords)
#print(entryWords,ww[i:i+len(entryWords)])
if entryWords[1:]==ww[i+1:i+k]: # match!
sstOpts = voting(entry) # score the possible tags
# nschneid 2012-04-30: the following line was looking up arsenses[w] and arsenses[w][sst], which I think was a bug
minRankSST = min(((sst,rank(syn)) for sst in arsenses[entry] for syn in arsenses[entry][sst]), key=lambda x:x[1])
sstChoice = '_'
waysAmbiguous[len(sstOpts)] += 1
if len(sstOpts)>1:
nAmbigTkns += 1
(a,ascore),(b,bscore) = sstOpts.most_common(2)
if ascore>bscore: # otherwise (a tie), punt
sstChoice = a
else:
sstChoice = sstOpts.most_common(1)[0][0]
if k>1: nMultiwordInstances += 1
for x in [sstChoice]+list('<'*(k-1)):
yield x,sstOpts,minRankSST
if sstChoice!='_':
nCovered += len(entryWords) # (though some words might not be nouns or verbs)
else:
nCovered += sum(1 for ne in nes[i:i+k] if ne!='')
nCoveredNE += sum(1 for ne in nes[i:i+k] if ne!='')
i += k
n += k-1
matched = True
break
        if not matched: # some multiword entries exist such that the first word is not a separate entry
yield '_',None,None
i += 1
iSent = 0
for i,(w,(sst,sstScores,minRankSST),ne) in enumerate(czip(ww,sstLookup(),nes)):
if w=='\n':
print()
if iSent%10000==0:
print('sentence',iSent, file=sys.stderr)
iSent += 1
else:
#print(sst+(ne and ','+ne), end=' ') # heuristically-selected SST and gold NE
mentionPosition = 'I' if sst=='<' else ('O' if sstScores is None else 'B')
print(json.dumps({k.lower():v for k,v in sstScores.items()})+'\t'+minRankSST[0].lower()+'\t'+mentionPosition if sstScores is not None else sst) # normalized SST scores for the first token of in-vocabulary items; -, _, or < for other tokens
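        # A token covered by a lexicon match thus prints something like (hypothetical values):
        #   {"noun.communication": 0.8, "noun.artifact": 0.2}<TAB>noun.communication<TAB>B
        # ('B' for the first token of the match, 'I' for its continuation tokens), while
        # uncovered tokens print just '-' or '_'.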
# Summary information
print('Multiword:', nMultiwordInstances, file=sys.stderr)
print('OOV:',prop(nOOV,n), file=sys.stderr)
print('Ways ambiguous (fraction of tokens):',normalize(waysAmbiguous,n), file=sys.stderr)
print('{}s covered by NE annotations:'.format('/'.join(sorted(MODE))),prop(nCoveredNE,n), file=sys.stderr)
print('{}s covered by NE annotations and/or AWN/OntoNotes NEs:'.format('/'.join(sorted(MODE))),prop(nCovered,n), file=sys.stderr)
'''Summary output:
(original)
10510
OOV: 3207/5368 = 59.74%
Ways ambiguous (fraction of tokens): Counter({0: 0.5974292101341282, 1: 0.19411326378539492, 2: 0.0868107302533532, 3: 0.05048435171385991, 4: 0.042846497764530554, 5: 0.010432190760059613, 6: 0.009314456035767511, 7: 0.005216095380029807, 9: 0.0027943368107302535, 8: 0.0005588673621460507})
nouns covered by NE annotations: 1178/5368 = 21.94%
nouns covered by NE annotations and/or AWN: 2307/5368 = 42.98%
3216
OOV: 1011/1106 = 91.41%
Ways ambiguous (fraction of tokens): Counter({0: 0.9141048824593129, 1: 0.045207956600361664, 2: 0.024412296564195298, 3: 0.007233273056057866, 4: 0.0054249547920434, 7: 0.003616636528028933})
verbs covered by NE annotations: 82/1106 = 7.41%
verbs covered by NE annotations and/or AWN: 144/1106 = 13.02%
(with MW entries)
Multiword: 16
OOV: 3062/5370 = 57.02%
Ways ambiguous (fraction of tokens): Counter({0: 0.5702048417132216, 1: 0.19646182495344505, 2: 0.08696461824953446, 3: 0.049720670391061456, 4: 0.03910614525139665, 5: 0.010428305400372439, 6: 0.00931098696461825, 7: 0.0052141527001862194, 9: 0.002793296089385475, 8: 0.00018621973929236498})
nouns covered by NE annotations: 1171/5370 = 21.81%
nouns covered by NE annotations and/or AWN: 2879/5370 = 53.61%
(with OntoNotes entities)
Multiword lexemes from AWN: 3681/10510 = 35.02%
Multiword lexemes from OntoNotes: 6890/12604 = 54.67%
OOV: 2610/5370 = 48.60%
Ways ambiguous (fraction of tokens): Counter({0: 0.4860335195530726, 1: 0.2595903165735568, 2: 0.09757914338919925, 3: 0.05176908752327747, 4: 0.03910614525139665, 5: 0.013221601489757914, 6: 0.00931098696461825, 7: 0.0052141527001862194, 9: 0.002793296089385475, 8: 0.00018621973929236498})
nouns covered by NE annotations: 1172/5370 = 21.82%
nouns covered by NE annotations and/or AWN/OntoNotes NEs: 2983/5370 = 55.55%
'''