#!/usr/bin/env python
'''Link abbreviations to their full names
Based on
A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text
A. Schwartz and M. Hearst
Biocomputing, 2003, pp 451-462.
# License: GNU General Public License, see http://www.clips.ua.ac.be/~vincent/scripts/LICENSE.txt
'''
__date__ = 'July 2012'
__author__ = 'Vincent Van Asch'
__version__ = '1.2.1'
# If you want to use the script that produces the results reported in __doc__
# you have to set the global __REPRODUCE__ to True. It is assumed that
# the extra code that is active when __REPRODUCE__ is set to False produces
# better results.
__REPRODUCE__=False
__doc__='''
This script takes tokenized sentences and prints the abbreviations and their definitions to STDOUT.

USAGE
    python abbreviations.py [-v] inputfile > abbreviations

FORMATS
    inputfile: a tokenized sentence on every new line (tokens separated by a single space)
    abbreviations:
        sentenceid1 characteroffset_begin_a1 characteroffset_end_a1 abbreviation1
        sentenceid1 characteroffset_begin_d1 characteroffset_end_d1 definition1
        sentenceid2 characteroffset_begin2 characteroffset_end2 abbreviation2
        sentenceid2 characteroffset_begin2 characteroffset_end2 definition2
        ...
    sentenceid: the zero-based line number of the sentence in the inputfile
                (i.e. its line number minus one)

OPTIONS
    -v: print more information to STDERR

REMARKS
    The algorithm only detects a link between an abbreviation and its definition
    when they appear in the format <definition> (<abbreviation>). The reverse,
    <abbreviation> (<definition>), is not detected.

    The algorithm can only find definitions that contain all characters of the
    abbreviation.

    It also makes errors in cases like "tyrosine transcription factor I (TTFI)":
    the extracted definition will be the (wrong) "transcription factor I", because
    that span already contains all characters of the abbreviation.

    On the labeled yeast corpus (http://biotext.berkeley.edu/data.html) version 1.0
    of this script reaches:
        TP: 673
        FP: 94
        FN: 280
        P : 87.74
        R : 70.62
        F1: 78.26
    (The corpus had to be corrected manually in order to be usable.)

ACKNOWLEDGEMENTS
    Based on:
    A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text
    A. Schwartz and M. Hearst
    Biocomputing, 2003, pp 451-462.

%s (version %s)''' %(__date__, __version__)
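# Illustrative example (not part of the original script): for an input file whose
# first line is the tokenized sentence
#     the tyrosine transcription factor I ( TTFI ) binds the promoter
# the script is expected to print one abbreviation/definition pair as two lines
# sharing the same sentenceid, followed by an empty line, roughly:
#     0 38 42 TTFI
#     0 13 35 transcription factor I
# As noted in the REMARKS above, the extracted definition in this particular case
# is the (wrong) shortest span that still contains all abbreviation characters.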
import os, sys, re, getopt

encoding='UTF8'


def fread(fname, encoding=encoding):
    f=open(os.path.abspath(os.path.expanduser(fname)))
    try:
        for l in f:
            line=l.strip()
            if line:
                yield line.decode(encoding)
    finally:
        f.close()


def fread2(fname, encoding=encoding):
    f=open(os.path.abspath(os.path.expanduser(fname)))
    try:
        for l in f:
            line=l.strip().decode(encoding)
            if line:
                yield line.split()
            else:
                yield None
    finally:
        f.close()


class Candidate(unicode):
    def __new__(cls, start, stop, str):
        return unicode.__new__(cls, str)

    def __init__(self, start, stop, str):
        self._start = start
        self._stop = stop

    def __getslice__(self, i, j):
        start = self.start+i
        stop = self.start+j
        str = unicode.__getslice__(self, i, j)
        return Candidate(start, stop, str)

    @property
    def start(self):
        '''The start index'''
        return self._start

    @property
    def stop(self):
        '''The stop index'''
        return self._stop
def getcandidates(sentence):
    '''Yields Candidates'''
    if '(' in sentence:
        # Check some things first
        if sentence.count('(') != sentence.count(')'):
            raise ValueError('Unbalanced parentheses: %s' %sentence)
        if sentence.find('(') > sentence.find(')'):
            raise ValueError('First parenthesis is a closing one: %s' %sentence)

        closeindex = -1
        while 1:
            # Look for an opening parenthesis
            openindex = sentence.find('(', closeindex+1)
            if openindex == -1: break

            # Look for the matching closing parenthesis
            closeindex = openindex+1
            open=1
            skip=False
            while open:
                try:
                    char = sentence[closeindex]
                except IndexError:
                    # We found an opening bracket but no associated closing bracket
                    # Skip the opening bracket
                    skip=True
                    break
                if char == '(':
                    open +=1
                elif char == ')':
                    open -=1
                closeindex+=1

            if skip:
                closeindex = openindex+1
                continue

            # Output if conditions are met
            start = openindex+1
            stop = closeindex-1
            str = sentence[start:stop]

            # Take into account whitespace that should be removed
            start = start + len(str) - len(str.lstrip())
            stop = stop - len(str) + len(str.rstrip())
            str = sentence[start:stop]

            if conditions(str):
                yield Candidate(start, stop, str)
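# Illustrative sketch (added, untested): getcandidates() scans the sentence for
# balanced parentheses and yields the stripped content of each pair that passes
# conditions(). For the sentence used in the example near the top of this file,
#     list(getcandidates(u'the tyrosine transcription factor I ( TTFI ) binds the promoter'))
# should yield a single Candidate u'TTFI' with start 38 and stop 42
# (character offsets of the abbreviation inside the sentence).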
def conditions(str):
    '''Based on Schwartz&Hearst

    2 <= len(str) <= 10
    len(tokens) <= 2
    re.search('[A-Za-z]', str)
    str[0].isalnum()

    and extra:
        if it matches ([A-Za-z]\. ?){2,}
        it is a good candidate.
    '''
    if not __REPRODUCE__ and re.match('([A-Za-z]\. ?){2,}', str.lstrip()):
        return True
    if len(str) < 2 or len(str) > 10:
        return False
    if len(str.split()) > 2:
        return False
    if not re.search('[A-Za-z]', str):
        return False
    if not str[0].isalnum():
        return False

    return True
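# Examples (added for illustration): conditions(u'TTFI') returns True (short,
# one token, alphabetic first character); conditions(u'123') returns False (no
# letter); conditions(u'a very long parenthetical') returns False (more than
# 10 characters and more than 2 tokens).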
def getdefinition(candidate, sentence):
    '''Takes a candidate and a sentence and returns the definition candidate.

    The definition candidate is the set of tokens (in front of the candidate)
    that starts with a token starting with the first character of the candidate'''
    # Take the tokens in front of the candidate
    tokens = sentence[:candidate.start-2].lower().split()

    # the char that we are looking for
    key = candidate[0].lower()

    # Count the number of tokens that start with the same character as the candidate
    firstchars = [t[0] for t in tokens]

    definitionfreq = firstchars.count(key)
    candidatefreq = candidate.lower().count(key)

    # Look for the list of tokens in front of candidate that
    # have a sufficient number of tokens starting with key
    if candidatefreq <= definitionfreq:
        # we should at least have a good number of starts
        count=0
        start=0
        startindex=len(firstchars)-1
        while count < candidatefreq:
            if abs(start) > len(firstchars):
                raise ValueError('not found')

            start-=1
            # Look up key in the definition
            try:
                startindex = firstchars.index(key, len(firstchars)+start)
            except ValueError:
                pass

            # Count the number of keys in definition
            count = firstchars[startindex:].count(key)

        # We found enough keys in the definition so return the definition as a
        # definition candidate
        start = len(' '.join(tokens[:startindex]))
        stop = candidate.start-2
        str = sentence[start:stop]

        # Remove whitespace
        start = start + len(str) - len(str.lstrip())
        stop = stop - len(str) + len(str.rstrip())
        str = sentence[start:stop]

        return Candidate(start, stop, str)
    else:
        #print 'S', sentence
        #print >>sys.stderr, 'KEY', key
        #print >>sys.stderr, 'TOKENS', tokens
        #print >>sys.stderr, 'ABBREV', candidate
        raise ValueError('There are fewer keys in the tokens in front of the candidate than in the candidate itself')
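# Illustrative sketch (added, untested): for the example sentence above,
# getdefinition(Candidate(38, 42, u'TTFI'), sentence) walks backwards through the
# tokens before the '(' until it has seen as many tokens starting with 't' as
# there are t's in the abbreviation, and returns the Candidate
# u'tyrosine transcription factor I' (start 4, stop 35).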
def definitionselection(definition, abbrev):
    '''Takes a definition candidate and an abbreviation candidate and returns
    the (possibly shortened) definition if the characters of the abbreviation
    occur in it; raises a ValueError otherwise.

    Based on
    A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text, Schwartz & Hearst'''
    if len(definition) < len(abbrev):
        raise ValueError('Abbreviation is longer than definition')

    if abbrev in definition.split():
        raise ValueError('Abbreviation is full word of definition')

    sindex = -1
    lindex = -1

    while 1:
        try:
            longchar = definition[lindex].lower()
        except IndexError:
            #print definition, '||',abbrev
            raise

        shortchar = abbrev[sindex].lower()

        if not shortchar.isalnum():
            sindex-=1

        if sindex == -1*len(abbrev):
            if shortchar == longchar:
                if lindex == -1*len(definition) or not definition[lindex-1].isalnum():
                    break
                else:
                    lindex-=1
            else:
                lindex-=1
                if lindex == -1*(len(definition)+1):
                    raise ValueError('definition of "%s" not found in "%s"' %(abbrev, definition))
        else:
            if shortchar == longchar:
                sindex -=1
                lindex -=1
            else:
                lindex -= 1

    definition = definition[lindex:len(definition)]

    tokens = len(definition.split())
    length = len(abbrev)

    if tokens > min([length+5, length*2]):
        raise ValueError('did not meet min(|A|+5, |A|*2) constraint')

    # Do not return definitions that contain unbalanced parentheses
    if not __REPRODUCE__:
        #print 'ccc', abbrev, definition, definition.count('('), definition.count(')')
        if definition.count('(') != definition.count(')'):
            raise ValueError('Unbalanced parentheses not allowed in a definition')

    return definition
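# Illustrative sketch (added, untested): definitionselection() then shrinks the
# definition candidate by matching abbreviation characters right-to-left, e.g.
#     definitionselection(Candidate(4, 35, u'tyrosine transcription factor I'), u'TTFI')
# returns u'transcription factor I' (start 13, stop 35), the wrong but
# character-consistent definition discussed in the REMARKS of __doc__.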
def main(fname, verbose=False, encoding=encoding):
    '''Prints all abbreviations and their definitions to STDOUT in the format
        sentenceindex startindex stopindex abbrev
        sentenceindex startindex stopindex definition
        sentenceindex startindex stopindex abbrev
        sentenceindex startindex stopindex definition
        ...

    fname: single-whitespaced tokenized sentences; every sentence on a new line

    Evaluation
        abbreviations.score('abbrev.txt.gold', 'abbrev.txt.raw.abb')
        TP: 673
        FP: 94
        FN: 280
        P : 87.74
        R : 70.62
        F1: 78.26

        abbrev.txt.gold: all gold links in yeast_abbrev_labeled.txt (but as corrected in abbrev.txt)
        abbrev.txt.raw.abb: all links based on abbrev.txt.raw and extracted with main()
        abbrev.txt: the sentences from yeast_abbrev_labeled.txt with at least one (; tokenized with tokenizer.py and corrected.
    '''
    #name = fname+'.abb'
    #o = open(name, 'w')

    omit=0
    written=0

    try:
        for i, sentence in enumerate(fread(fname)):
            try:
                for candidate in getcandidates(sentence):
                    try:
                        definition = getdefinition(candidate, sentence)
                    except ValueError, e:
                        #print >>sys.stderr, i, sentence
                        if verbose:
                            print >>sys.stderr, i, 'Omitting candidate', candidate.encode(encoding)
                            print >>sys.stderr, 'Reason:', e.args[0].encode(encoding), '\n'
                        omit+=1
                    else:
                        try:
                            definition = definitionselection(definition, candidate)
                        except IndexError:
                            if verbose:
                                #print >>sys.stderr, i, sentence
                                print >>sys.stderr, i, 'Omitting candidate', definition.encode(encoding), '||', candidate.encode(encoding), '\n'
                            omit+=1
                        except ValueError, e:
                            if verbose:
                                #print >>sys.stderr, i, sentence
                                print >>sys.stderr, i, 'Omitting candidate', definition.encode(encoding), '||', candidate.encode(encoding)
                                print >>sys.stderr, 'Reason:', e.args[0].encode(encoding), '\n'
                            omit+=1
                        else:
                            #o.write('%d %d %d %s\n' %(i, candidate.start, candidate.stop, candidate))
                            #o.write('%d %d %d %s\n' %(i, definition.start, definition.stop, definition))
                            #o.write('\n')
                            cline = '%d %d %d %s' %(i, candidate.start, candidate.stop, candidate)
                            dline = '%d %d %d %s' %(i, definition.start, definition.stop, definition)
                            print cline.encode(encoding)
                            print dline.encode(encoding)
                            print
                            written+=1
            except ValueError, e:
                if verbose:
                    #print >>sys.stderr, 'SENTENCE %d:' %i, sentence.encode(encoding)
                    #print >>sys.stderr, 'ERROR:', e.args
                    print >>sys.stderr, 'Reason:', e.args[0].encode(encoding), '\n'
    finally:
        pass
        #o.close()

    print >>sys.stderr, 'INFO: %d abbreviations detected and kept (%d omitted)' %(written, omit)
    #print >>sys.stderr, 'Written', name
def score(gold, pred):
    '''
    gold: a file as created with extract.write()
    pred: a file as created with main()

    Prints the results'''
    golds={}
    s=[]
    for line in fread2(gold):
        if line:
            s.append(' '.join(line[1:]))
            id = line[0]
        elif s:
            golds[id] = golds.get(id, []) + [tuple(s)]
            s=[]

    preds={}
    s=[]
    for line in fread2(pred):
        if line:
            s.append(' '.join(line[3:]))
            id = line[0]
        elif s:
            preds[id] = preds.get(id, []) + [tuple(s)]
            s=[]

    tp=0; fp=0; fn=0
    for id, gvalues in golds.items():
        for pair in gvalues:
            try:
                pvalues = preds[id]
            except KeyError:
                fn+=1
            else:
                if pair in pvalues:
                    tp+=1
                    preds[id].remove(pair)
                else:
                    print 'false negative', id, pair
                    fn+=1

    for id, pvalues in preds.items():
        fp += len(pvalues)

    # The number of gold abbreviations
    total=0
    for v in golds.values():
        total += len(v)

    print '''
TP: %d
FP: %d
FN: %d
P : %5.2f
R : %5.2f
F1: %5.2f

In total there were %d gold definition/abbreviation pairs.
''' %(tp, fp, fn, precision(tp, fp), recall(tp, fn), fscore(tp, fp, fn), total)


def recall(TP, FN):
    if TP == 0: return 0
    return 100*float(TP)/(TP+FN)


def precision(TP, FP):
    if TP == 0: return 0
    return 100*float(TP)/(TP+FP)


def fscore(TP, FP, FN):
    R = recall(TP, FN)
    P = precision(TP, FP)
    if R==0 or P==0:
        return 0
    return 2*R*P/(R+P)
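# Sanity check (added): plugging in the counts reported in __doc__ reproduces the
# reported scores:
#     precision(673, 94)   -> 87.74 (rounded)
#     recall(673, 280)     -> 70.62 (rounded)
#     fscore(673, 94, 280) -> 78.26 (rounded)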
def _usage():
    print >>sys.stderr, __doc__


if __name__ == '__main__':
    try:
        opts,args=getopt.getopt(sys.argv[1:],'hv', ['help'])
    except getopt.GetoptError:
        # print help information and exit:
        _usage()
        sys.exit(2)

    verbose=False

    for o, a in opts:
        if o in ('-h', '--help'):
            _usage()
            sys.exit()
        if o == '-v':
            verbose=True

    if len(args) != 1:
        _usage()
        sys.exit(1)

    main(args[0], verbose=verbose)