#!/usr/bin/env python
'''Link abbreviations to their full names
Based on
A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text
A. Schwartz and M. Hearst
Biocomputing, 2003, pp 451-462.
# License: GNU General Public License, see http://www.clips.ua.ac.be/~vincent/scripts/LICENSE.txt
'''
__date__ = 'July 2012'
__author__ = 'Vincent Van Asch'
__version__ = '1.2.1'
# If you want to use the script that produces the results reported in __doc__
# you have to set the global __REPRODUCE__ to True. It is assumed that
# the extra code that is active when __REPRODUCE__ is set to False produces
# better results.
__REPRODUCE__=False
__doc__='''
This script takes tokenized sentences and prints the abbreviations and their definitions to STDOUT.

USAGE
    python abbreviations.py [-v] inputfile > abbreviations

FORMATS
    inputfile: a tokenized sentence on every new line (tokens separated by a single space)
    abbreviations:
        sentenceid1 characteroffset_begin_a1 characteroffset_end_a1 abbreviation1
        sentenceid1 characteroffset_begin_d1 characteroffset_end_d1 definition1
        sentenceid2 characteroffset_begin2 characteroffset_end2 abbreviation2
        sentenceid2 characteroffset_begin2 characteroffset_end2 definition2
        ...
    sentenceid: the zero-based line number of the sentence in the inputfile
                (i.e. its line number minus one)

OPTIONS
    -v: print more information to STDERR

REMARKS
    The algorithm only detects a link between an abbreviation and its definition
    when they appear in the format <definition> (<abbreviation>). The reverse,
    <abbreviation> (<definition>), is not detected.

    The algorithm can only find definitions that contain all characters of the
    abbreviation.

    It also makes errors in cases like "tyrosine transcription factor I (TTFI)":
    the extracted definition will be the (wrong) "transcription factor I", because
    that span already contains all characters of the abbreviation.

    On the labeled yeast corpus (http://biotext.berkeley.edu/data.html) version 1.0
    of this script reaches:
        TP: 673
        FP: 94
        FN: 280
        P : 87.74
        R : 70.62
        F1: 78.26
    (The corpus had to be corrected manually in order to be usable.)

ACKNOWLEDGEMENTS
    Based on:
    A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text
    A. Schwartz and M. Hearst
    Biocomputing, 2003, pp 451-462.

%s (version %s)''' %(__date__, __version__)
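# Illustrative example (not part of the original script): for an input file whose
# first line is the tokenized sentence
#     the tyrosine transcription factor I ( TTFI ) binds the promoter
# the script is expected to print one abbreviation/definition pair as two lines
# sharing the same sentenceid, followed by an empty line, roughly:
#     0 38 42 TTFI
#     0 13 35 transcription factor I
# As noted in the REMARKS above, the extracted definition in this particular case
# is the (wrong) shortest span that still contains all abbreviation characters.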
import os, sys, re, getopt

encoding='UTF8'


def fread(fname, encoding=encoding):
    f=open(os.path.abspath(os.path.expanduser(fname)))
    try:
        for l in f:
            line=l.strip()
            if line:
                yield line.decode(encoding)
    finally:
        f.close()


def fread2(fname, encoding=encoding):
    f=open(os.path.abspath(os.path.expanduser(fname)))
    try:
        for l in f:
            line=l.strip().decode(encoding)
            if line:
                yield line.split()
            else:
                yield None
    finally:
        f.close()


class Candidate(unicode):
    def __new__(cls, start, stop, str):
        return unicode.__new__(cls, str)

    def __init__(self, start, stop, str):
        self._start = start
        self._stop = stop

    def __getslice__(self, i, j):
        start = self.start+i
        stop = self.start+j
        str = unicode.__getslice__(self, i, j)
        return Candidate(start, stop, str)

    @property
    def start(self):
        '''The start index'''
        return self._start

    @property
    def stop(self):
        '''The stop index'''
        return self._stop
def getcandidates(sentence):
    '''Yields Candidates'''
    if '(' in sentence:
        # Check some things first
        if sentence.count('(') != sentence.count(')'):
            raise ValueError('Unbalanced parentheses: %s' %sentence)
        if sentence.find('(') > sentence.find(')'):
            raise ValueError('First parenthesis is a closing one: %s' %sentence)

        closeindex = -1
        while 1:
            # Look for an opening parenthesis
            openindex = sentence.find('(', closeindex+1)
            if openindex == -1: break

            # Look for the matching closing parenthesis
            closeindex = openindex+1
            open=1
            skip=False
            while open:
                try:
                    char = sentence[closeindex]
                except IndexError:
                    # We found an opening bracket but no associated closing bracket
                    # Skip the opening bracket
                    skip=True
                    break
                if char == '(':
                    open +=1
                elif char == ')':
                    open -=1
                closeindex+=1

            if skip:
                closeindex = openindex+1
                continue

            # Output if conditions are met
            start = openindex+1
            stop = closeindex-1
            str = sentence[start:stop]

            # Take into account whitespace that should be removed
            start = start + len(str) - len(str.lstrip())
            stop = stop - len(str) + len(str.rstrip())
            str = sentence[start:stop]

            if conditions(str):
                yield Candidate(start, stop, str)
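# Illustrative sketch (added, untested): getcandidates() scans the sentence for
# balanced parentheses and yields the stripped content of each pair that passes
# conditions(). For the sentence used in the example near the top of this file,
#     list(getcandidates(u'the tyrosine transcription factor I ( TTFI ) binds the promoter'))
# should yield a single Candidate u'TTFI' with start 38 and stop 42
# (character offsets of the abbreviation inside the sentence).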
def conditions(str):
    '''Based on Schwartz&Hearst

    2 <= len(str) <= 10
    len(tokens) <= 2
    re.search('[A-Za-z]', str)
    str[0].isalnum()

    and extra:
        if it matches ([A-Za-z]\. ?){2,}
        it is a good candidate.
    '''
    if not __REPRODUCE__ and re.match('([A-Za-z]\. ?){2,}', str.lstrip()):
        return True
    if len(str) < 2 or len(str) > 10:
        return False
    if len(str.split()) > 2:
        return False
    if not re.search('[A-Za-z]', str):
        return False
    if not str[0].isalnum():
        return False

    return True
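# Examples (added for illustration): conditions(u'TTFI') returns True (short,
# one token, alphabetic first character); conditions(u'123') returns False (no
# letter); conditions(u'a very long parenthetical') returns False (more than
# 10 characters and more than 2 tokens).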
def getdefinition(candidate, sentence):
    '''Takes a candidate and a sentence and returns the definition candidate.

    The definition candidate is the set of tokens (in front of the candidate)
    that starts with a token starting with the first character of the candidate'''
    # Take the tokens in front of the candidate
    tokens = sentence[:candidate.start-2].lower().split()

    # the char that we are looking for
    key = candidate[0].lower()

    # Count the number of tokens that start with the same character as the candidate
    firstchars = [t[0] for t in tokens]

    definitionfreq = firstchars.count(key)
    candidatefreq = candidate.lower().count(key)

    # Look for the list of tokens in front of candidate that
    # have a sufficient number of tokens starting with key
    if candidatefreq <= definitionfreq:
        # we should at least have a good number of starts
        count=0
        start=0
        startindex=len(firstchars)-1
        while count < candidatefreq:
            if abs(start) > len(firstchars):
                raise ValueError('not found')

            start-=1
            # Look up key in the definition
            try:
                startindex = firstchars.index(key, len(firstchars)+start)
            except ValueError:
                pass

            # Count the number of keys in definition
            count = firstchars[startindex:].count(key)

        # We found enough keys in the definition so return the definition as a
        # definition candidate
        start = len(' '.join(tokens[:startindex]))
        stop = candidate.start-2
        str = sentence[start:stop]

        # Remove whitespace
        start = start + len(str) - len(str.lstrip())
        stop = stop - len(str) + len(str.rstrip())
        str = sentence[start:stop]

        return Candidate(start, stop, str)
    else:
        #print 'S', sentence
        #print >>sys.stderr, 'KEY', key
        #print >>sys.stderr, 'TOKENS', tokens
        #print >>sys.stderr, 'ABBREV', candidate
        raise ValueError('There are fewer keys in the tokens in front of the candidate than in the candidate itself')
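# Illustrative sketch (added, untested): for the example sentence above,
# getdefinition(Candidate(38, 42, u'TTFI'), sentence) walks backwards through the
# tokens before the '(' until it has seen as many tokens starting with 't' as
# there are t's in the abbreviation, and returns the Candidate
# u'tyrosine transcription factor I' (start 4, stop 35).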
def definitionselection(definition, abbrev):
    '''Takes a definition candidate and an abbreviation candidate and returns
    the (possibly shortened) definition if the characters of the abbreviation
    occur in it; raises a ValueError otherwise.

    Based on
    A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text, Schwartz & Hearst'''
    if len(definition) < len(abbrev):
        raise ValueError('Abbreviation is longer than definition')

    if abbrev in definition.split():
        raise ValueError('Abbreviation is full word of definition')

    sindex = -1
    lindex = -1

    while 1:
        try:
            longchar = definition[lindex].lower()
        except IndexError:
            #print definition, '||',abbrev
            raise

        shortchar = abbrev[sindex].lower()

        if not shortchar.isalnum():
            sindex-=1

        if sindex == -1*len(abbrev):
            if shortchar == longchar:
                if lindex == -1*len(definition) or not definition[lindex-1].isalnum():
                    break
                else:
                    lindex-=1
            else:
                lindex-=1
                if lindex == -1*(len(definition)+1):
                    raise ValueError('definition of "%s" not found in "%s"' %(abbrev, definition))
        else:
            if shortchar == longchar:
                sindex -=1
                lindex -=1
            else:
                lindex -= 1

    definition = definition[lindex:len(definition)]

    tokens = len(definition.split())
    length = len(abbrev)

    if tokens > min([length+5, length*2]):
        raise ValueError('did not meet min(|A|+5, |A|*2) constraint')

    # Do not return definitions that contain unbalanced parentheses
    if not __REPRODUCE__:
        #print 'ccc', abbrev, definition, definition.count('('), definition.count(')')
        if definition.count('(') != definition.count(')'):
            raise ValueError('Unbalanced parentheses not allowed in a definition')

    return definition
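# Illustrative sketch (added, untested): definitionselection() then shrinks the
# definition candidate by matching abbreviation characters right-to-left, e.g.
#     definitionselection(Candidate(4, 35, u'tyrosine transcription factor I'), u'TTFI')
# returns u'transcription factor I' (start 13, stop 35), the wrong but
# character-consistent definition discussed in the REMARKS of __doc__.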
def main(fname, verbose=False, encoding=encoding):
    '''Prints all abbreviations and their definitions to STDOUT in the format
        sentenceindex startindex stopindex abbrev
        sentenceindex startindex stopindex definition
        sentenceindex startindex stopindex abbrev
        sentenceindex startindex stopindex definition
        ...

    fname: single-whitespaced tokenized sentences; every sentence on a new line

    Evaluation
        abbreviations.score('abbrev.txt.gold', 'abbrev.txt.raw.abb')
        TP: 673
        FP: 94
        FN: 280
        P : 87.74
        R : 70.62
        F1: 78.26

        abbrev.txt.gold: all gold links in yeast_abbrev_labeled.txt (but as corrected in abbrev.txt)
        abbrev.txt.raw.abb: all links based on abbrev.txt.raw and extracted with main()
        abbrev.txt: the sentences from yeast_abbrev_labeled.txt with at least one (; tokenized with tokenizer.py and corrected.
    '''
    #name = fname+'.abb'
    #o = open(name, 'w')

    omit=0
    written=0

    try:
        for i, sentence in enumerate(fread(fname)):
            try:
                for candidate in getcandidates(sentence):
                    try:
                        definition = getdefinition(candidate, sentence)
                    except ValueError, e:
                        #print >>sys.stderr, i, sentence
                        if verbose:
                            print >>sys.stderr, i, 'Omitting candidate', candidate.encode(encoding)
                            print >>sys.stderr, 'Reason:', e.args[0].encode(encoding), '\n'
                        omit+=1
                    else:
                        try:
                            definition = definitionselection(definition, candidate)
                        except IndexError:
                            if verbose:
                                #print >>sys.stderr, i, sentence
                                print >>sys.stderr, i, 'Omitting candidate', definition.encode(encoding), '||', candidate.encode(encoding), '\n'
                            omit+=1
                        except ValueError, e:
                            if verbose:
                                #print >>sys.stderr, i, sentence
                                print >>sys.stderr, i, 'Omitting candidate', definition.encode(encoding), '||', candidate.encode(encoding)
                                print >>sys.stderr, 'Reason:', e.args[0].encode(encoding), '\n'
                            omit+=1
                        else:
                            #o.write('%d %d %d %s\n' %(i, candidate.start, candidate.stop, candidate))
                            #o.write('%d %d %d %s\n' %(i, definition.start, definition.stop, definition))
                            #o.write('\n')
                            cline = '%d %d %d %s' %(i, candidate.start, candidate.stop, candidate)
                            dline = '%d %d %d %s' %(i, definition.start, definition.stop, definition)
                            print cline.encode(encoding)
                            print dline.encode(encoding)
                            print
                            written+=1
            except ValueError, e:
                if verbose:
                    #print >>sys.stderr, 'SENTENCE %d:' %i, sentence.encode(encoding)
                    #print >>sys.stderr, 'ERROR:', e.args
                    print >>sys.stderr, 'Reason:', e.args[0].encode(encoding), '\n'
    finally:
        pass
        #o.close()

    print >>sys.stderr, 'INFO: %d abbreviations detected and kept (%d omitted)' %(written, omit)
    #print >>sys.stderr, 'Written', name
def score(gold, pred):
    '''
    gold: a file as created with extract.write()
    pred: a file as created with main()

    Prints the results'''
    golds={}
    s=[]
    for line in fread2(gold):
        if line:
            s.append(' '.join(line[1:]))
            id = line[0]
        elif s:
            golds[id] = golds.get(id, []) + [tuple(s)]
            s=[]

    preds={}
    s=[]
    for line in fread2(pred):
        if line:
            s.append(' '.join(line[3:]))
            id = line[0]
        elif s:
            preds[id] = preds.get(id, []) + [tuple(s)]
            s=[]

    tp=0; fp=0; fn=0
    for id, gvalues in golds.items():
        for pair in gvalues:
            try:
                pvalues = preds[id]
            except KeyError:
                fn+=1
            else:
                if pair in pvalues:
                    tp+=1
                    preds[id].remove(pair)
                else:
                    print 'false negative', id, pair
                    fn+=1

    for id, pvalues in preds.items():
        fp += len(pvalues)

    # The number of gold abbreviations
    total=0
    for v in golds.values():
        total += len(v)

    print '''
TP: %d
FP: %d
FN: %d
P : %5.2f
R : %5.2f
F1: %5.2f

In total there were %d gold definition/abbreviation pairs.
''' %(tp, fp, fn, precision(tp, fp), recall(tp, fn), fscore(tp, fp, fn), total)


def recall(TP, FN):
    if TP == 0: return 0
    return 100*float(TP)/(TP+FN)


def precision(TP, FP):
    if TP == 0: return 0
    return 100*float(TP)/(TP+FP)


def fscore(TP, FP, FN):
    R = recall(TP, FN)
    P = precision(TP, FP)
    if R==0 or P==0:
        return 0
    return 2*R*P/(R+P)
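# Sanity check (added): plugging in the counts reported in __doc__ reproduces the
# reported scores:
#     precision(673, 94)   -> 87.74 (rounded)
#     recall(673, 280)     -> 70.62 (rounded)
#     fscore(673, 94, 280) -> 78.26 (rounded)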
def _usage():
    print >>sys.stderr, __doc__


if __name__ == '__main__':
    try:
        opts,args=getopt.getopt(sys.argv[1:],'hv', ['help'])
    except getopt.GetoptError:
        # print help information and exit:
        _usage()
        sys.exit(2)

    verbose=False

    for o, a in opts:
        if o in ('-h', '--help'):
            _usage()
            sys.exit()
        if o == '-v':
            verbose=True

    if len(args) != 1:
        _usage()
        sys.exit(1)

    main(args[0], verbose=verbose)