kylebgorman/A.TextGrid

## A.TextGrid
File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0
xmax = 3
tiers? <exists>
size = 1
item []:
    item [1]:
        class = "IntervalTier"
        name = "word"
        xmin = 0
        xmax = 3
        intervals: size = 5
        intervals [1]:
            xmin = 0
            xmax = 0.46004543779933693
            text = ""
        intervals [2]:
            xmin = 0.46004543779933693
            xmax = 1.1252957550452332
            text = "THAN"
        intervals [3]:
            xmin = 1.1252957550452332
            xmax = 1.966877481681608
            text = "I"
        intervals [4]:
            xmin = 1.966877481681608
            xmax = 2.5920524786114862
            text = "DID"
        intervals [5]:
            xmin = 2.5920524786114862
            xmax = 3
            text = ""

## alignment.py
#!/usr/bin/env python
#
# Copyright (c) 2011 Kyle Gorman
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# alignment.py: Code for scoring hypothesized alignments against a reference
# alignment, using techniques from text segmentation research
#
# Kyle Gorman <kgorman@ling.upenn.edu>
#
# This code was developed to evaluate Prosodylab-Aligner, available at:
#
# http://prosodylab.org/tools/aligner/

from textgrid import TextGridFromFile # https://github.com/kylebgorman/textgrid.py


## SCORING METHODS

def Pmu(ref, hyp, dt=0.001):
    """
    P_mu segmentation score, based loosely on:

    D. Beeferman, A. Berger, J. Lafferty. 1997. Text segmentation using
    exponential models. Proceedings of EMNLP.

    The tiers are sampled every `dt` time units, starting at the start of the
    first reference interval and stopping `dt` before the end of the last one.
    The score is the fraction of sampled points at which the reference
    interval and the hypothesized interval containing that point carry the
    same mark, so 1.0 means the two tiers agree at every sampled point.

    A sampled point counts as a disagreement when the reference lookup gives
    a falsy result, or when the hypothesized tier has no interval there.

    ref -- reference tier (indexable; provides intervalContaining())
    hyp -- hypothesized tier (provides intervalContaining())
    dt  -- sampling step; must be positive

    Raises ValueError if `dt` is not positive, and ZeroDivisionError if the
    reference tier is shorter than `dt` (no point is sampled at all).

    >>> ref = TextGridFromFile('A.TextGrid')[0]
    >>> print(round(Pmu(ref, TextGridFromFile('B.TextGrid')[0]), 3))
    0.713
    >>> print(round(Pmu(ref, TextGridFromFile('C.TextGrid')[0]), 3))
    0.513
    """
    if not dt > 0:
        # a zero or negative step would never reach `stop`
        raise ValueError('dt must be positive, got %r' % (dt,))
    c = 0  # sampled points where the marks agree
    d = 0  # sampled points where they do not
    t = ref[0].minTime
    stop = ref[-1].maxTime - dt
    while t <= stop:
        ri = ref.intervalContaining(t)
        # only consult the hypothesis when the reference has something here
        hi = hyp.intervalContaining(t) if ri else None
        # a hypothesis tier that does not cover `t` is a miss, not a crash
        if hi is not None and ri.mark == hi.mark:
            c += 1
        else:
            d += 1
        t += dt
    return c / float(c + d)


def Pk(ref, hyp, dt=0.001):
    """
    P_k segmentation score, based on:

    D. Beeferman, A. Berger, J. Lafferty. 1999. Statistical models of text
    segmentation. Machine Learning 34(1-3): 177-210.

    It is, to a first approximation, the probability that some span in the
    hypothesized segmentation crosses a segment boundary iff it crosses a
    segment boundary in the reference segmentation.

    A window of width k (half the mean duration of the reference intervals)
    is slid over the reference time range in steps of `dt`. For each window
    position the two ends are looked up in each tier; the position counts as
    an agreement when "both ends are in the same interval" has the same truth
    value for `ref` and for `hyp`. The score is the fraction of agreeing
    window positions.

    ref -- reference tier (indexable, iterable, sized; provides
           intervalContaining())
    hyp -- hypothesized tier (provides intervalContaining())
    dt  -- step by which the window is advanced; must be positive

    Raises ValueError if `dt` is not positive.

    >>> ref = TextGridFromFile('A.TextGrid')[0]
    >>> print(round(Pk(ref, TextGridFromFile('B.TextGrid')[0]), 3))
    0.63
    >>> print(round(Pk(ref, TextGridFromFile('C.TextGrid')[0]), 3))
    0.404
    """
    if not dt > 0:
        # a zero or negative step would never reach `stop`
        raise ValueError('dt must be positive, got %r' % (dt,))
    c = 0  # window positions where ref and hyp agree
    d = 0  # window positions where they disagree
    # window width: half the mean reference interval duration; float() keeps
    # this a true division even for integer times under Python 2
    k = sum(x.maxTime - x.minTime for x in ref) / float(len(ref)) / 2.
    t = ref[0].minTime
    stop = ref[-1].maxTime - k
    while t <= stop:
        # True when both ends of the window fall in the same interval
        rs = ref.intervalContaining(t) == ref.intervalContaining(t + k)
        hs = hyp.intervalContaining(t) == hyp.intervalContaining(t + k)
        if rs == hs:
            c += 1
        else:
            d += 1
        t += dt # increment
    return c / float(c + d)


def Ddiff(ref, hyp, dt=0.01):
    """
    1 - WindowDiff segmentation score, based on:

    L. Pevzner, M. A. Hearst. 2002. A critique and improvement of an evaluation
    metric for text segmentation. Computational Linguistics 28(1): 19-36.

    A window of width k (half the mean duration of the reference intervals)
    is slid over the reference time range in steps of `dt`. For each window
    position, the difference between the interval indices at the two ends of
    the window is computed for each tier; the position is penalized when that
    difference is not the same for `ref` and `hyp`. The score is the fraction
    of window positions that are not penalized.

    ref -- reference tier (indexable, iterable; provides indexContaining())
    hyp -- hypothesized tier (provides indexContaining())
    dt  -- step by which the window is advanced; must be positive

    Raises ValueError if `dt` is not positive.

    >>> ref = TextGridFromFile('A.TextGrid')[0]
    >>> print(round(Ddiff(ref, TextGridFromFile('B.TextGrid')[0]), 3))
    0.631
    >>> print(round(Ddiff(ref, TextGridFromFile('C.TextGrid')[0]), 3))
    0.402
    """
    if not dt > 0:
        # a zero or negative step would never reach `stop`
        raise ValueError('dt must be positive, got %r' % (dt,))
    c = 0  # window positions examined
    d = 0  # window positions where the index differences disagree
    # window width: half the mean reference interval duration, computed
    # inline because no mean() helper is in scope in this module
    durations = [x.maxTime - x.minTime for x in ref]
    k = sum(durations) / float(len(durations)) / 2.
    t = ref[0].minTime
    stop = ref[-1].maxTime - k
    while t <= stop:
        # difference of interval indices between the two ends of the window
        rb = ref.indexContaining(t + k) - ref.indexContaining(t)
        hb = hyp.indexContaining(t + k) - hyp.indexContaining(t)
        d += abs(rb - hb) > 0  # bool adds as 0 or 1
        c += 1
        t += dt
    return float(c - d) / c


if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod
    testmod()

## B.TextGrid
File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0
xmax = 3
tiers? <exists>
size = 1
item []:
    item [1]:
        class = "IntervalTier"
        name = "word"
        xmin = 0
        xmax = 3
        intervals: size = 5
        intervals [1]:
            xmin = 0
            xmax = 0.36004543779933693
            text = ""
        intervals [2]:
            xmin = 0.36004543779933693
            xmax = 1.3252957550452332
            text = "THAN"
        intervals [3]:
            xmin = 1.3252957550452332
            xmax = 1.866877481681608
            text = "I"
        intervals [4]:
            xmin = 1.866877481681608
            xmax = 2.6920524786114862
            text = "DID"
        intervals [5]:
            xmin = 2.6920524786114862
            xmax = 3
            text = ""

## C.TextGrid
File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0
xmax = 3
tiers? <exists>
size = 1
item []:
    item [1]:
        class = "IntervalTier"
        name = "word"
        xmin = 0
        xmax = 3
        intervals: size = 5
        intervals [1]:
            xmin = 0
            xmax = 0.06004543779933693
            text = ""
        intervals [2]:
            xmin = 0.06004543779933693
            xmax = 1.6252957550452332
            text = "THAN"
        intervals [3]:
            xmin = 1.6252957550452332
            xmax = 1.766877481681608
            text = "I"
        intervals [4]:
            xmin = 1.766877481681608
            xmax = 2.8920524786114862
            text = "DID"
        intervals [5]:
            xmin = 2.8920524786114862
            xmax = 3
            text = ""
	File type = "ooTextFile"
	Object class = "TextGrid"

	xmin = 0
	xmax = 3
	tiers? <exists>
	size = 1
	item []:
	item [1]:
	class = "IntervalTier"
	name = "word"
	xmin = 0
	xmax = 3
	intervals: size = 5
	intervals [1]:
	xmin = 0
	xmax = 0.46004543779933693
	text = ""
	intervals [2]:
	xmin = 0.46004543779933693
	xmax = 1.1252957550452332
	text = "THAN"
	intervals [3]:
	xmin = 1.1252957550452332
	xmax = 1.966877481681608
	text = "I"
	intervals [4]:
	xmin = 1.966877481681608
	xmax = 2.5920524786114862
	text = "DID"
	intervals [5]:
	xmin = 2.5920524786114862
	xmax = 3
	text = ""
	#!/usr/bin/env python
	#
	# Copyright (c) 2011 Kyle Gorman
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in
	# all copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	# THE SOFTWARE.
	#
	# alignment.py: Code for scoring hypothesized alignments against a reference
	# alignment, using techniques from text segmentation research
	#
	# Kyle Gorman <kgorman@ling.upenn.edu>
	#
	# This code was developed to evaluate Prosodylab-Aligner, available at:
	#
	# http://prosodylab.org/tools/aligner/

	from textgrid import TextGridFromFile # https://github.com/kylebgorman/textgrid.py


	## SCORING METHODS

	def Pmu(ref, hyp, dt=0.001):
	    """
	    P_mu segmentation score, based loosely on:

	    D. Beeferman, A. Berger, J. Lafferty. 1997. Text segmentation using
	    exponential models. Proceedings of EMNLP.

	    The tiers are sampled every `dt` time units, starting at the start of
	    the first reference interval and stopping `dt` before the end of the
	    last one. The score is the fraction of sampled points at which the
	    reference interval and the hypothesized interval containing that point
	    carry the same mark, so 1.0 means the two tiers agree at every sampled
	    point.

	    A sampled point counts as a disagreement when the reference lookup
	    gives a falsy result, or when the hypothesized tier has no interval
	    there.

	    ref -- reference tier (indexable; provides intervalContaining())
	    hyp -- hypothesized tier (provides intervalContaining())
	    dt  -- sampling step; must be positive

	    Raises ValueError if `dt` is not positive, and ZeroDivisionError if the
	    reference tier is shorter than `dt` (no point is sampled at all).

	    >>> ref = TextGridFromFile('A.TextGrid')[0]
	    >>> print(round(Pmu(ref, TextGridFromFile('B.TextGrid')[0]), 3))
	    0.713
	    >>> print(round(Pmu(ref, TextGridFromFile('C.TextGrid')[0]), 3))
	    0.513
	    """
	    if not dt > 0:
	        # a zero or negative step would never reach `stop`
	        raise ValueError('dt must be positive, got %r' % (dt,))
	    c = 0  # sampled points where the marks agree
	    d = 0  # sampled points where they do not
	    t = ref[0].minTime
	    stop = ref[-1].maxTime - dt
	    while t <= stop:
	        ri = ref.intervalContaining(t)
	        # only consult the hypothesis when the reference has something here
	        hi = hyp.intervalContaining(t) if ri else None
	        # a hypothesis tier that does not cover `t` is a miss, not a crash
	        if hi is not None and ri.mark == hi.mark:
	            c += 1
	        else:
	            d += 1
	        t += dt
	    return c / float(c + d)


	def Pk(ref, hyp, dt=0.001):
	    """
	    P_k segmentation score, based on:

	    D. Beeferman, A. Berger, J. Lafferty. 1999. Statistical models of text
	    segmentation. Machine Learning 34(1-3): 177-210.

	    It is, to a first approximation, the probability that some span in the
	    hypothesized segmentation crosses a segment boundary iff it crosses a
	    segment boundary in the reference segmentation.

	    A window of width k (half the mean duration of the reference intervals)
	    is slid over the reference time range in steps of `dt`. For each window
	    position the two ends are looked up in each tier; the position counts
	    as an agreement when "both ends are in the same interval" has the same
	    truth value for `ref` and for `hyp`. The score is the fraction of
	    agreeing window positions.

	    ref -- reference tier (indexable, iterable, sized; provides
	           intervalContaining())
	    hyp -- hypothesized tier (provides intervalContaining())
	    dt  -- step by which the window is advanced; must be positive

	    Raises ValueError if `dt` is not positive.

	    >>> ref = TextGridFromFile('A.TextGrid')[0]
	    >>> print(round(Pk(ref, TextGridFromFile('B.TextGrid')[0]), 3))
	    0.63
	    >>> print(round(Pk(ref, TextGridFromFile('C.TextGrid')[0]), 3))
	    0.404
	    """
	    if not dt > 0:
	        # a zero or negative step would never reach `stop`
	        raise ValueError('dt must be positive, got %r' % (dt,))
	    c = 0  # window positions where ref and hyp agree
	    d = 0  # window positions where they disagree
	    # window width: half the mean reference interval duration; float()
	    # keeps this a true division even for integer times under Python 2
	    k = sum(x.maxTime - x.minTime for x in ref) / float(len(ref)) / 2.
	    t = ref[0].minTime
	    stop = ref[-1].maxTime - k
	    while t <= stop:
	        # True when both ends of the window fall in the same interval
	        rs = ref.intervalContaining(t) == ref.intervalContaining(t + k)
	        hs = hyp.intervalContaining(t) == hyp.intervalContaining(t + k)
	        if rs == hs:
	            c += 1
	        else:
	            d += 1
	        t += dt # increment
	    return c / float(c + d)


	def Ddiff(ref, hyp, dt=0.01):
	    """
	    1 - WindowDiff segmentation score, based on:

	    L. Pevzner, M. A. Hearst. 2002. A critique and improvement of an
	    evaluation metric for text segmentation. Computational Linguistics
	    28(1): 19-36.

	    A window of width k (half the mean duration of the reference intervals)
	    is slid over the reference time range in steps of `dt`. For each window
	    position, the difference between the interval indices at the two ends
	    of the window is computed for each tier; the position is penalized when
	    that difference is not the same for `ref` and `hyp`. The score is the
	    fraction of window positions that are not penalized.

	    ref -- reference tier (indexable, iterable; provides indexContaining())
	    hyp -- hypothesized tier (provides indexContaining())
	    dt  -- step by which the window is advanced; must be positive

	    Raises ValueError if `dt` is not positive.

	    >>> ref = TextGridFromFile('A.TextGrid')[0]
	    >>> print(round(Ddiff(ref, TextGridFromFile('B.TextGrid')[0]), 3))
	    0.631
	    >>> print(round(Ddiff(ref, TextGridFromFile('C.TextGrid')[0]), 3))
	    0.402
	    """
	    if not dt > 0:
	        # a zero or negative step would never reach `stop`
	        raise ValueError('dt must be positive, got %r' % (dt,))
	    c = 0  # window positions examined
	    d = 0  # window positions where the index differences disagree
	    # window width: half the mean reference interval duration, computed
	    # inline because no mean() helper is in scope in this module
	    durations = [x.maxTime - x.minTime for x in ref]
	    k = sum(durations) / float(len(durations)) / 2.
	    t = ref[0].minTime
	    stop = ref[-1].maxTime - k
	    while t <= stop:
	        # difference of interval indices between the two window ends
	        rb = ref.indexContaining(t + k) - ref.indexContaining(t)
	        hb = hyp.indexContaining(t + k) - hyp.indexContaining(t)
	        d += abs(rb - hb) > 0  # bool adds as 0 or 1
	        c += 1
	        t += dt
	    return float(c - d) / c


	if __name__ == '__main__':
	    # Executed as a script: run the doctests embedded in this module.
	    from doctest import testmod
	    testmod()